author     Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-19 00:47:55 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2024-04-19 00:47:55 +0000
commit     26a029d407be480d791972afb5975cf62c9360a6 (patch)
tree       f435a8308119effd964b339f76abb83a57c29483  /third_party/aom/aom_dsp
parent     Initial commit. (diff)
download   firefox-26a029d407be480d791972afb5975cf62c9360a6.tar.xz
           firefox-26a029d407be480d791972afb5975cf62c9360a6.zip

Adding upstream version 124.0.1. (upstream/124.0.1)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/aom/aom_dsp')
-rw-r--r--third_party/aom/aom_dsp/aom_convolve.c261
-rw-r--r--third_party/aom/aom_dsp/aom_dsp.cmake510
-rw-r--r--third_party/aom/aom_dsp/aom_dsp_common.h99
-rw-r--r--third_party/aom/aom_dsp/aom_dsp_rtcd.c18
-rwxr-xr-xthird_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl1798
-rw-r--r--third_party/aom/aom_dsp/aom_filter.h56
-rw-r--r--third_party/aom/aom_dsp/aom_simd.h36
-rw-r--r--third_party/aom/aom_dsp/aom_simd_inline.h24
-rw-r--r--third_party/aom/aom_dsp/arm/aom_convolve8_neon.c349
-rw-r--r--third_party/aom/aom_dsp/arm/aom_convolve8_neon_dotprod.c460
-rw-r--r--third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c408
-rw-r--r--third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c154
-rw-r--r--third_party/aom/aom_dsp/arm/avg_neon.c309
-rw-r--r--third_party/aom/aom_dsp/arm/avg_pred_neon.c221
-rw-r--r--third_party/aom/aom_dsp/arm/avg_sve.c62
-rw-r--r--third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c492
-rw-r--r--third_party/aom/aom_dsp/arm/blend_neon.h125
-rw-r--r--third_party/aom/aom_dsp/arm/blk_sse_sum_neon.c124
-rw-r--r--third_party/aom/aom_dsp/arm/blk_sse_sum_sve.c106
-rw-r--r--third_party/aom/aom_dsp/arm/dist_wtd_avg_neon.h65
-rw-r--r--third_party/aom/aom_dsp/arm/dot_sve.h42
-rw-r--r--third_party/aom/aom_dsp/arm/fwd_txfm_neon.c304
-rw-r--r--third_party/aom/aom_dsp/arm/hadamard_neon.c325
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_avg_neon.c125
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_avg_pred_neon.c190
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_blend_a64_hmask_neon.c97
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_blend_a64_mask_neon.c473
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_blend_a64_vmask_neon.c105
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_convolve8_neon.c363
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_hadamard_neon.c213
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_intrapred_neon.c2730
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_loopfilter_neon.c1265
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_masked_sad_neon.c354
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_obmc_sad_neon.c211
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_obmc_variance_neon.c369
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_quantize_neon.c431
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_sad_neon.c509
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_sadxd_neon.c617
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_sse_neon.c284
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_sse_sve.c215
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_subpel_variance_neon.c1497
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_variance_neon.c502
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_variance_neon_dotprod.c92
-rw-r--r--third_party/aom/aom_dsp/arm/highbd_variance_sve.c430
-rw-r--r--third_party/aom/aom_dsp/arm/intrapred_neon.c3110
-rw-r--r--third_party/aom/aom_dsp/arm/loopfilter_neon.c1045
-rw-r--r--third_party/aom/aom_dsp/arm/masked_sad4d_neon.c562
-rw-r--r--third_party/aom/aom_dsp/arm/masked_sad_neon.c244
-rw-r--r--third_party/aom/aom_dsp/arm/mem_neon.h1253
-rw-r--r--third_party/aom/aom_dsp/arm/obmc_sad_neon.c250
-rw-r--r--third_party/aom/aom_dsp/arm/obmc_variance_neon.c290
-rw-r--r--third_party/aom/aom_dsp/arm/reinterpret_neon.h33
-rw-r--r--third_party/aom/aom_dsp/arm/sad_neon.c873
-rw-r--r--third_party/aom/aom_dsp/arm/sad_neon_dotprod.c530
-rw-r--r--third_party/aom/aom_dsp/arm/sadxd_neon.c514
-rw-r--r--third_party/aom/aom_dsp/arm/sadxd_neon_dotprod.c289
-rw-r--r--third_party/aom/aom_dsp/arm/sse_neon.c210
-rw-r--r--third_party/aom/aom_dsp/arm/sse_neon_dotprod.c223
-rw-r--r--third_party/aom/aom_dsp/arm/subpel_variance_neon.c1103
-rw-r--r--third_party/aom/aom_dsp/arm/subtract_neon.c166
-rw-r--r--third_party/aom/aom_dsp/arm/sum_neon.h311
-rw-r--r--third_party/aom/aom_dsp/arm/sum_squares_neon.c574
-rw-r--r--third_party/aom/aom_dsp/arm/sum_squares_neon_dotprod.c154
-rw-r--r--third_party/aom/aom_dsp/arm/sum_squares_sve.c402
-rw-r--r--third_party/aom/aom_dsp/arm/transpose_neon.h1263
-rw-r--r--third_party/aom/aom_dsp/arm/variance_neon.c470
-rw-r--r--third_party/aom/aom_dsp/arm/variance_neon_dotprod.c314
-rw-r--r--third_party/aom/aom_dsp/avg.c573
-rw-r--r--third_party/aom/aom_dsp/binary_codes_reader.c55
-rw-r--r--third_party/aom/aom_dsp/binary_codes_reader.h44
-rw-r--r--third_party/aom/aom_dsp/binary_codes_writer.c137
-rw-r--r--third_party/aom/aom_dsp/binary_codes_writer.h65
-rw-r--r--third_party/aom/aom_dsp/bitreader.c41
-rw-r--r--third_party/aom/aom_dsp/bitreader.h232
-rw-r--r--third_party/aom/aom_dsp/bitreader_buffer.c116
-rw-r--r--third_party/aom/aom_dsp/bitreader_buffer.h53
-rw-r--r--third_party/aom/aom_dsp/bitwriter.c40
-rw-r--r--third_party/aom/aom_dsp/bitwriter.h110
-rw-r--r--third_party/aom/aom_dsp/bitwriter_buffer.c141
-rw-r--r--third_party/aom/aom_dsp/bitwriter_buffer.h55
-rw-r--r--third_party/aom/aom_dsp/blend.h45
-rw-r--r--third_party/aom/aom_dsp/blend_a64_hmask.c71
-rw-r--r--third_party/aom/aom_dsp/blend_a64_mask.c349
-rw-r--r--third_party/aom/aom_dsp/blend_a64_vmask.c73
-rw-r--r--third_party/aom/aom_dsp/blk_sse_sum.c26
-rw-r--r--third_party/aom/aom_dsp/butteraugli.c109
-rw-r--r--third_party/aom/aom_dsp/butteraugli.h23
-rw-r--r--third_party/aom/aom_dsp/entcode.c49
-rw-r--r--third_party/aom/aom_dsp/entcode.h41
-rw-r--r--third_party/aom/aom_dsp/entdec.c247
-rw-r--r--third_party/aom/aom_dsp/entdec.h81
-rw-r--r--third_party/aom/aom_dsp/entenc.c374
-rw-r--r--third_party/aom/aom_dsp/entenc.h108
-rw-r--r--third_party/aom/aom_dsp/fastssim.c488
-rw-r--r--third_party/aom/aom_dsp/fft.c220
-rw-r--r--third_party/aom/aom_dsp/fft_common.h1056
-rw-r--r--third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.c368
-rw-r--r--third_party/aom/aom_dsp/flow_estimation/corner_detect.c167
-rw-r--r--third_party/aom/aom_dsp/flow_estimation/corner_detect.h80
-rw-r--r--third_party/aom/aom_dsp/flow_estimation/corner_match.c259
-rw-r--r--third_party/aom/aom_dsp/flow_estimation/corner_match.h41
-rw-r--r--third_party/aom/aom_dsp/flow_estimation/disflow.c823
-rw-r--r--third_party/aom/aom_dsp/flow_estimation/disflow.h106
-rw-r--r--third_party/aom/aom_dsp/flow_estimation/flow_estimation.c60
-rw-r--r--third_party/aom/aom_dsp/flow_estimation/flow_estimation.h95
-rw-r--r--third_party/aom/aom_dsp/flow_estimation/ransac.c484
-rw-r--r--third_party/aom/aom_dsp/flow_estimation/ransac.h35
-rw-r--r--third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c80
-rw-r--r--third_party/aom/aom_dsp/flow_estimation/x86/corner_match_sse4.c104
-rw-r--r--third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c558
-rw-r--r--third_party/aom/aom_dsp/fwd_txfm.c245
-rw-r--r--third_party/aom/aom_dsp/grain_params.h158
-rw-r--r--third_party/aom/aom_dsp/grain_table.c358
-rw-r--r--third_party/aom/aom_dsp/grain_table.h102
-rw-r--r--third_party/aom/aom_dsp/intrapred.c793
-rw-r--r--third_party/aom/aom_dsp/intrapred_common.h59
-rw-r--r--third_party/aom/aom_dsp/loopfilter.c997
-rw-r--r--third_party/aom/aom_dsp/mathutils.h145
-rw-r--r--third_party/aom/aom_dsp/noise_model.c1692
-rw-r--r--third_party/aom/aom_dsp/noise_model.h328
-rw-r--r--third_party/aom/aom_dsp/noise_util.c225
-rw-r--r--third_party/aom/aom_dsp/noise_util.h68
-rw-r--r--third_party/aom/aom_dsp/odintrin.c541
-rw-r--r--third_party/aom/aom_dsp/odintrin.h81
-rw-r--r--third_party/aom/aom_dsp/prob.h144
-rw-r--r--third_party/aom/aom_dsp/psnr.c454
-rw-r--r--third_party/aom/aom_dsp/psnr.h96
-rw-r--r--third_party/aom/aom_dsp/psnrhvs.c282
-rw-r--r--third_party/aom/aom_dsp/pyramid.c414
-rw-r--r--third_party/aom/aom_dsp/pyramid.h127
-rw-r--r--third_party/aom/aom_dsp/quantize.c472
-rw-r--r--third_party/aom/aom_dsp/quantize.h127
-rw-r--r--third_party/aom/aom_dsp/recenter.h61
-rw-r--r--third_party/aom/aom_dsp/rect.h35
-rw-r--r--third_party/aom/aom_dsp/sad.c389
-rw-r--r--third_party/aom/aom_dsp/sad_av1.c266
-rw-r--r--third_party/aom/aom_dsp/simd/v128_intrinsics.h346
-rw-r--r--third_party/aom/aom_dsp/simd/v128_intrinsics_c.h898
-rw-r--r--third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h659
-rw-r--r--third_party/aom/aom_dsp/simd/v256_intrinsics.h377
-rw-r--r--third_party/aom/aom_dsp/simd/v256_intrinsics_c.h963
-rw-r--r--third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h806
-rw-r--r--third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h754
-rw-r--r--third_party/aom/aom_dsp/simd/v64_intrinsics.h234
-rw-r--r--third_party/aom/aom_dsp/simd/v64_intrinsics_c.h966
-rw-r--r--third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h489
-rw-r--r--third_party/aom/aom_dsp/sse.c59
-rw-r--r--third_party/aom/aom_dsp/ssim.c481
-rw-r--r--third_party/aom/aom_dsp/ssim.h104
-rw-r--r--third_party/aom/aom_dsp/subtract.c54
-rw-r--r--third_party/aom/aom_dsp/sum_squares.c90
-rw-r--r--third_party/aom/aom_dsp/txfm_common.h155
-rw-r--r--third_party/aom/aom_dsp/variance.c1234
-rw-r--r--third_party/aom/aom_dsp/variance.h127
-rw-r--r--third_party/aom/aom_dsp/vmaf.c192
-rw-r--r--third_party/aom/aom_dsp/vmaf.h41
-rw-r--r--third_party/aom/aom_dsp/x86/adaptive_quantize_avx2.c244
-rw-r--r--third_party/aom/aom_dsp/x86/adaptive_quantize_sse2.c633
-rw-r--r--third_party/aom/aom_dsp/x86/aom_asm_stubs.c95
-rw-r--r--third_party/aom/aom_dsp/x86/aom_convolve_copy_avx2.c256
-rw-r--r--third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.c308
-rw-r--r--third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm613
-rw-r--r--third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm367
-rw-r--r--third_party/aom/aom_dsp/x86/aom_quantize_avx.c282
-rw-r--r--third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c1441
-rw-r--r--third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c569
-rw-r--r--third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c847
-rw-r--r--third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm615
-rw-r--r--third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm870
-rw-r--r--third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm295
-rw-r--r--third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm267
-rw-r--r--third_party/aom/aom_dsp/x86/avg_intrin_avx2.c897
-rw-r--r--third_party/aom/aom_dsp/x86/avg_intrin_sse2.c700
-rw-r--r--third_party/aom/aom_dsp/x86/avg_intrin_sse4.c59
-rw-r--r--third_party/aom/aom_dsp/x86/bitdepth_conversion_avx2.h32
-rw-r--r--third_party/aom/aom_dsp/x86/bitdepth_conversion_sse2.h49
-rw-r--r--third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c36
-rw-r--r--third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c1374
-rw-r--r--third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c1560
-rw-r--r--third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c285
-rw-r--r--third_party/aom/aom_dsp/x86/blend_mask_sse4.h237
-rw-r--r--third_party/aom/aom_dsp/x86/blend_sse4.h191
-rw-r--r--third_party/aom/aom_dsp/x86/blk_sse_sum_avx2.c185
-rw-r--r--third_party/aom/aom_dsp/x86/blk_sse_sum_sse2.c138
-rw-r--r--third_party/aom/aom_dsp/x86/common_avx2.h147
-rw-r--r--third_party/aom/aom_dsp/x86/convolve.h204
-rw-r--r--third_party/aom/aom_dsp/x86/convolve_avx2.h922
-rw-r--r--third_party/aom/aom_dsp/x86/convolve_common_intrin.h102
-rw-r--r--third_party/aom/aom_dsp/x86/convolve_sse2.h122
-rw-r--r--third_party/aom/aom_dsp/x86/convolve_sse4_1.h53
-rw-r--r--third_party/aom/aom_dsp/x86/convolve_ssse3.h50
-rw-r--r--third_party/aom/aom_dsp/x86/fft_avx2.c74
-rw-r--r--third_party/aom/aom_dsp/x86/fft_sse2.c173
-rw-r--r--third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h529
-rw-r--r--third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c39
-rw-r--r--third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h160
-rw-r--r--third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm379
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_adaptive_quantize_avx2.c456
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_adaptive_quantize_sse2.c732
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c1248
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_convolve_sse2.c351
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c439
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_intrapred_asm_sse2.asm259
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c984
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c66
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c1698
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c294
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c208
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm344
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_sad_avx2.c720
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm524
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm1024
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c266
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_variance_avx2.c904
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm318
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_variance_sse2.c735
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_variance_sse4.c216
-rw-r--r--third_party/aom/aom_dsp/x86/intrapred_asm_sse2.asm608
-rw-r--r--third_party/aom/aom_dsp/x86/intrapred_avx2.c4707
-rw-r--r--third_party/aom/aom_dsp/x86/intrapred_sse2.c1411
-rw-r--r--third_party/aom/aom_dsp/x86/intrapred_sse4.c1307
-rw-r--r--third_party/aom/aom_dsp/x86/intrapred_ssse3.c2997
-rw-r--r--third_party/aom/aom_dsp/x86/intrapred_utils.h205
-rw-r--r--third_party/aom/aom_dsp/x86/intrapred_x86.h38
-rw-r--r--third_party/aom/aom_dsp/x86/inv_wht_sse2.asm107
-rw-r--r--third_party/aom/aom_dsp/x86/jnt_sad_sse2.c238
-rw-r--r--third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c161
-rw-r--r--third_party/aom/aom_dsp/x86/loopfilter_avx2.c1016
-rw-r--r--third_party/aom/aom_dsp/x86/loopfilter_sse2.c2973
-rw-r--r--third_party/aom/aom_dsp/x86/lpf_common_sse2.h721
-rw-r--r--third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c266
-rw-r--r--third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c389
-rw-r--r--third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c400
-rw-r--r--third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h33
-rw-r--r--third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c1067
-rw-r--r--third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h92
-rw-r--r--third_party/aom/aom_dsp/x86/mem_sse2.h167
-rw-r--r--third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h58
-rw-r--r--third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h54
-rw-r--r--third_party/aom/aom_dsp/x86/obmc_sad_avx2.c271
-rw-r--r--third_party/aom/aom_dsp/x86/obmc_sad_sse4.c269
-rw-r--r--third_party/aom/aom_dsp/x86/obmc_variance_avx2.c191
-rw-r--r--third_party/aom/aom_dsp/x86/obmc_variance_sse4.c382
-rw-r--r--third_party/aom/aom_dsp/x86/quantize_avx2.c274
-rw-r--r--third_party/aom/aom_dsp/x86/quantize_sse2.c125
-rw-r--r--third_party/aom/aom_dsp/x86/quantize_ssse3.c192
-rw-r--r--third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm302
-rw-r--r--third_party/aom/aom_dsp/x86/quantize_x86.h202
-rw-r--r--third_party/aom/aom_dsp/x86/sad4d_avx2.c326
-rw-r--r--third_party/aom/aom_dsp/x86/sad4d_sse2.asm437
-rw-r--r--third_party/aom/aom_dsp/x86/sad_avx2.c219
-rw-r--r--third_party/aom/aom_dsp/x86/sad_impl_avx2.c181
-rw-r--r--third_party/aom/aom_dsp/x86/sad_sse2.asm432
-rw-r--r--third_party/aom/aom_dsp/x86/sse_avx2.c389
-rw-r--r--third_party/aom/aom_dsp/x86/sse_sse4.c355
-rw-r--r--third_party/aom/aom_dsp/x86/ssim_sse2_x86_64.asm222
-rw-r--r--third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm1470
-rw-r--r--third_party/aom/aom_dsp/x86/subtract_avx2.c109
-rw-r--r--third_party/aom/aom_dsp/x86/subtract_sse2.asm147
-rw-r--r--third_party/aom/aom_dsp/x86/sum_squares_avx2.c326
-rw-r--r--third_party/aom/aom_dsp/x86/sum_squares_sse2.c478
-rw-r--r--third_party/aom/aom_dsp/x86/sum_squares_sse2.h28
-rw-r--r--third_party/aom/aom_dsp/x86/synonyms.h134
-rw-r--r--third_party/aom/aom_dsp/x86/synonyms_avx2.h79
-rw-r--r--third_party/aom/aom_dsp/x86/transpose_sse2.h424
-rw-r--r--third_party/aom/aom_dsp/x86/txfm_common_avx2.h357
-rw-r--r--third_party/aom/aom_dsp/x86/txfm_common_sse2.h33
-rw-r--r--third_party/aom/aom_dsp/x86/variance_avx2.c961
-rw-r--r--third_party/aom/aom_dsp/x86/variance_impl_avx2.c924
-rw-r--r--third_party/aom/aom_dsp/x86/variance_impl_ssse3.c129
-rw-r--r--third_party/aom/aom_dsp/x86/variance_sse2.c802
271 files changed, 115943 insertions, 0 deletions
diff --git a/third_party/aom/aom_dsp/aom_convolve.c b/third_party/aom/aom_dsp/aom_convolve.c
new file mode 100644
index 0000000000..254f6401c7
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_convolve.c
@@ -0,0 +1,261 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_ports/mem.h"
+
+static INLINE int horz_scalar_product(const uint8_t *a, const int16_t *b) {
+ int sum = 0;
+ for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
+ return sum;
+}
+
+static INLINE int vert_scalar_product(const uint8_t *a, ptrdiff_t a_stride,
+ const int16_t *b) {
+ int sum = 0;
+ for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
+ return sum;
+}
+
+static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h) {
+ src -= SUBPEL_TAPS / 2 - 1;
+ for (int y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (int x = 0; x < w; ++x) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ const int sum = horz_scalar_product(src_x, x_filter);
+ dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h) {
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+ for (int x = 0; x < w; ++x) {
+ int y_q4 = y0_q4;
+ for (int y = 0; y < h; ++y) {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ const int sum = vert_scalar_product(src_y, src_stride, y_filter);
+ dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+ y_q4 += y_step_q4;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+static const InterpKernel *get_filter_base(const int16_t *filter) {
+ // NOTE: This assumes that the filter table is 256-byte aligned.
+ return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
+}
+
+static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
+ return (int)((const InterpKernel *)(intptr_t)f - base);
+}
+
+void aom_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ const InterpKernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+ (void)filter_y;
+ (void)y_step_q4;
+
+ convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4,
+ w, h);
+}
+
+void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ const InterpKernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+ (void)filter_x;
+ (void)x_step_q4;
+
+ convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4,
+ w, h);
+}
+
+void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+ // 2d filtering proceeds in 2 steps:
+ // (1) Interpolate horizontally into an intermediate buffer, temp.
+ // (2) Interpolate temp vertically to derive the sub-pixel result.
+ // Deriving the maximum number of rows in the temp buffer (135):
+ // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+ // --Largest block size is 64x64 pixels.
+ // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+ // original frame (in 1/16th pixel units).
+ // --Must round-up because block may be located at sub-pixel position.
+ // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+ // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+ // When calling in frame scaling function, the smallest scaling factor is x1/4
+ // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still
+ // big enough.
+ uint8_t temp[64 * 135];
+ const int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+ assert(w <= 64);
+ assert(h <= 64);
+ assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+ assert(x_step_q4 <= 64);
+
+ convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
+ filter, x0_q4, x_step_q4, w, intermediate_height);
+ convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter,
+ y0_q4, y_step_q4, w, h);
+}
+
+void aom_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w,
+ int h) {
+ aom_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+ y0_q4, y_step_q4, w, h);
+}
+
+void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, int w, int h) {
+ for (int r = h; r > 0; --r) {
+ memmove(dst, src, w);
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE int highbd_vert_scalar_product(const uint16_t *a,
+ ptrdiff_t a_stride,
+ const int16_t *b) {
+ int sum = 0;
+ for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k * a_stride] * b[k];
+ return sum;
+}
+
+static INLINE int highbd_horz_scalar_product(const uint16_t *a,
+ const int16_t *b) {
+ int sum = 0;
+ for (int k = 0; k < SUBPEL_TAPS; ++k) sum += a[k] * b[k];
+ return sum;
+}
+
+static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst8, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h, int bd) {
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ src -= SUBPEL_TAPS / 2 - 1;
+ for (int y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (int x = 0; x < w; ++x) {
+ const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ const int sum = highbd_horz_scalar_product(src_x, x_filter);
+ dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void highbd_convolve_vert(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst8, ptrdiff_t dst_stride,
+ const InterpKernel *y_filters, int y0_q4,
+ int y_step_q4, int w, int h, int bd) {
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ for (int x = 0; x < w; ++x) {
+ int y_q4 = y0_q4;
+ for (int y = 0; y < h; ++y) {
+ const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ const int sum = highbd_vert_scalar_product(src_y, src_stride, y_filter);
+ dst[y * dst_stride] =
+ clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+ y_q4 += y_step_q4;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+void aom_highbd_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h, int bd) {
+ const InterpKernel *const filters_x = get_filter_base(filter_x);
+ const int x0_q4 = get_filter_offset(filter_x, filters_x);
+ (void)filter_y;
+ (void)y_step_q4;
+
+ highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
+ x_step_q4, w, h, bd);
+}
+
+void aom_highbd_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h, int bd) {
+ const InterpKernel *const filters_y = get_filter_base(filter_y);
+ const int y0_q4 = get_filter_offset(filter_y, filters_y);
+ (void)filter_x;
+ (void)x_step_q4;
+
+ highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
+ y_step_q4, w, h, bd);
+}
+
+void aom_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride, int w,
+ int h) {
+ for (int y = 0; y < h; ++y) {
+ memmove(dst, src, w * sizeof(src[0]));
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
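The comment in aom_convolve8_c above derives the 135-row bound on the temp buffer. Below is a minimal, self-contained C sketch of that arithmetic; SUBPEL_BITS and SUBPEL_TAPS are assumed to match aom_dsp/aom_filter.h (4 and 8), and intermediate_height() is an illustrative helper, not part of libaom.

/*
 * Standalone sketch of the intermediate-height bound discussed in the
 * aom_convolve8_c comment above. SUBPEL_BITS and SUBPEL_TAPS mirror the
 * values in aom_dsp/aom_filter.h; intermediate_height() is illustrative,
 * not a libaom function.
 */
#include <assert.h>
#include <stdio.h>

#define SUBPEL_BITS 4
#define SUBPEL_TAPS 8

/* Rows of horizontally filtered data needed before the vertical pass. */
static int intermediate_height(int h, int y0_q4, int y_step_q4) {
  return (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
}

int main(void) {
  /* Worst case from the comment: 64 output rows, smallest normative scaling
   * factor of x1/2 (y_step_q4 = 32), worst-case sub-pixel phase y0_q4 = 15. */
  const int rows = intermediate_height(64, 15, 32);
  printf("worst-case intermediate rows: %d\n", rows); /* prints 134 */
  /* The comment's 135 rounds the division up (127 + 8); the truncating shift
   * here gives 134, so temp[64 * 135] always has room. */
  assert(rows <= 135);
  return 0;
}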
diff --git a/third_party/aom/aom_dsp/aom_dsp.cmake b/third_party/aom/aom_dsp/aom_dsp.cmake
new file mode 100644
index 0000000000..653f690741
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_dsp.cmake
@@ -0,0 +1,510 @@
+#
+# Copyright (c) 2017, Alliance for Open Media. All rights reserved
+#
+# This source code is subject to the terms of the BSD 2 Clause License and the
+# Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License was
+# not distributed with this source code in the LICENSE file, you can obtain it
+# at www.aomedia.org/license/software. If the Alliance for Open Media Patent
+# License 1.0 was not distributed with this source code in the PATENTS file, you
+# can obtain it at www.aomedia.org/license/patent.
+#
+if(AOM_AOM_DSP_AOM_DSP_CMAKE_)
+ return()
+endif() # AOM_AOM_DSP_AOM_DSP_CMAKE_
+set(AOM_AOM_DSP_AOM_DSP_CMAKE_ 1)
+
+list(APPEND AOM_DSP_COMMON_SOURCES
+ "${AOM_ROOT}/aom_dsp/aom_convolve.c"
+ "${AOM_ROOT}/aom_dsp/aom_dsp_common.h"
+ "${AOM_ROOT}/aom_dsp/aom_filter.h"
+ "${AOM_ROOT}/aom_dsp/aom_simd.h"
+ "${AOM_ROOT}/aom_dsp/aom_simd_inline.h"
+ "${AOM_ROOT}/aom_dsp/bitreader_buffer.c"
+ "${AOM_ROOT}/aom_dsp/bitreader_buffer.h"
+ "${AOM_ROOT}/aom_dsp/bitwriter_buffer.c"
+ "${AOM_ROOT}/aom_dsp/bitwriter_buffer.h"
+ "${AOM_ROOT}/aom_dsp/blend.h"
+ "${AOM_ROOT}/aom_dsp/blend_a64_hmask.c"
+ "${AOM_ROOT}/aom_dsp/blend_a64_mask.c"
+ "${AOM_ROOT}/aom_dsp/blend_a64_vmask.c"
+ "${AOM_ROOT}/aom_dsp/entcode.c"
+ "${AOM_ROOT}/aom_dsp/entcode.h"
+ "${AOM_ROOT}/aom_dsp/fft.c"
+ "${AOM_ROOT}/aom_dsp/fft_common.h"
+ "${AOM_ROOT}/aom_dsp/grain_params.h"
+ "${AOM_ROOT}/aom_dsp/intrapred.c"
+ "${AOM_ROOT}/aom_dsp/intrapred_common.h"
+ "${AOM_ROOT}/aom_dsp/loopfilter.c"
+ "${AOM_ROOT}/aom_dsp/odintrin.c"
+ "${AOM_ROOT}/aom_dsp/odintrin.h"
+ "${AOM_ROOT}/aom_dsp/prob.h"
+ "${AOM_ROOT}/aom_dsp/recenter.h"
+ "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics.h"
+ "${AOM_ROOT}/aom_dsp/simd/v128_intrinsics_c.h"
+ "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics.h"
+ "${AOM_ROOT}/aom_dsp/simd/v256_intrinsics_c.h"
+ "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics.h"
+ "${AOM_ROOT}/aom_dsp/simd/v64_intrinsics_c.h"
+ "${AOM_ROOT}/aom_dsp/subtract.c"
+ "${AOM_ROOT}/aom_dsp/txfm_common.h"
+ "${AOM_ROOT}/aom_dsp/x86/convolve_common_intrin.h")
+
+list(APPEND AOM_DSP_COMMON_ASM_SSE2
+ "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_asm_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/intrapred_asm_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/inv_wht_sse2.asm")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_SSE2
+ "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c"
+ "${AOM_ROOT}/aom_dsp/x86/convolve.h"
+ "${AOM_ROOT}/aom_dsp/x86/convolve_sse2.h"
+ "${AOM_ROOT}/aom_dsp/x86/fft_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/intrapred_x86.h"
+ "${AOM_ROOT}/aom_dsp/x86/loopfilter_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/lpf_common_sse2.h"
+ "${AOM_ROOT}/aom_dsp/x86/mem_sse2.h"
+ "${AOM_ROOT}/aom_dsp/x86/transpose_sse2.h"
+ "${AOM_ROOT}/aom_dsp/x86/txfm_common_sse2.h"
+ "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.h"
+ "${AOM_ROOT}/aom_dsp/x86/bitdepth_conversion_sse2.h")
+
+list(APPEND AOM_DSP_COMMON_ASM_SSSE3
+ "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_ssse3.asm"
+ "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_SSSE3
+ "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c"
+ "${AOM_ROOT}/aom_dsp/x86/convolve_ssse3.h"
+ "${AOM_ROOT}/aom_dsp/x86/intrapred_ssse3.c")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_SSE4_1
+ "${AOM_ROOT}/aom_dsp/x86/blend_mask_sse4.h"
+ "${AOM_ROOT}/aom_dsp/x86/blend_a64_hmask_sse4.c"
+ "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_sse4.c"
+ "${AOM_ROOT}/aom_dsp/x86/blend_a64_vmask_sse4.c"
+ "${AOM_ROOT}/aom_dsp/x86/intrapred_sse4.c"
+ "${AOM_ROOT}/aom_dsp/x86/intrapred_utils.h")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_AVX2
+ "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/common_avx2.h"
+ "${AOM_ROOT}/aom_dsp/x86/txfm_common_avx2.h"
+ "${AOM_ROOT}/aom_dsp/x86/convolve_avx2.h"
+ "${AOM_ROOT}/aom_dsp/x86/fft_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/intrapred_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/loopfilter_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/blend_a64_mask_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/bitdepth_conversion_avx2.h"
+ "${AOM_ROOT}/third_party/SVT-AV1/convolve_2d_avx2.h"
+ "${AOM_ROOT}/third_party/SVT-AV1/convolve_avx2.h"
+ "${AOM_ROOT}/third_party/SVT-AV1/EbMemory_AVX2.h"
+ "${AOM_ROOT}/third_party/SVT-AV1/EbMemory_SSE4_1.h"
+ "${AOM_ROOT}/third_party/SVT-AV1/synonyms.h")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_NEON
+ "${AOM_ROOT}/aom_dsp/arm/aom_convolve_copy_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/fwd_txfm_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/loopfilter_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/intrapred_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/subtract_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/blend_a64_mask_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/avg_pred_neon.c")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_NEON_DOTPROD
+ "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_dotprod.c")
+
+list(APPEND AOM_DSP_COMMON_INTRIN_NEON_I8MM
+ "${AOM_ROOT}/aom_dsp/arm/aom_convolve8_neon_i8mm.c")
+
+if(CONFIG_AV1_HIGHBITDEPTH)
+ list(APPEND AOM_DSP_COMMON_INTRIN_SSE2
+ "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c")
+
+ list(APPEND AOM_DSP_COMMON_INTRIN_SSSE3
+ "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_ssse3.c")
+
+ list(APPEND AOM_DSP_COMMON_INTRIN_AVX2
+ "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_avx2.c")
+
+ list(APPEND AOM_DSP_COMMON_INTRIN_NEON
+ "${AOM_ROOT}/aom_dsp/arm/highbd_blend_a64_hmask_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_blend_a64_mask_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_blend_a64_vmask_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_convolve8_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_intrapred_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_loopfilter_neon.c")
+endif()
+
+if(CONFIG_AV1_DECODER)
+ list(APPEND AOM_DSP_DECODER_SOURCES
+ "${AOM_ROOT}/aom_dsp/binary_codes_reader.c"
+ "${AOM_ROOT}/aom_dsp/binary_codes_reader.h"
+ "${AOM_ROOT}/aom_dsp/bitreader.c"
+ "${AOM_ROOT}/aom_dsp/bitreader.h" "${AOM_ROOT}/aom_dsp/entdec.c"
+ "${AOM_ROOT}/aom_dsp/entdec.h")
+endif()
+
+if(CONFIG_AV1_ENCODER)
+ list(APPEND AOM_DSP_ENCODER_SOURCES
+ "${AOM_ROOT}/aom_dsp/avg.c"
+ "${AOM_ROOT}/aom_dsp/binary_codes_writer.c"
+ "${AOM_ROOT}/aom_dsp/binary_codes_writer.h"
+ "${AOM_ROOT}/aom_dsp/bitwriter.c"
+ "${AOM_ROOT}/aom_dsp/bitwriter.h"
+ "${AOM_ROOT}/aom_dsp/blk_sse_sum.c"
+ "${AOM_ROOT}/aom_dsp/entenc.c"
+ "${AOM_ROOT}/aom_dsp/entenc.h"
+ "${AOM_ROOT}/aom_dsp/fwd_txfm.c"
+ "${AOM_ROOT}/aom_dsp/grain_table.c"
+ "${AOM_ROOT}/aom_dsp/grain_table.h"
+ "${AOM_ROOT}/aom_dsp/noise_model.c"
+ "${AOM_ROOT}/aom_dsp/noise_model.h"
+ "${AOM_ROOT}/aom_dsp/noise_util.c"
+ "${AOM_ROOT}/aom_dsp/noise_util.h"
+ "${AOM_ROOT}/aom_dsp/psnr.c"
+ "${AOM_ROOT}/aom_dsp/psnr.h"
+ "${AOM_ROOT}/aom_dsp/quantize.c"
+ "${AOM_ROOT}/aom_dsp/quantize.h"
+ "${AOM_ROOT}/aom_dsp/sad.c"
+ "${AOM_ROOT}/aom_dsp/sad_av1.c"
+ "${AOM_ROOT}/aom_dsp/sse.c"
+ "${AOM_ROOT}/aom_dsp/ssim.c"
+ "${AOM_ROOT}/aom_dsp/ssim.h"
+ "${AOM_ROOT}/aom_dsp/sum_squares.c"
+ "${AOM_ROOT}/aom_dsp/variance.c"
+ "${AOM_ROOT}/aom_dsp/variance.h")
+
+ # Flow estimation library
+ if(NOT CONFIG_REALTIME_ONLY)
+ list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/pyramid.c"
+ "${AOM_ROOT}/aom_dsp/flow_estimation/corner_detect.c"
+ "${AOM_ROOT}/aom_dsp/flow_estimation/corner_match.c"
+ "${AOM_ROOT}/aom_dsp/flow_estimation/disflow.c"
+ "${AOM_ROOT}/aom_dsp/flow_estimation/flow_estimation.c"
+ "${AOM_ROOT}/aom_dsp/flow_estimation/ransac.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1
+ "${AOM_ROOT}/aom_dsp/flow_estimation/x86/corner_match_sse4.c"
+ "${AOM_ROOT}/aom_dsp/flow_estimation/x86/disflow_sse4.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2
+ "${AOM_ROOT}/aom_dsp/flow_estimation/x86/corner_match_avx2.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_NEON
+ "${AOM_ROOT}/aom_dsp/flow_estimation/arm/disflow_neon.c")
+ endif()
+
+ list(APPEND AOM_DSP_ENCODER_ASM_SSE2 "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/sad_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/subpel_variance_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/subtract_sse2.asm")
+
+ list(APPEND AOM_DSP_ENCODER_ASM_SSE2_X86_64
+ "${AOM_ROOT}/aom_dsp/x86/ssim_sse2_x86_64.asm")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_SSE2
+ "${AOM_ROOT}/aom_dsp/x86/avg_intrin_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_impl_sse2.h"
+ "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_sse2.h"
+ "${AOM_ROOT}/aom_dsp/x86/quantize_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/quantize_x86.h"
+ "${AOM_ROOT}/aom_dsp/x86/blk_sse_sum_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/jnt_sad_sse2.c")
+
+ list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64
+ "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm"
+ "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3_x86_64.asm")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2
+ "${AOM_ROOT}/aom_dsp/x86/avg_intrin_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/subtract_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_adaptive_quantize_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/quantize_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/sad4d_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/sad_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_sad_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/sad_impl_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/variance_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/sse_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/variance_impl_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/obmc_sad_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/obmc_variance_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/blk_sse_sum_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/sum_squares_avx2.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_AVX
+ "${AOM_ROOT}/aom_dsp/x86/aom_quantize_avx.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_SSSE3
+ "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.h"
+ "${AOM_ROOT}/aom_dsp/x86/masked_sad_intrin_ssse3.c"
+ "${AOM_ROOT}/aom_dsp/x86/masked_sad4d_ssse3.c"
+ "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.h"
+ "${AOM_ROOT}/aom_dsp/x86/masked_variance_intrin_ssse3.c"
+ "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3.c"
+ "${AOM_ROOT}/aom_dsp/x86/variance_impl_ssse3.c"
+ "${AOM_ROOT}/aom_dsp/x86/jnt_variance_ssse3.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1
+ "${AOM_ROOT}/aom_dsp/x86/avg_intrin_sse4.c"
+ "${AOM_ROOT}/aom_dsp/x86/sse_sse4.c"
+ "${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c"
+ "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_NEON
+ "${AOM_ROOT}/aom_dsp/arm/sadxd_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/sad_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/masked_sad_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/masked_sad4d_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/subpel_variance_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/variance_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/hadamard_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/avg_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/obmc_variance_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/obmc_sad_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/sse_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/sum_squares_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/blk_sse_sum_neon.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_NEON_DOTPROD
+ "${AOM_ROOT}/aom_dsp/arm/sad_neon_dotprod.c"
+ "${AOM_ROOT}/aom_dsp/arm/sadxd_neon_dotprod.c"
+ "${AOM_ROOT}/aom_dsp/arm/sse_neon_dotprod.c"
+ "${AOM_ROOT}/aom_dsp/arm/sum_squares_neon_dotprod.c"
+ "${AOM_ROOT}/aom_dsp/arm/variance_neon_dotprod.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_SVE "${AOM_ROOT}/aom_dsp/arm/avg_sve.c"
+ "${AOM_ROOT}/aom_dsp/arm/blk_sse_sum_sve.c"
+ "${AOM_ROOT}/aom_dsp/arm/sum_squares_sve.c")
+
+ if(CONFIG_AV1_HIGHBITDEPTH)
+ list(APPEND AOM_DSP_ENCODER_ASM_SSE2
+ "${AOM_ROOT}/aom_dsp/x86/highbd_sad4d_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_sad_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_variance_impl_sse2.asm")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_SSE2
+ "${AOM_ROOT}/aom_dsp/x86/highbd_adaptive_quantize_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_quantize_intrin_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2
+ "${AOM_ROOT}/aom_dsp/x86/highbd_variance_avx2.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_SSE4_1
+ "${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse4.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_NEON
+ "${AOM_ROOT}/aom_dsp/arm/highbd_avg_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_avg_pred_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_hadamard_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_masked_sad_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_obmc_sad_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_obmc_variance_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_quantize_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_sad_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_sadxd_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_sse_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_subpel_variance_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_variance_neon.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_NEON_DOTPROD
+ "${AOM_ROOT}/aom_dsp/arm/highbd_variance_neon_dotprod.c")
+
+ list(APPEND AOM_DSP_ENCODER_INTRIN_SVE
+ "${AOM_ROOT}/aom_dsp/arm/highbd_sse_sve.c"
+ "${AOM_ROOT}/aom_dsp/arm/highbd_variance_sve.c")
+ endif()
+
+ if(CONFIG_INTERNAL_STATS)
+ list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/fastssim.c"
+ "${AOM_ROOT}/aom_dsp/psnrhvs.c")
+ endif()
+
+ if(CONFIG_TUNE_VMAF)
+ list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/vmaf.c"
+ "${AOM_ROOT}/aom_dsp/vmaf.h")
+ endif()
+
+ if(CONFIG_TUNE_BUTTERAUGLI)
+ list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/butteraugli.c"
+ "${AOM_ROOT}/aom_dsp/butteraugli.h")
+ endif()
+
+ if(CONFIG_REALTIME_ONLY)
+ list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_AVX2
+ "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/obmc_sad_avx2.c"
+ "${AOM_ROOT}/aom_dsp/x86/obmc_variance_avx2.c")
+
+ list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_SSE4_1
+ "${AOM_ROOT}/aom_dsp/x86/obmc_sad_sse4.c"
+ "${AOM_ROOT}/aom_dsp/x86/obmc_variance_sse4.c")
+
+ list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_SSE2
+ "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_sse2.c")
+
+ list(REMOVE_ITEM AOM_DSP_ENCODER_INTRIN_NEON
+ "${AOM_ROOT}/aom_dsp/arm/highbd_obmc_variance_neon.c"
+ "${AOM_ROOT}/aom_dsp/arm/obmc_variance_neon.c")
+ endif()
+endif()
+
+# Creates aom_dsp build targets. Must not be called until after libaom target
+# has been created.
+function(setup_aom_dsp_targets)
+ add_library(aom_dsp_common OBJECT ${AOM_DSP_COMMON_SOURCES})
+ list(APPEND AOM_LIB_TARGETS aom_dsp_common)
+ create_no_op_source_file("aom_av1" "c" "no_op_source_file")
+ add_library(aom_dsp OBJECT "${no_op_source_file}")
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp_common>)
+ if(BUILD_SHARED_LIBS)
+ target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_dsp_common>)
+ endif()
+ list(APPEND AOM_LIB_TARGETS aom_dsp)
+
+ # Not all generators support libraries consisting only of object files. Add a
+ # source file to the aom_dsp target.
+ add_no_op_source_file_to_target("aom_dsp" "c")
+
+ if(CONFIG_AV1_DECODER)
+ add_library(aom_dsp_decoder OBJECT ${AOM_DSP_DECODER_SOURCES})
+ list(APPEND AOM_LIB_TARGETS aom_dsp_decoder)
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp_decoder>)
+ if(BUILD_SHARED_LIBS)
+ target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_dsp_decoder>)
+ endif()
+ endif()
+
+ if(CONFIG_AV1_ENCODER)
+ add_library(aom_dsp_encoder OBJECT ${AOM_DSP_ENCODER_SOURCES})
+ list(APPEND AOM_LIB_TARGETS aom_dsp_encoder)
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp_encoder>)
+ if(BUILD_SHARED_LIBS)
+ target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_dsp_encoder>)
+ endif()
+ if(CONFIG_TUNE_VMAF)
+ target_include_directories(aom_dsp_encoder PRIVATE ${VMAF_INCLUDE_DIRS})
+ endif()
+ endif()
+
+ if(HAVE_SSE2)
+ add_asm_library("aom_dsp_common_sse2" "AOM_DSP_COMMON_ASM_SSE2")
+ add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_common"
+ "AOM_DSP_COMMON_INTRIN_SSE2")
+
+ if(CONFIG_AV1_ENCODER)
+ if("${AOM_TARGET_CPU}" STREQUAL "x86_64")
+ list(APPEND AOM_DSP_ENCODER_ASM_SSE2 ${AOM_DSP_ENCODER_ASM_SSE2_X86_64})
+ endif()
+ add_asm_library("aom_dsp_encoder_sse2" "AOM_DSP_ENCODER_ASM_SSE2")
+ add_intrinsics_object_library("-msse2" "sse2" "aom_dsp_encoder"
+ "AOM_DSP_ENCODER_INTRIN_SSE2")
+ endif()
+ endif()
+
+ if(HAVE_SSSE3)
+ add_asm_library("aom_dsp_common_ssse3" "AOM_DSP_COMMON_ASM_SSSE3")
+ add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_common"
+ "AOM_DSP_COMMON_INTRIN_SSSE3")
+
+ if(CONFIG_AV1_ENCODER)
+ if("${AOM_TARGET_CPU}" STREQUAL "x86_64")
+ list(APPEND AOM_DSP_ENCODER_ASM_SSSE3
+ ${AOM_DSP_ENCODER_ASM_SSSE3_X86_64})
+ endif()
+ add_asm_library("aom_dsp_encoder_ssse3" "AOM_DSP_ENCODER_ASM_SSSE3")
+ add_intrinsics_object_library("-mssse3" "ssse3" "aom_dsp_encoder"
+ "AOM_DSP_ENCODER_INTRIN_SSSE3")
+ endif()
+ endif()
+
+ if(HAVE_SSE4_1)
+ add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_common"
+ "AOM_DSP_COMMON_INTRIN_SSE4_1")
+ if(CONFIG_AV1_ENCODER)
+ add_intrinsics_object_library("-msse4.1" "sse4_1" "aom_dsp_encoder"
+ "AOM_DSP_ENCODER_INTRIN_SSE4_1")
+ endif()
+ endif()
+
+ if(HAVE_AVX)
+ if(CONFIG_AV1_ENCODER)
+ add_intrinsics_object_library("-mavx" "avx" "aom_dsp_encoder"
+ "AOM_DSP_ENCODER_INTRIN_AVX")
+ endif()
+ endif()
+
+ if(HAVE_AVX2)
+ add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_common"
+ "AOM_DSP_COMMON_INTRIN_AVX2")
+ if(CONFIG_AV1_ENCODER)
+ add_intrinsics_object_library("-mavx2" "avx2" "aom_dsp_encoder"
+ "AOM_DSP_ENCODER_INTRIN_AVX2")
+ endif()
+ endif()
+
+ if(HAVE_NEON)
+ add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
+ "aom_dsp_common" "AOM_DSP_COMMON_INTRIN_NEON")
+ if(CONFIG_AV1_ENCODER)
+ add_intrinsics_object_library("${AOM_NEON_INTRIN_FLAG}" "neon"
+ "aom_dsp_encoder"
+ "AOM_DSP_ENCODER_INTRIN_NEON")
+ endif()
+ endif()
+
+ if(HAVE_NEON_DOTPROD)
+ add_intrinsics_object_library("${AOM_NEON_DOTPROD_FLAG}" "neon_dotprod"
+ "aom_dsp_common"
+ "AOM_DSP_COMMON_INTRIN_NEON_DOTPROD")
+ if(CONFIG_AV1_ENCODER)
+ add_intrinsics_object_library("${AOM_NEON_DOTPROD_FLAG}" "neon_dotprod"
+ "aom_dsp_encoder"
+ "AOM_DSP_ENCODER_INTRIN_NEON_DOTPROD")
+ endif()
+ endif()
+
+ if(HAVE_NEON_I8MM)
+ add_intrinsics_object_library("${AOM_NEON_I8MM_FLAG}" "neon_i8mm"
+ "aom_dsp_common"
+ "AOM_DSP_COMMON_INTRIN_NEON_I8MM")
+ endif()
+
+ if(HAVE_SVE)
+ if(CONFIG_AV1_ENCODER)
+ add_intrinsics_object_library("${AOM_SVE_FLAG}" "sve" "aom_dsp_encoder"
+ "AOM_DSP_ENCODER_INTRIN_SVE")
+ endif()
+ endif()
+
+ target_sources(aom PRIVATE $<TARGET_OBJECTS:aom_dsp>)
+ if(BUILD_SHARED_LIBS)
+ target_sources(aom_static PRIVATE $<TARGET_OBJECTS:aom_dsp>)
+ endif()
+
+ # Pass the new lib targets up to the parent scope instance of
+ # $AOM_LIB_TARGETS.
+ set(AOM_LIB_TARGETS ${AOM_LIB_TARGETS} PARENT_SCOPE)
+endfunction()
diff --git a/third_party/aom/aom_dsp/aom_dsp_common.h b/third_party/aom/aom_dsp/aom_dsp_common.h
new file mode 100644
index 0000000000..85dc0052e2
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_dsp_common.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_AOM_DSP_COMMON_H_
+#define AOM_AOM_DSP_AOM_DSP_COMMON_H_
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define PI 3.141592653589793238462643383279502884
+
+#define AOMMIN(x, y) (((x) < (y)) ? (x) : (y))
+#define AOMMAX(x, y) (((x) > (y)) ? (x) : (y))
+#define AOMSIGN(x) ((x) < 0 ? -1 : 0)
+
+#define NELEMENTS(x) (int)(sizeof(x) / sizeof(x[0]))
+
+#define IMPLIES(a, b) (!(a) || (b)) // Logical 'a implies b' (or 'a -> b')
+
+#define IS_POWER_OF_TWO(x) (((x) & ((x)-1)) == 0)
+
+/* Left shifting a negative value became undefined behavior in C99 (downgraded
+ from merely implementation-defined in C89). This should still compile to the
+ correct thing on any two's-complement machine, but avoid ubsan warnings.*/
+#define AOM_SIGNED_SHL(x, shift) ((x) * (((x)*0 + 1) << (shift)))
+
+// These can be used to give a hint about branch outcomes.
+// This can have an effect, even if your target processor has a
+// good branch predictor, as these hints can affect basic block
+// ordering by the compiler.
+#ifdef __GNUC__
+#define LIKELY(v) __builtin_expect(v, 1)
+#define UNLIKELY(v) __builtin_expect(v, 0)
+#else
+#define LIKELY(v) (v)
+#define UNLIKELY(v) (v)
+#endif
+
+typedef uint8_t qm_val_t;
+#define AOM_QM_BITS 5
+
+// Note:
+// tran_low_t is the datatype used for final transform coefficients.
+// tran_high_t is the datatype used for intermediate transform stages.
+typedef int64_t tran_high_t;
+typedef int32_t tran_low_t;
+
+static INLINE uint8_t clip_pixel(int val) {
+ return (val > 255) ? 255 : (val < 0) ? 0 : val;
+}
+
+static INLINE int clamp(int value, int low, int high) {
+ return value < low ? low : (value > high ? high : value);
+}
+
+static INLINE int64_t clamp64(int64_t value, int64_t low, int64_t high) {
+ return value < low ? low : (value > high ? high : value);
+}
+
+static INLINE double fclamp(double value, double low, double high) {
+ return value < low ? low : (value > high ? high : value);
+}
+
+static INLINE uint16_t clip_pixel_highbd(int val, int bd) {
+ switch (bd) {
+ case 8:
+ default: return (uint16_t)clamp(val, 0, 255);
+ case 10: return (uint16_t)clamp(val, 0, 1023);
+ case 12: return (uint16_t)clamp(val, 0, 4095);
+ }
+}
+
+// The result of this branchless code is equivalent to (value < 0 ? 0 : value)
+// or max(0, value) and might be faster in some cases.
+// Care should be taken since right shifting a negative value of signed type
+// is implementation-defined by the C standards.
+static INLINE unsigned int negative_to_zero(int value) {
+ return value & ~(value >> (sizeof(value) * 8 - 1));
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_AOM_DSP_COMMON_H_
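aom_dsp_common.h above defines a few small branchless helpers, notably negative_to_zero, which builds max(0, value) out of an arithmetic right shift. The harness below is only a sketch to check the equivalence: the two functions are copied from the header (with static inline standing in for the INLINE macro), and the test values are arbitrary.

/*
 * Quick self-contained check of two helpers from aom_dsp_common.h above.
 * The function bodies are copied from the header; the test harness and its
 * sample values are illustrative only.
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

static inline uint8_t clip_pixel(int val) {
  return (val > 255) ? 255 : (val < 0) ? 0 : val;
}

/* Branchless max(0, value): on two's-complement targets the arithmetic right
 * shift of a negative int yields all ones, so the mask clears the value. */
static inline unsigned int negative_to_zero(int value) {
  return value & ~(value >> (sizeof(value) * 8 - 1));
}

int main(void) {
  const int samples[] = { -300, -1, 0, 1, 200, 255, 256, 1000 };
  for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); ++i) {
    const int v = samples[i];
    assert(negative_to_zero(v) == (unsigned int)(v < 0 ? 0 : v));
    assert(clip_pixel(v) == (v < 0 ? 0 : v > 255 ? 255 : v));
  }
  return 0;
}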
diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd.c b/third_party/aom/aom_dsp/aom_dsp_rtcd.c
new file mode 100644
index 0000000000..0265dd1ee5
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_dsp_rtcd.c
@@ -0,0 +1,18 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include "config/aom_config.h"
+
+#define RTCD_C
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/aom_once.h"
+
+void aom_dsp_rtcd(void) { aom_once(setup_rtcd_internal); }
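aom_dsp_rtcd() above is the entry point for run-time CPU dispatch: aom_once() runs the generated setup_rtcd_internal() exactly once, and that routine points each dispatched function at the best implementation available on the host CPU, as declared in aom_dsp_rtcd_defs.pl below. The sketch that follows shows only the general shape of that pattern; every demo_* name is hypothetical, and the real generated config/aom_dsp_rtcd.h wires in the actual C/SIMD variants and the aom_ports feature probes.

/*
 * Sketch of the dispatch pattern behind aom_dsp_rtcd(). All demo_* names are
 * hypothetical; only the structure (function pointer defaulting to the C
 * path, reassigned once after a CPU probe) mirrors the real mechanism.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef void (*demo_copy_fn)(const uint8_t *src, uint8_t *dst, int n);

static void demo_copy_c(const uint8_t *src, uint8_t *dst, int n) {
  memcpy(dst, src, (size_t)n);
  puts("demo_copy_c");
}

static void demo_copy_fast(const uint8_t *src, uint8_t *dst, int n) {
  memcpy(dst, src, (size_t)n); /* stand-in for an SSE2/NEON body */
  puts("demo_copy_fast");
}

/* Every dispatched entry starts at the plain C implementation. */
static demo_copy_fn demo_copy = demo_copy_c;

/* Stand-in for the CPU-feature probe (aom_ports provides the real one). */
static int demo_have_fast_path(void) { return 1; }

/* Rough equivalent of the generated setup_rtcd_internal(). */
static void demo_setup_rtcd_internal(void) {
  if (demo_have_fast_path()) demo_copy = demo_copy_fast;
}

int main(void) {
  /* The real code runs setup exactly once via aom_once(); a single explicit
   * call is enough for this sketch. */
  demo_setup_rtcd_internal();

  uint8_t src[8] = { 1, 2, 3, 4, 5, 6, 7, 8 }, dst[8];
  demo_copy(src, dst, 8); /* callers always go through the pointer */
  printf("dst[0] = %d\n", dst[0]);
  return 0;
}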
diff --git a/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
new file mode 100755
index 0000000000..4b49605e53
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -0,0 +1,1798 @@
+##
+## Copyright (c) 2017, Alliance for Open Media. All rights reserved
+##
+## This source code is subject to the terms of the BSD 2 Clause License and
+## the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+## was not distributed with this source code in the LICENSE file, you can
+## obtain it at www.aomedia.org/license/software. If the Alliance for Open
+## Media Patent License 1.0 was not distributed with this source code in the
+## PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+##
+sub aom_dsp_forward_decls() {
+print <<EOF
+/*
+ * DSP
+ */
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/blockd.h"
+#include "av1/common/enums.h"
+
+EOF
+}
+forward_decls qw/aom_dsp_forward_decls/;
+
+# optimizations which depend on multiple features
+$avx2_ssse3 = '';
+if ((aom_config("HAVE_AVX2") eq "yes") && (aom_config("HAVE_SSSE3") eq "yes")) {
+ $avx2_ssse3 = 'avx2';
+}
+
+# functions that are 64 bit only.
+$mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = '';
+if ($opts{arch} eq "x86_64") {
+ $mmx_x86_64 = 'mmx';
+ $sse2_x86_64 = 'sse2';
+ $ssse3_x86_64 = 'ssse3';
+ $avx_x86_64 = 'avx';
+ $avx2_x86_64 = 'avx2';
+}
+
+@block_widths = (4, 8, 16, 32, 64, 128);
+
+@encoder_block_sizes = ();
+foreach $w (@block_widths) {
+ foreach $h (@block_widths) {
+ push @encoder_block_sizes, [$w, $h] if ($w <= 2*$h && $h <= 2*$w);
+ }
+}
+
+if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ push @encoder_block_sizes, [4, 16];
+ push @encoder_block_sizes, [16, 4];
+ push @encoder_block_sizes, [8, 32];
+ push @encoder_block_sizes, [32, 8];
+ push @encoder_block_sizes, [16, 64];
+ push @encoder_block_sizes, [64, 16];
+}
+
+@tx_dims = (4, 8, 16, 32, 64);
+@tx_sizes = ();
+foreach $w (@tx_dims) {
+ push @tx_sizes, [$w, $w];
+ foreach $h (@tx_dims) {
+ push @tx_sizes, [$w, $h] if ($w >=4 && $h >=4 && ($w == 2*$h || $h == 2*$w));
+ push @tx_sizes, [$w, $h] if ($w >=4 && $h >=4 && ($w == 4*$h || $h == 4*$w));
+ }
+}
+
+@pred_names = qw/dc dc_top dc_left dc_128 v h paeth smooth smooth_v smooth_h/;
+
+#
+# Intra prediction
+#
+
+foreach (@tx_sizes) {
+ ($w, $h) = @$_;
+ foreach $pred_name (@pred_names) {
+ add_proto "void", "aom_${pred_name}_predictor_${w}x${h}",
+ "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left";
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto "void", "aom_highbd_${pred_name}_predictor_${w}x${h}",
+ "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
+ }
+ }
+}
+
+specialize qw/aom_dc_top_predictor_4x4 neon sse2/;
+specialize qw/aom_dc_top_predictor_4x8 neon sse2/;
+specialize qw/aom_dc_top_predictor_4x16 neon sse2/;
+specialize qw/aom_dc_top_predictor_8x4 neon sse2/;
+specialize qw/aom_dc_top_predictor_8x8 neon sse2/;
+specialize qw/aom_dc_top_predictor_8x16 neon sse2/;
+specialize qw/aom_dc_top_predictor_8x32 neon sse2/;
+specialize qw/aom_dc_top_predictor_16x4 neon sse2/;
+specialize qw/aom_dc_top_predictor_16x8 neon sse2/;
+specialize qw/aom_dc_top_predictor_16x16 neon sse2/;
+specialize qw/aom_dc_top_predictor_16x32 neon sse2/;
+specialize qw/aom_dc_top_predictor_16x64 neon sse2/;
+specialize qw/aom_dc_top_predictor_32x8 neon sse2/;
+specialize qw/aom_dc_top_predictor_32x16 neon sse2 avx2/;
+specialize qw/aom_dc_top_predictor_32x32 neon sse2 avx2/;
+specialize qw/aom_dc_top_predictor_32x64 neon sse2 avx2/;
+specialize qw/aom_dc_top_predictor_64x16 neon sse2 avx2/;
+specialize qw/aom_dc_top_predictor_64x32 neon sse2 avx2/;
+specialize qw/aom_dc_top_predictor_64x64 neon sse2 avx2/;
+
+specialize qw/aom_dc_left_predictor_4x4 neon sse2/;
+specialize qw/aom_dc_left_predictor_4x8 neon sse2/;
+specialize qw/aom_dc_left_predictor_4x16 neon sse2/;
+specialize qw/aom_dc_left_predictor_8x4 neon sse2/;
+specialize qw/aom_dc_left_predictor_8x8 neon sse2/;
+specialize qw/aom_dc_left_predictor_8x16 neon sse2/;
+specialize qw/aom_dc_left_predictor_8x32 neon sse2/;
+specialize qw/aom_dc_left_predictor_16x4 neon sse2/;
+specialize qw/aom_dc_left_predictor_16x8 neon sse2/;
+specialize qw/aom_dc_left_predictor_16x16 neon sse2/;
+specialize qw/aom_dc_left_predictor_16x32 neon sse2/;
+specialize qw/aom_dc_left_predictor_16x64 neon sse2/;
+specialize qw/aom_dc_left_predictor_32x8 neon sse2/;
+specialize qw/aom_dc_left_predictor_32x16 neon sse2 avx2/;
+specialize qw/aom_dc_left_predictor_32x32 neon sse2 avx2/;
+specialize qw/aom_dc_left_predictor_32x64 neon sse2 avx2/;
+specialize qw/aom_dc_left_predictor_64x16 neon sse2 avx2/;
+specialize qw/aom_dc_left_predictor_64x32 neon sse2 avx2/;
+specialize qw/aom_dc_left_predictor_64x64 neon sse2 avx2/;
+
+specialize qw/aom_dc_128_predictor_4x4 neon sse2/;
+specialize qw/aom_dc_128_predictor_4x8 neon sse2/;
+specialize qw/aom_dc_128_predictor_4x16 neon sse2/;
+specialize qw/aom_dc_128_predictor_8x4 neon sse2/;
+specialize qw/aom_dc_128_predictor_8x8 neon sse2/;
+specialize qw/aom_dc_128_predictor_8x16 neon sse2/;
+specialize qw/aom_dc_128_predictor_8x32 neon sse2/;
+specialize qw/aom_dc_128_predictor_16x4 neon sse2/;
+specialize qw/aom_dc_128_predictor_16x8 neon sse2/;
+specialize qw/aom_dc_128_predictor_16x16 neon sse2/;
+specialize qw/aom_dc_128_predictor_16x32 neon sse2/;
+specialize qw/aom_dc_128_predictor_16x64 neon sse2/;
+specialize qw/aom_dc_128_predictor_32x8 neon sse2/;
+specialize qw/aom_dc_128_predictor_32x16 neon sse2 avx2/;
+specialize qw/aom_dc_128_predictor_32x32 neon sse2 avx2/;
+specialize qw/aom_dc_128_predictor_32x64 neon sse2 avx2/;
+specialize qw/aom_dc_128_predictor_64x16 neon sse2 avx2/;
+specialize qw/aom_dc_128_predictor_64x32 neon sse2 avx2/;
+specialize qw/aom_dc_128_predictor_64x64 neon sse2 avx2/;
+
+specialize qw/aom_v_predictor_4x4 neon sse2/;
+specialize qw/aom_v_predictor_4x8 neon sse2/;
+specialize qw/aom_v_predictor_4x16 neon sse2/;
+specialize qw/aom_v_predictor_8x4 neon sse2/;
+specialize qw/aom_v_predictor_8x8 neon sse2/;
+specialize qw/aom_v_predictor_8x16 neon sse2/;
+specialize qw/aom_v_predictor_8x32 neon sse2/;
+specialize qw/aom_v_predictor_16x4 neon sse2/;
+specialize qw/aom_v_predictor_16x8 neon sse2/;
+specialize qw/aom_v_predictor_16x16 neon sse2/;
+specialize qw/aom_v_predictor_16x32 neon sse2/;
+specialize qw/aom_v_predictor_16x64 neon sse2/;
+specialize qw/aom_v_predictor_32x8 neon sse2/;
+specialize qw/aom_v_predictor_32x16 neon sse2 avx2/;
+specialize qw/aom_v_predictor_32x32 neon sse2 avx2/;
+specialize qw/aom_v_predictor_32x64 neon sse2 avx2/;
+specialize qw/aom_v_predictor_64x16 neon sse2 avx2/;
+specialize qw/aom_v_predictor_64x32 neon sse2 avx2/;
+specialize qw/aom_v_predictor_64x64 neon sse2 avx2/;
+
+specialize qw/aom_h_predictor_4x4 neon sse2/;
+specialize qw/aom_h_predictor_4x8 neon sse2/;
+specialize qw/aom_h_predictor_4x16 neon sse2/;
+specialize qw/aom_h_predictor_8x4 neon sse2/;
+specialize qw/aom_h_predictor_8x8 neon sse2/;
+specialize qw/aom_h_predictor_8x16 neon sse2/;
+specialize qw/aom_h_predictor_8x32 neon sse2/;
+specialize qw/aom_h_predictor_16x4 neon sse2/;
+specialize qw/aom_h_predictor_16x8 neon sse2/;
+specialize qw/aom_h_predictor_16x16 neon sse2/;
+specialize qw/aom_h_predictor_16x32 neon sse2/;
+specialize qw/aom_h_predictor_16x64 neon sse2/;
+specialize qw/aom_h_predictor_32x8 neon sse2/;
+specialize qw/aom_h_predictor_32x16 neon sse2/;
+specialize qw/aom_h_predictor_32x32 neon sse2 avx2/;
+specialize qw/aom_h_predictor_32x64 neon sse2/;
+specialize qw/aom_h_predictor_64x16 neon sse2/;
+specialize qw/aom_h_predictor_64x32 neon sse2/;
+specialize qw/aom_h_predictor_64x64 neon sse2/;
+
+specialize qw/aom_paeth_predictor_4x4 ssse3 neon/;
+specialize qw/aom_paeth_predictor_4x8 ssse3 neon/;
+specialize qw/aom_paeth_predictor_4x16 ssse3 neon/;
+specialize qw/aom_paeth_predictor_8x4 ssse3 neon/;
+specialize qw/aom_paeth_predictor_8x8 ssse3 neon/;
+specialize qw/aom_paeth_predictor_8x16 ssse3 neon/;
+specialize qw/aom_paeth_predictor_8x32 ssse3 neon/;
+specialize qw/aom_paeth_predictor_16x4 ssse3 neon/;
+specialize qw/aom_paeth_predictor_16x8 ssse3 avx2 neon/;
+specialize qw/aom_paeth_predictor_16x16 ssse3 avx2 neon/;
+specialize qw/aom_paeth_predictor_16x32 ssse3 avx2 neon/;
+specialize qw/aom_paeth_predictor_16x64 ssse3 avx2 neon/;
+specialize qw/aom_paeth_predictor_32x8 ssse3 neon/;
+specialize qw/aom_paeth_predictor_32x16 ssse3 avx2 neon/;
+specialize qw/aom_paeth_predictor_32x32 ssse3 avx2 neon/;
+specialize qw/aom_paeth_predictor_32x64 ssse3 avx2 neon/;
+specialize qw/aom_paeth_predictor_64x16 ssse3 avx2 neon/;
+specialize qw/aom_paeth_predictor_64x32 ssse3 avx2 neon/;
+specialize qw/aom_paeth_predictor_64x64 ssse3 avx2 neon/;
+
+specialize qw/aom_smooth_predictor_4x4 neon ssse3/;
+specialize qw/aom_smooth_predictor_4x8 neon ssse3/;
+specialize qw/aom_smooth_predictor_4x16 neon ssse3/;
+specialize qw/aom_smooth_predictor_8x4 neon ssse3/;
+specialize qw/aom_smooth_predictor_8x8 neon ssse3/;
+specialize qw/aom_smooth_predictor_8x16 neon ssse3/;
+specialize qw/aom_smooth_predictor_8x32 neon ssse3/;
+specialize qw/aom_smooth_predictor_16x4 neon ssse3/;
+specialize qw/aom_smooth_predictor_16x8 neon ssse3/;
+specialize qw/aom_smooth_predictor_16x16 neon ssse3/;
+specialize qw/aom_smooth_predictor_16x32 neon ssse3/;
+specialize qw/aom_smooth_predictor_16x64 neon ssse3/;
+specialize qw/aom_smooth_predictor_32x8 neon ssse3/;
+specialize qw/aom_smooth_predictor_32x16 neon ssse3/;
+specialize qw/aom_smooth_predictor_32x32 neon ssse3/;
+specialize qw/aom_smooth_predictor_32x64 neon ssse3/;
+specialize qw/aom_smooth_predictor_64x16 neon ssse3/;
+specialize qw/aom_smooth_predictor_64x32 neon ssse3/;
+specialize qw/aom_smooth_predictor_64x64 neon ssse3/;
+
+specialize qw/aom_smooth_v_predictor_4x4 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_4x8 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_4x16 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_8x4 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_8x8 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_8x16 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_8x32 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_16x4 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_16x8 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_16x16 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_16x32 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_16x64 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_32x8 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_32x16 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_32x32 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_32x64 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_64x16 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_64x32 neon ssse3/;
+specialize qw/aom_smooth_v_predictor_64x64 neon ssse3/;
+
+specialize qw/aom_smooth_h_predictor_4x4 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_4x8 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_4x16 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_8x4 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_8x8 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_8x16 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_8x32 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_16x4 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_16x8 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_16x16 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_16x32 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_16x64 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_32x8 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_32x16 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_32x32 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_32x64 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_64x16 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_64x32 neon ssse3/;
+specialize qw/aom_smooth_h_predictor_64x64 neon ssse3/;
+
+# TODO(yunqingwang): optimize rectangular DC_PRED to replace division
+# by multiply and shift.
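+# A minimal sketch of the multiply-and-shift idea (not the in-tree
+# implementation): rectangular DC_PRED averages the W above and H left
+# pixels, i.e. dc = sum / (W + H). Since W + H is a per-block-size constant
+# (e.g. 12 for 4x8), the division can be replaced by a fixed-point
+# reciprocal, roughly dc = (sum * ((1 << 16) / 12 + 1)) >> 16, which matches
+# sum / 12 for every sum reachable with 8-bit pixels.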
+specialize qw/aom_dc_predictor_4x4 neon sse2/;
+specialize qw/aom_dc_predictor_4x8 neon sse2/;
+specialize qw/aom_dc_predictor_4x16 neon sse2/;
+specialize qw/aom_dc_predictor_8x4 neon sse2/;
+specialize qw/aom_dc_predictor_8x8 neon sse2/;
+specialize qw/aom_dc_predictor_8x16 neon sse2/;
+specialize qw/aom_dc_predictor_8x32 neon sse2/;
+specialize qw/aom_dc_predictor_16x4 neon sse2/;
+specialize qw/aom_dc_predictor_16x8 neon sse2/;
+specialize qw/aom_dc_predictor_16x16 neon sse2/;
+specialize qw/aom_dc_predictor_16x32 neon sse2/;
+specialize qw/aom_dc_predictor_16x64 neon sse2/;
+specialize qw/aom_dc_predictor_32x8 neon sse2/;
+specialize qw/aom_dc_predictor_32x16 neon sse2 avx2/;
+specialize qw/aom_dc_predictor_32x32 neon sse2 avx2/;
+specialize qw/aom_dc_predictor_32x64 neon sse2 avx2/;
+specialize qw/aom_dc_predictor_64x64 neon sse2 avx2/;
+specialize qw/aom_dc_predictor_64x32 neon sse2 avx2/;
+specialize qw/aom_dc_predictor_64x16 neon sse2 avx2/;
+if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ specialize qw/aom_highbd_v_predictor_4x4 sse2 neon/;
+ specialize qw/aom_highbd_v_predictor_4x8 sse2 neon/;
+ specialize qw/aom_highbd_v_predictor_4x16 neon/;
+ specialize qw/aom_highbd_v_predictor_8x4 sse2 neon/;
+ specialize qw/aom_highbd_v_predictor_8x8 sse2 neon/;
+ specialize qw/aom_highbd_v_predictor_8x16 sse2 neon/;
+ specialize qw/aom_highbd_v_predictor_8x32 neon/;
+ specialize qw/aom_highbd_v_predictor_16x4 neon/;
+ specialize qw/aom_highbd_v_predictor_16x8 sse2 neon/;
+ specialize qw/aom_highbd_v_predictor_16x16 sse2 neon/;
+ specialize qw/aom_highbd_v_predictor_16x32 sse2 neon/;
+ specialize qw/aom_highbd_v_predictor_16x64 neon/;
+ specialize qw/aom_highbd_v_predictor_32x8 neon/;
+ specialize qw/aom_highbd_v_predictor_32x16 sse2 neon/;
+ specialize qw/aom_highbd_v_predictor_32x32 sse2 neon/;
+ specialize qw/aom_highbd_v_predictor_32x64 neon/;
+ specialize qw/aom_highbd_v_predictor_64x16 neon/;
+ specialize qw/aom_highbd_v_predictor_64x32 neon/;
+ specialize qw/aom_highbd_v_predictor_64x64 neon/;
+
+ # TODO(yunqingwang): optimize rectangular DC_PRED to replace division
+ # by multiply and shift.
+ specialize qw/aom_highbd_dc_predictor_4x4 sse2 neon/;
+ specialize qw/aom_highbd_dc_predictor_4x8 sse2 neon/;
+ specialize qw/aom_highbd_dc_predictor_4x16 neon/;
+ specialize qw/aom_highbd_dc_predictor_8x4 sse2 neon/;
+ specialize qw/aom_highbd_dc_predictor_8x8 sse2 neon/;
+ specialize qw/aom_highbd_dc_predictor_8x16 sse2 neon/;
+ specialize qw/aom_highbd_dc_predictor_8x32 neon/;
+ specialize qw/aom_highbd_dc_predictor_16x4 neon/;
+ specialize qw/aom_highbd_dc_predictor_16x8 sse2 neon/;
+ specialize qw/aom_highbd_dc_predictor_16x16 sse2 neon/;
+ specialize qw/aom_highbd_dc_predictor_16x32 sse2 neon/;
+ specialize qw/aom_highbd_dc_predictor_16x64 neon/;
+ specialize qw/aom_highbd_dc_predictor_32x8 neon/;
+ specialize qw/aom_highbd_dc_predictor_32x16 sse2 neon/;
+ specialize qw/aom_highbd_dc_predictor_32x32 sse2 neon/;
+ specialize qw/aom_highbd_dc_predictor_32x64 neon/;
+ specialize qw/aom_highbd_dc_predictor_64x16 neon/;
+ specialize qw/aom_highbd_dc_predictor_64x32 neon/;
+ specialize qw/aom_highbd_dc_predictor_64x64 neon/;
+
+ specialize qw/aom_highbd_h_predictor_4x4 sse2 neon/;
+ specialize qw/aom_highbd_h_predictor_4x8 sse2 neon/;
+ specialize qw/aom_highbd_h_predictor_4x16 neon/;
+ specialize qw/aom_highbd_h_predictor_8x4 sse2 neon/;
+ specialize qw/aom_highbd_h_predictor_8x8 sse2 neon/;
+ specialize qw/aom_highbd_h_predictor_8x16 sse2 neon/;
+ specialize qw/aom_highbd_h_predictor_8x32 neon/;
+ specialize qw/aom_highbd_h_predictor_16x4 neon/;
+ specialize qw/aom_highbd_h_predictor_16x8 sse2 neon/;
+ specialize qw/aom_highbd_h_predictor_16x16 sse2 neon/;
+ specialize qw/aom_highbd_h_predictor_16x32 sse2 neon/;
+ specialize qw/aom_highbd_h_predictor_16x64 neon/;
+ specialize qw/aom_highbd_h_predictor_32x8 neon/;
+ specialize qw/aom_highbd_h_predictor_32x16 sse2 neon/;
+ specialize qw/aom_highbd_h_predictor_32x32 sse2 neon/;
+ specialize qw/aom_highbd_h_predictor_32x64 neon/;
+ specialize qw/aom_highbd_h_predictor_64x16 neon/;
+ specialize qw/aom_highbd_h_predictor_64x32 neon/;
+ specialize qw/aom_highbd_h_predictor_64x64 neon/;
+
+ specialize qw/aom_highbd_dc_128_predictor_4x4 sse2 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_4x8 sse2 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_4x16 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_8x4 sse2 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_8x8 sse2 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_8x16 sse2 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_8x32 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_16x4 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_16x8 sse2 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_16x16 sse2 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_16x32 sse2 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_16x64 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_32x8 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_32x16 sse2 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_32x32 sse2 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_32x64 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_64x16 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_64x32 neon/;
+ specialize qw/aom_highbd_dc_128_predictor_64x64 neon/;
+
+ specialize qw/aom_highbd_dc_left_predictor_4x4 sse2 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_4x8 sse2 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_4x16 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_8x4 sse2 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_8x8 sse2 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_8x16 sse2 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_8x32 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_16x4 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_16x8 sse2 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_16x16 sse2 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_16x32 sse2 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_16x64 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_32x8 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_32x16 sse2 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_32x32 sse2 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_32x64 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_64x16 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_64x32 neon/;
+ specialize qw/aom_highbd_dc_left_predictor_64x64 neon/;
+
+ specialize qw/aom_highbd_dc_top_predictor_4x4 sse2 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_4x8 sse2 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_4x16 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_8x4 sse2 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_8x8 sse2 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_8x16 sse2 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_8x32 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_16x4 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_16x8 sse2 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_16x16 sse2 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_16x32 sse2 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_16x64 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_32x8 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_32x16 sse2 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_32x32 sse2 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_32x64 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_64x16 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_64x32 neon/;
+ specialize qw/aom_highbd_dc_top_predictor_64x64 neon/;
+
+ specialize qw/aom_highbd_paeth_predictor_4x4 neon/;
+ specialize qw/aom_highbd_paeth_predictor_4x8 neon/;
+ specialize qw/aom_highbd_paeth_predictor_4x16 neon/;
+ specialize qw/aom_highbd_paeth_predictor_8x4 neon/;
+ specialize qw/aom_highbd_paeth_predictor_8x8 neon/;
+ specialize qw/aom_highbd_paeth_predictor_8x16 neon/;
+ specialize qw/aom_highbd_paeth_predictor_8x32 neon/;
+ specialize qw/aom_highbd_paeth_predictor_16x4 neon/;
+ specialize qw/aom_highbd_paeth_predictor_16x8 neon/;
+ specialize qw/aom_highbd_paeth_predictor_16x16 neon/;
+ specialize qw/aom_highbd_paeth_predictor_16x32 neon/;
+ specialize qw/aom_highbd_paeth_predictor_16x64 neon/;
+ specialize qw/aom_highbd_paeth_predictor_32x8 neon/;
+ specialize qw/aom_highbd_paeth_predictor_32x16 neon/;
+ specialize qw/aom_highbd_paeth_predictor_32x32 neon/;
+ specialize qw/aom_highbd_paeth_predictor_32x64 neon/;
+ specialize qw/aom_highbd_paeth_predictor_64x16 neon/;
+ specialize qw/aom_highbd_paeth_predictor_64x32 neon/;
+ specialize qw/aom_highbd_paeth_predictor_64x64 neon/;
+
+ specialize qw/aom_highbd_smooth_predictor_4x4 neon/;
+ specialize qw/aom_highbd_smooth_predictor_4x8 neon/;
+ specialize qw/aom_highbd_smooth_predictor_4x16 neon/;
+ specialize qw/aom_highbd_smooth_predictor_8x4 neon/;
+ specialize qw/aom_highbd_smooth_predictor_8x8 neon/;
+ specialize qw/aom_highbd_smooth_predictor_8x16 neon/;
+ specialize qw/aom_highbd_smooth_predictor_8x32 neon/;
+ specialize qw/aom_highbd_smooth_predictor_16x4 neon/;
+ specialize qw/aom_highbd_smooth_predictor_16x8 neon/;
+ specialize qw/aom_highbd_smooth_predictor_16x16 neon/;
+ specialize qw/aom_highbd_smooth_predictor_16x32 neon/;
+ specialize qw/aom_highbd_smooth_predictor_16x64 neon/;
+ specialize qw/aom_highbd_smooth_predictor_32x8 neon/;
+ specialize qw/aom_highbd_smooth_predictor_32x16 neon/;
+ specialize qw/aom_highbd_smooth_predictor_32x32 neon/;
+ specialize qw/aom_highbd_smooth_predictor_32x64 neon/;
+ specialize qw/aom_highbd_smooth_predictor_64x16 neon/;
+ specialize qw/aom_highbd_smooth_predictor_64x32 neon/;
+ specialize qw/aom_highbd_smooth_predictor_64x64 neon/;
+
+ specialize qw/aom_highbd_smooth_v_predictor_4x4 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_4x8 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_4x16 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_8x4 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_8x8 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_8x16 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_8x32 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_16x4 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_16x8 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_16x16 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_16x32 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_16x64 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_32x8 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_32x16 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_32x32 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_32x64 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_64x16 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_64x32 neon/;
+ specialize qw/aom_highbd_smooth_v_predictor_64x64 neon/;
+
+ specialize qw/aom_highbd_smooth_h_predictor_4x4 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_4x8 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_4x16 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_8x4 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_8x8 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_8x16 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_8x32 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_16x4 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_16x8 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_16x16 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_16x32 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_16x64 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_32x8 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_32x16 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_32x32 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_32x64 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_64x16 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_64x32 neon/;
+ specialize qw/aom_highbd_smooth_h_predictor_64x64 neon/;
+}
+#
+# Sub Pixel Filters
+#
+add_proto qw/void aom_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+add_proto qw/void aom_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, int w, int h";
+add_proto qw/void aom_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void aom_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
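+# In the prototypes above, filter_x/filter_y point at 8-tap InterpKernel rows
+# (see aom_filter.h) and the *_q4 values carry 4 fractional bits, so a step
+# of 16 corresponds to an unscaled convolution.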
+
+specialize qw/aom_convolve_copy neon sse2 avx2/;
+specialize qw/aom_convolve8_horiz neon neon_dotprod neon_i8mm sse2 ssse3/, "$avx2_ssse3";
+specialize qw/aom_convolve8_vert neon neon_dotprod neon_i8mm sse2 ssse3/, "$avx2_ssse3";
+
+add_proto qw/void aom_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
+specialize qw/aom_scaled_2d ssse3 neon/;
+
+if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_highbd_convolve_copy/, "const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, int w, int h";
+ specialize qw/aom_highbd_convolve_copy sse2 avx2 neon/;
+
+ add_proto qw/void aom_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd";
+ specialize qw/aom_highbd_convolve8_horiz sse2 avx2 neon/;
+
+ add_proto qw/void aom_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd";
+ specialize qw/aom_highbd_convolve8_vert sse2 avx2 neon/;
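+
+  # Note: following libaom's usual high bitdepth convention, the uint8_t
+  # src/dst pointers in the two prototypes above are CONVERT_TO_BYTEPTR()
+  # wrapped uint16_t buffers rather than true byte data.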
+}
+
+#
+# Loopfilter
+#
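+# Naming sketch: the numeric suffix is the deblocking filter length in
+# samples; judging by the prototypes, _dual variants take two independent
+# blimit/limit/thresh sets (two adjacent edges) while _quad variants share a
+# single blimit0/limit0/thresh0 set across four edges.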
+add_proto qw/void aom_lpf_vertical_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_vertical_14 sse2 neon/;
+
+add_proto qw/void aom_lpf_vertical_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_vertical_14_dual sse2 neon/;
+
+add_proto qw/void aom_lpf_vertical_14_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0";
+specialize qw/aom_lpf_vertical_14_quad avx2 sse2 neon/;
+
+add_proto qw/void aom_lpf_vertical_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_vertical_6 sse2 neon/;
+
+add_proto qw/void aom_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_vertical_8 sse2 neon/;
+
+add_proto qw/void aom_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_vertical_8_dual sse2 neon/;
+
+add_proto qw/void aom_lpf_vertical_8_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0";
+specialize qw/aom_lpf_vertical_8_quad sse2 neon/;
+
+add_proto qw/void aom_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_vertical_4 sse2 neon/;
+
+add_proto qw/void aom_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_vertical_4_dual sse2 neon/;
+
+add_proto qw/void aom_lpf_vertical_4_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0";
+specialize qw/aom_lpf_vertical_4_quad sse2 neon/;
+
+add_proto qw/void aom_lpf_horizontal_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_horizontal_14 sse2 neon/;
+
+add_proto qw/void aom_lpf_horizontal_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_horizontal_14_dual sse2 neon/;
+
+add_proto qw/void aom_lpf_horizontal_14_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0";
+specialize qw/aom_lpf_horizontal_14_quad sse2 avx2 neon/;
+
+add_proto qw/void aom_lpf_horizontal_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_horizontal_6 sse2 neon/;
+
+add_proto qw/void aom_lpf_horizontal_6_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_horizontal_6_dual sse2 neon/;
+
+add_proto qw/void aom_lpf_horizontal_6_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0";
+specialize qw/aom_lpf_horizontal_6_quad sse2 avx2 neon/;
+
+add_proto qw/void aom_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_horizontal_8 sse2 neon/;
+
+add_proto qw/void aom_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_horizontal_8_dual sse2 neon/;
+
+add_proto qw/void aom_lpf_horizontal_8_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0";
+specialize qw/aom_lpf_horizontal_8_quad sse2 avx2 neon/;
+
+add_proto qw/void aom_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
+specialize qw/aom_lpf_horizontal_4 sse2 neon/;
+
+add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_horizontal_4_dual sse2 neon/;
+
+add_proto qw/void aom_lpf_horizontal_4_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0";
+specialize qw/aom_lpf_horizontal_4_quad sse2 neon/;
+
+add_proto qw/void aom_lpf_vertical_6_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
+specialize qw/aom_lpf_vertical_6_dual sse2 neon/;
+
+add_proto qw/void aom_lpf_vertical_6_quad/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0";
+specialize qw/aom_lpf_vertical_6_quad sse2 neon/;
+
+if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_highbd_lpf_vertical_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/aom_highbd_lpf_vertical_14 neon sse2/;
+
+ add_proto qw/void aom_highbd_lpf_vertical_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+ specialize qw/aom_highbd_lpf_vertical_14_dual neon sse2 avx2/;
+
+ add_proto qw/void aom_highbd_lpf_vertical_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/aom_highbd_lpf_vertical_8 neon sse2/;
+
+ add_proto qw/void aom_highbd_lpf_vertical_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+ specialize qw/aom_highbd_lpf_vertical_8_dual neon sse2 avx2/;
+
+ add_proto qw/void aom_highbd_lpf_vertical_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/aom_highbd_lpf_vertical_6 neon sse2/;
+
+ add_proto qw/void aom_highbd_lpf_vertical_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+ specialize qw/aom_highbd_lpf_vertical_6_dual neon sse2/;
+
+ add_proto qw/void aom_highbd_lpf_vertical_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/aom_highbd_lpf_vertical_4 neon sse2/;
+
+ add_proto qw/void aom_highbd_lpf_vertical_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+ specialize qw/aom_highbd_lpf_vertical_4_dual neon sse2 avx2/;
+
+ add_proto qw/void aom_highbd_lpf_horizontal_14/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/aom_highbd_lpf_horizontal_14 neon sse2/;
+
+  add_proto qw/void aom_highbd_lpf_horizontal_14_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+ specialize qw/aom_highbd_lpf_horizontal_14_dual neon sse2 avx2/;
+
+ add_proto qw/void aom_highbd_lpf_horizontal_6/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/aom_highbd_lpf_horizontal_6 neon sse2/;
+
+ add_proto qw/void aom_highbd_lpf_horizontal_6_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+ specialize qw/aom_highbd_lpf_horizontal_6_dual neon sse2/;
+
+ add_proto qw/void aom_highbd_lpf_horizontal_8/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/aom_highbd_lpf_horizontal_8 neon sse2/;
+
+ add_proto qw/void aom_highbd_lpf_horizontal_8_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+ specialize qw/aom_highbd_lpf_horizontal_8_dual neon sse2 avx2/;
+
+ add_proto qw/void aom_highbd_lpf_horizontal_4/, "uint16_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int bd";
+ specialize qw/aom_highbd_lpf_horizontal_4 neon sse2/;
+
+ add_proto qw/void aom_highbd_lpf_horizontal_4_dual/, "uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1, int bd";
+ specialize qw/aom_highbd_lpf_horizontal_4_dual neon sse2 avx2/;
+}
+
+#
+# Encoder functions.
+#
+
+#
+# Forward transform
+#
+if (aom_config("CONFIG_AV1_ENCODER") eq "yes"){
+ add_proto qw/void aom_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct4x4 neon sse2/;
+
+ add_proto qw/void aom_fdct4x4_lp/, "const int16_t *input, int16_t *output, int stride";
+ specialize qw/aom_fdct4x4_lp neon sse2/;
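+  # As the prototypes show, the _lp (presumably "low precision") variant only
+  # differs from aom_fdct4x4 in writing int16_t coefficients instead of
+  # tran_low_t.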
+
+ if (aom_config("CONFIG_INTERNAL_STATS") eq "yes"){
+ # 8x8 DCT transform for psnr-hvs. Unlike other transforms isn't compatible
+ # with av1 scan orders, because it does two transposes.
+ add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_fdct8x8 neon sse2/, "$ssse3_x86_64";
+ # High bit depth
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+ specialize qw/aom_highbd_fdct8x8 sse2/;
+ }
+ }
+  # FFT/IFFT (float) is only used for denoising (and noise power spectral density estimation)
+ add_proto qw/void aom_fft2x2_float/, "const float *input, float *temp, float *output";
+
+ add_proto qw/void aom_fft4x4_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_fft4x4_float sse2/;
+
+ add_proto qw/void aom_fft8x8_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_fft8x8_float avx2 sse2/;
+
+ add_proto qw/void aom_fft16x16_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_fft16x16_float avx2 sse2/;
+
+ add_proto qw/void aom_fft32x32_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_fft32x32_float avx2 sse2/;
+
+ add_proto qw/void aom_ifft2x2_float/, "const float *input, float *temp, float *output";
+
+ add_proto qw/void aom_ifft4x4_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_ifft4x4_float sse2/;
+
+ add_proto qw/void aom_ifft8x8_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_ifft8x8_float avx2 sse2/;
+
+ add_proto qw/void aom_ifft16x16_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_ifft16x16_float avx2 sse2/;
+
+ add_proto qw/void aom_ifft32x32_float/, "const float *input, float *temp, float *output";
+ specialize qw/aom_ifft32x32_float avx2 sse2/;
+} # CONFIG_AV1_ENCODER
+
+#
+# Quantization
+#
+if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
+ add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/aom_quantize_b sse2 neon avx avx2/, "$ssse3_x86_64";
+
+ add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/aom_quantize_b_32x32 neon avx avx2/, "$ssse3_x86_64";
+
+ add_proto qw/void aom_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/aom_quantize_b_64x64 neon ssse3 avx2/;
+
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ add_proto qw/void aom_quantize_b_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/aom_quantize_b_adaptive sse2 avx2/;
+
+ add_proto qw/void aom_quantize_b_32x32_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/aom_quantize_b_32x32_adaptive sse2/;
+
+ add_proto qw/void aom_quantize_b_64x64_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/aom_quantize_b_64x64_adaptive sse2/;
+ }
+} # CONFIG_AV1_ENCODER
+
+if (aom_config("CONFIG_AV1_ENCODER") eq "yes" && aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/aom_highbd_quantize_b sse2 avx2 neon/;
+
+ add_proto qw/void aom_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/aom_highbd_quantize_b_32x32 sse2 avx2 neon/;
+
+ add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/aom_highbd_quantize_b_64x64 sse2 avx2 neon/;
+
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ add_proto qw/void aom_highbd_quantize_b_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/aom_highbd_quantize_b_adaptive sse2 avx2 neon/;
+
+ add_proto qw/void aom_highbd_quantize_b_32x32_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/aom_highbd_quantize_b_32x32_adaptive sse2 avx2 neon/;
+
+ add_proto qw/void aom_highbd_quantize_b_64x64_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/aom_highbd_quantize_b_64x64_adaptive sse2 neon/;
+ }
+} # CONFIG_AV1_ENCODER
+
+#
+# Alpha blending with mask
+#
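+# Rough per-pixel model of the a64 blends, for orientation only:
+# dst = (mask * src0 + (64 - mask) * src1 + 32) >> 6, with mask in [0, 64];
+# hmask reuses a single w-wide mask row for every row and vmask a single
+# h-tall mask column for every column, while the d16 variants blend the
+# 16-bit convolve intermediates described by conv_params.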
+add_proto qw/void aom_lowbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params";
+specialize qw/aom_lowbd_blend_a64_d16_mask sse4_1 avx2 neon/;
+add_proto qw/void aom_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh";
+add_proto qw/void aom_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h";
+add_proto qw/void aom_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h";
+specialize "aom_blend_a64_mask", qw/sse4_1 neon avx2/;
+specialize "aom_blend_a64_hmask", qw/sse4_1 neon/;
+specialize "aom_blend_a64_vmask", qw/sse4_1 neon/;
+
+if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_highbd_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, int bd";
+ add_proto qw/void aom_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd";
+ add_proto qw/void aom_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int w, int h, int bd";
+ add_proto qw/void aom_highbd_blend_a64_d16_mask/, "uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, ConvolveParams *conv_params, const int bd";
+ specialize "aom_highbd_blend_a64_mask", qw/sse4_1 neon/;
+ specialize "aom_highbd_blend_a64_hmask", qw/sse4_1 neon/;
+ specialize "aom_highbd_blend_a64_vmask", qw/sse4_1 neon/;
+ specialize "aom_highbd_blend_a64_d16_mask", qw/sse4_1 neon avx2/;
+}
+
+if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
+ #
+ # Block subtraction
+ #
+ add_proto qw/void aom_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
+ specialize qw/aom_subtract_block neon sse2 avx2/;
+
+ add_proto qw/int64_t/, "aom_sse", "const uint8_t *a, int a_stride, const uint8_t *b,int b_stride, int width, int height";
+ specialize qw/aom_sse sse4_1 avx2 neon neon_dotprod/;
+
+ add_proto qw/void/, "aom_get_blk_sse_sum", "const int16_t *data, int stride, int bw, int bh, int *x_sum, int64_t *x2_sum";
+ specialize qw/aom_get_blk_sse_sum sse2 avx2 neon sve/;
+
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
+ specialize qw/aom_highbd_subtract_block sse2 neon/;
+
+ add_proto qw/int64_t/, "aom_highbd_sse", "const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height";
+ specialize qw/aom_highbd_sse sse4_1 avx2 neon sve/;
+ }
+
+ #
+ # Sum of Squares
+ #
+ add_proto qw/uint64_t aom_sum_squares_2d_i16/, "const int16_t *src, int stride, int width, int height";
+ specialize qw/aom_sum_squares_2d_i16 sse2 avx2 neon sve/;
+
+ add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N";
+ specialize qw/aom_sum_squares_i16 sse2 neon sve/;
+
+ add_proto qw/uint64_t aom_var_2d_u8/, "uint8_t *src, int src_stride, int width, int height";
+ specialize qw/aom_var_2d_u8 sse2 avx2 neon neon_dotprod/;
+
+ add_proto qw/uint64_t aom_var_2d_u16/, "uint8_t *src, int src_stride, int width, int height";
+ specialize qw/aom_var_2d_u16 sse2 avx2 neon sve/;
+
+ #
+ # Single block SAD / Single block Avg SAD
+ #
+ foreach (@encoder_block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ add_proto qw/unsigned int/, "aom_sad_skip_${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ add_proto qw/unsigned int/, "aom_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ add_proto qw/unsigned int/, "aom_dist_wtd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param";
+ }
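+  # Rough semantics of the kernels generated above: plain SAD sums
+  # |src - ref| over the block, _avg compares src against the rounded average
+  # of ref and second_pred, dist_wtd weights that average by the fwd/bck
+  # offsets in jcp_param, and the _skip forms subsample every other row and
+  # scale the result as a cheaper estimate.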
+
+ add_proto qw/uint64_t aom_sum_sse_2d_i16/, "const int16_t *src, int src_stride, int width, int height, int *sum";
+ specialize qw/aom_sum_sse_2d_i16 avx2 neon sse2 sve/;
+ specialize qw/aom_sad128x128 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad128x64 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x128 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x64 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x32 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x64 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x32 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x16 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x32 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x16 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x8 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad8x16 sse2 neon/;
+ specialize qw/aom_sad8x8 sse2 neon/;
+ specialize qw/aom_sad8x4 sse2 neon/;
+ specialize qw/aom_sad4x8 sse2 neon/;
+ specialize qw/aom_sad4x4 sse2 neon/;
+
+ specialize qw/aom_sad4x16 sse2 neon/;
+ specialize qw/aom_sad16x4 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad8x32 sse2 neon/;
+ specialize qw/aom_sad32x8 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x64 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x16 sse2 neon neon_dotprod/;
+
+ specialize qw/aom_sad_skip_128x128 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_128x64 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_64x128 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_64x64 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_64x32 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_32x64 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_32x32 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_32x16 avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_16x32 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_16x16 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_16x8 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_8x16 sse2 neon/;
+ specialize qw/aom_sad_skip_8x8 sse2 neon/;
+ specialize qw/aom_sad_skip_8x4 neon/;
+ specialize qw/aom_sad_skip_4x8 sse2 neon/;
+ specialize qw/aom_sad_skip_4x4 neon/;
+
+ specialize qw/aom_sad_skip_4x16 sse2 neon/;
+ specialize qw/aom_sad_skip_16x4 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_8x32 sse2 neon/;
+ specialize qw/aom_sad_skip_32x8 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_16x64 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_64x16 sse2 neon neon_dotprod/;
+
+ specialize qw/aom_sad128x128_avg avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad128x64_avg avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x128_avg avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x64_avg avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x32_avg avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x64_avg avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x32_avg avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x16_avg avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x32_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x16_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x8_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_sad8x16_avg sse2 neon/;
+ specialize qw/aom_sad8x8_avg sse2 neon/;
+ specialize qw/aom_sad8x4_avg sse2 neon/;
+ specialize qw/aom_sad4x8_avg sse2 neon/;
+ specialize qw/aom_sad4x4_avg sse2 neon/;
+
+ specialize qw/aom_sad4x16_avg sse2 neon/;
+ specialize qw/aom_sad16x4_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_sad8x32_avg sse2 neon/;
+ specialize qw/aom_sad32x8_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x64_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x16_avg sse2 neon neon_dotprod/;
+
+ specialize qw/aom_dist_wtd_sad128x128_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad128x64_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad64x128_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad64x64_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad64x32_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad32x64_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad32x32_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad32x16_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad16x32_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad16x16_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad16x8_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad8x16_avg sse2 neon/;
+ specialize qw/aom_dist_wtd_sad8x8_avg sse2 neon/;
+ specialize qw/aom_dist_wtd_sad8x4_avg sse2 neon/;
+ specialize qw/aom_dist_wtd_sad4x8_avg sse2 neon/;
+ specialize qw/aom_dist_wtd_sad4x4_avg sse2 neon/;
+
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ specialize qw/aom_dist_wtd_sad4x16_avg sse2 neon/;
+ specialize qw/aom_dist_wtd_sad16x4_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad8x32_avg sse2 neon/;
+ specialize qw/aom_dist_wtd_sad32x8_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad16x64_avg sse2 neon neon_dotprod/;
+ specialize qw/aom_dist_wtd_sad64x16_avg sse2 neon neon_dotprod/;
+ }
+
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ foreach (@encoder_block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ add_proto qw/unsigned int/, "aom_highbd_sad_skip_${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+ add_proto qw/unsigned int/, "aom_highbd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+ if ($w != 128 && $h != 128 && $w != 4) {
+ specialize "aom_highbd_sad${w}x${h}", qw/sse2/;
+ specialize "aom_highbd_sad${w}x${h}_avg", qw/sse2/;
+ }
+ add_proto qw/unsigned int/, "aom_highbd_dist_wtd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param";
+ }
+ specialize qw/aom_highbd_sad128x128 avx2 neon/;
+ specialize qw/aom_highbd_sad128x64 avx2 neon/;
+ specialize qw/aom_highbd_sad64x128 avx2 neon/;
+ specialize qw/aom_highbd_sad64x64 avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad64x32 avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad32x64 avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad32x32 avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad32x16 avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad16x32 avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad16x16 avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad16x8 avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad8x16 sse2 neon/;
+ specialize qw/aom_highbd_sad8x8 sse2 neon/;
+ specialize qw/aom_highbd_sad8x4 sse2 neon/;
+ specialize qw/aom_highbd_sad4x8 sse2 neon/;
+ specialize qw/aom_highbd_sad4x4 sse2 neon/;
+
+ specialize qw/aom_highbd_sad4x16 sse2 neon/;
+ specialize qw/aom_highbd_sad16x4 avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad8x32 sse2 neon/;
+ specialize qw/aom_highbd_sad32x8 avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad16x64 avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad64x16 avx2 sse2 neon/;
+
+ specialize qw/aom_highbd_sad_skip_128x128 avx2 neon/;
+ specialize qw/aom_highbd_sad_skip_128x64 avx2 neon/;
+ specialize qw/aom_highbd_sad_skip_64x128 avx2 neon/;
+ specialize qw/aom_highbd_sad_skip_64x64 avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad_skip_64x32 avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad_skip_32x64 avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad_skip_32x32 avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad_skip_32x16 avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad_skip_16x32 avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad_skip_16x16 avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad_skip_16x8 avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad_skip_16x4 neon/;
+ specialize qw/aom_highbd_sad_skip_8x16 sse2 neon/;
+ specialize qw/aom_highbd_sad_skip_8x4 neon/;
+ specialize qw/aom_highbd_sad_skip_8x8 sse2 neon/;
+ specialize qw/aom_highbd_sad_skip_4x8 sse2 neon/;
+ specialize qw/aom_highbd_sad_skip_4x4 neon/;
+
+ specialize qw/aom_highbd_sad_skip_4x16 sse2 neon/;
+ specialize qw/aom_highbd_sad_skip_8x32 sse2 neon/;
+ specialize qw/aom_highbd_sad_skip_32x8 avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad_skip_16x64 avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad_skip_64x16 avx2 sse2 neon/;
+
+ specialize qw/aom_highbd_sad128x128_avg avx2 neon/;
+ specialize qw/aom_highbd_sad128x64_avg avx2 neon/;
+ specialize qw/aom_highbd_sad64x128_avg avx2 neon/;
+ specialize qw/aom_highbd_sad64x64_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad64x32_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad32x64_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad32x32_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad32x16_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad16x32_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad16x16_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad16x8_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad8x16_avg neon/;
+ specialize qw/aom_highbd_sad8x8_avg neon/;
+ specialize qw/aom_highbd_sad8x4_avg sse2 neon/;
+ specialize qw/aom_highbd_sad4x8_avg sse2 neon/;
+ specialize qw/aom_highbd_sad4x4_avg sse2 neon/;
+
+ specialize qw/aom_highbd_sad4x16_avg sse2 neon/;
+ specialize qw/aom_highbd_sad8x32_avg sse2 neon/;
+ specialize qw/aom_highbd_sad16x4_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad16x64_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad32x8_avg avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad64x16_avg avx2 sse2 neon/;
+ }
+ #
+ # Masked SAD
+ #
+ foreach (@encoder_block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_masked_sad${w}x${h}", "const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask";
+ specialize "aom_masked_sad${w}x${h}", qw/ssse3 avx2 neon/;
+ }
+
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ foreach (@encoder_block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_highbd_masked_sad${w}x${h}", "const uint8_t *src8, int src_stride, const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, int msk_stride, int invert_mask";
+ specialize "aom_highbd_masked_sad${w}x${h}", qw/ssse3 avx2 neon/;
+ }
+ }
+
+ #
+ # OBMC SAD
+ #
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ foreach (@encoder_block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
+ if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) {
+ specialize "aom_obmc_sad${w}x${h}", qw/sse4_1 avx2 neon/;
+ }
+ }
+
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ foreach (@encoder_block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
+ if (! (($w == 128 && $h == 32) || ($w == 32 && $h == 128))) {
+ specialize "aom_highbd_obmc_sad${w}x${h}", qw/sse4_1 avx2 neon/;
+ }
+ }
+ }
+ }
+
+ #
+ # Multi-block SAD, comparing a reference to N independent blocks
+ #
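+  # The x4d kernels compute the SADs of one source block against four
+  # reference blocks (ref_ptr[0..3], sharing ref_stride) in a single call,
+  # filling sad_array[4]; the x3d variants keep the same signature but only
+  # the first three results are meaningful, and _skip again subsamples rows.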
+ foreach (@encoder_block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/void/, "aom_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
+ add_proto qw/void/, "aom_sad${w}x${h}x3d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
+ add_proto qw/void/, "aom_sad_skip_${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
+ add_proto qw/void/, "aom_masked_sad${w}x${h}x4d", "const uint8_t *src, int src_stride, const uint8_t *ref[4], int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned sads[4]";
+ }
+
+ specialize qw/aom_sad128x128x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad128x64x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x128x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x64x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad64x32x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x64x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x32x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x16x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x32x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x16x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x8x4d avx2 sse2 neon neon_dotprod/;
+
+ specialize qw/aom_sad8x16x4d sse2 neon/;
+ specialize qw/aom_sad8x8x4d sse2 neon/;
+ specialize qw/aom_sad8x4x4d sse2 neon/;
+ specialize qw/aom_sad4x8x4d sse2 neon/;
+ specialize qw/aom_sad4x4x4d sse2 neon/;
+
+ specialize qw/aom_sad64x16x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad32x8x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x64x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad16x4x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad8x32x4d sse2 neon/;
+ specialize qw/aom_sad4x16x4d sse2 neon/;
+
+ specialize qw/aom_sad_skip_128x128x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_128x64x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_64x128x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_64x64x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_64x32x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_64x16x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_32x64x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_32x32x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_32x16x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_32x8x4d avx2 sse2 neon neon_dotprod/;
+
+ specialize qw/aom_sad_skip_16x64x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_16x32x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_16x16x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_16x8x4d avx2 sse2 neon neon_dotprod/;
+ specialize qw/aom_sad_skip_16x4x4d neon neon_dotprod/;
+ specialize qw/aom_sad_skip_8x32x4d sse2 neon/;
+ specialize qw/aom_sad_skip_8x16x4d sse2 neon/;
+ specialize qw/aom_sad_skip_8x8x4d sse2 neon/;
+ specialize qw/aom_sad_skip_8x4x4d neon/;
+ specialize qw/aom_sad_skip_4x16x4d sse2 neon/;
+ specialize qw/aom_sad_skip_4x8x4d sse2 neon/;
+ specialize qw/aom_sad_skip_4x4x4d neon/;
+
+ specialize qw/aom_sad128x128x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad128x64x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad64x128x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad64x64x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad64x32x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad32x64x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad32x32x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad32x16x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad16x32x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad16x16x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad16x8x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad8x16x3d neon/;
+ specialize qw/aom_sad8x8x3d neon/;
+ specialize qw/aom_sad8x4x3d neon/;
+ specialize qw/aom_sad4x8x3d neon/;
+ specialize qw/aom_sad4x4x3d neon/;
+
+ specialize qw/aom_sad64x16x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad32x8x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad16x64x3d avx2 neon neon_dotprod/;
+ specialize qw/aom_sad16x4x3d neon neon_dotprod/;
+ specialize qw/aom_sad8x32x3d neon/;
+ specialize qw/aom_sad4x16x3d neon/;
+
+ specialize qw/aom_masked_sad128x128x4d ssse3 neon/;
+ specialize qw/aom_masked_sad128x64x4d ssse3 neon/;
+ specialize qw/aom_masked_sad64x128x4d ssse3 neon/;
+ specialize qw/aom_masked_sad64x64x4d ssse3 neon/;
+ specialize qw/aom_masked_sad64x32x4d ssse3 neon/;
+ specialize qw/aom_masked_sad64x16x4d ssse3 neon/;
+ specialize qw/aom_masked_sad32x64x4d ssse3 neon/;
+ specialize qw/aom_masked_sad32x32x4d ssse3 neon/;
+ specialize qw/aom_masked_sad32x16x4d ssse3 neon/;
+ specialize qw/aom_masked_sad32x8x4d ssse3 neon/;
+ specialize qw/aom_masked_sad16x64x4d ssse3 neon/;
+ specialize qw/aom_masked_sad16x32x4d ssse3 neon/;
+ specialize qw/aom_masked_sad16x16x4d ssse3 neon/;
+ specialize qw/aom_masked_sad16x8x4d ssse3 neon/;
+
+ specialize qw/aom_masked_sad8x16x4d ssse3 neon/;
+ specialize qw/aom_masked_sad8x8x4d ssse3 neon/;
+ specialize qw/aom_masked_sad8x4x4d ssse3 neon/;
+ specialize qw/aom_masked_sad4x16x4d ssse3 neon/;
+ specialize qw/aom_masked_sad4x8x4d ssse3 neon/;
+ specialize qw/aom_masked_sad4x4x4d ssse3 neon/;
+
+ specialize qw/aom_masked_sad16x4x4d ssse3 neon/;
+ specialize qw/aom_masked_sad8x32x4d ssse3 neon/;
+ specialize qw/aom_masked_sad32x8x4d ssse3 neon/;
+ specialize qw/aom_masked_sad64x16x4d ssse3 neon/;
+ #
+ # Multi-block SAD, comparing a reference to N independent blocks
+ #
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ foreach (@encoder_block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/void/, "aom_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
+ add_proto qw/void/, "aom_highbd_sad${w}x${h}x3d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
+ add_proto qw/void/, "aom_highbd_sad_skip_${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[4], int ref_stride, uint32_t sad_array[4]";
+ if ($w != 128 && $h != 128) {
+ specialize "aom_highbd_sad${w}x${h}x4d", qw/sse2/;
+ }
+ }
+ specialize qw/aom_highbd_sad128x128x4d avx2 neon/;
+ specialize qw/aom_highbd_sad128x64x4d avx2 neon/;
+ specialize qw/aom_highbd_sad64x128x4d avx2 neon/;
+ specialize qw/aom_highbd_sad64x64x4d sse2 avx2 neon/;
+ specialize qw/aom_highbd_sad64x32x4d sse2 avx2 neon/;
+ specialize qw/aom_highbd_sad32x64x4d sse2 avx2 neon/;
+ specialize qw/aom_highbd_sad32x32x4d sse2 avx2 neon/;
+ specialize qw/aom_highbd_sad32x16x4d sse2 avx2 neon/;
+ specialize qw/aom_highbd_sad16x32x4d sse2 avx2 neon/;
+ specialize qw/aom_highbd_sad16x16x4d sse2 avx2 neon/;
+ specialize qw/aom_highbd_sad16x8x4d sse2 avx2 neon/;
+ specialize qw/aom_highbd_sad8x16x4d sse2 neon/;
+ specialize qw/aom_highbd_sad8x8x4d sse2 neon/;
+ specialize qw/aom_highbd_sad8x4x4d sse2 neon/;
+ specialize qw/aom_highbd_sad4x8x4d sse2 neon/;
+ specialize qw/aom_highbd_sad4x4x4d sse2 neon/;
+
+ specialize qw/aom_highbd_sad4x16x4d sse2 neon/;
+ specialize qw/aom_highbd_sad16x4x4d avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad8x32x4d sse2 neon/;
+ specialize qw/aom_highbd_sad32x8x4d avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad16x64x4d avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad64x16x4d avx2 sse2 neon/;
+
+ specialize qw/aom_highbd_sad_skip_128x128x4d avx2 neon/;
+ specialize qw/aom_highbd_sad_skip_128x64x4d avx2 neon/;
+ specialize qw/aom_highbd_sad_skip_64x128x4d avx2 neon/;
+ specialize qw/aom_highbd_sad_skip_64x64x4d avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad_skip_64x32x4d avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad_skip_32x64x4d avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad_skip_32x32x4d avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad_skip_32x16x4d avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad_skip_16x32x4d avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad_skip_16x16x4d avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad_skip_16x8x4d avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad_skip_16x4x4d neon/;
+ specialize qw/aom_highbd_sad_skip_8x16x4d sse2 neon/;
+ specialize qw/aom_highbd_sad_skip_8x8x4d sse2 neon/;
+ specialize qw/aom_highbd_sad_skip_8x4x4d neon/;
+ specialize qw/aom_highbd_sad_skip_4x8x4d sse2 neon/;
+ specialize qw/aom_highbd_sad_skip_4x4x4d neon/;
+
+ specialize qw/aom_highbd_sad_skip_4x16x4d sse2 neon/;
+ specialize qw/aom_highbd_sad_skip_8x32x4d sse2 neon/;
+ specialize qw/aom_highbd_sad_skip_32x8x4d avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad_skip_16x64x4d avx2 sse2 neon/;
+ specialize qw/aom_highbd_sad_skip_64x16x4d avx2 sse2 neon/;
+
+ specialize qw/aom_highbd_sad128x128x3d avx2 neon/;
+ specialize qw/aom_highbd_sad128x64x3d avx2 neon/;
+ specialize qw/aom_highbd_sad64x128x3d avx2 neon/;
+ specialize qw/aom_highbd_sad64x64x3d avx2 neon/;
+ specialize qw/aom_highbd_sad64x32x3d avx2 neon/;
+ specialize qw/aom_highbd_sad32x64x3d avx2 neon/;
+ specialize qw/aom_highbd_sad32x32x3d avx2 neon/;
+ specialize qw/aom_highbd_sad32x16x3d avx2 neon/;
+ specialize qw/aom_highbd_sad16x32x3d avx2 neon/;
+ specialize qw/aom_highbd_sad16x16x3d avx2 neon/;
+ specialize qw/aom_highbd_sad16x8x3d avx2 neon/;
+ specialize qw/aom_highbd_sad8x16x3d neon/;
+ specialize qw/aom_highbd_sad8x8x3d neon/;
+ specialize qw/aom_highbd_sad8x4x3d neon/;
+ specialize qw/aom_highbd_sad4x8x3d neon/;
+ specialize qw/aom_highbd_sad4x4x3d neon/;
+
+ specialize qw/aom_highbd_sad64x16x3d avx2 neon/;
+ specialize qw/aom_highbd_sad32x8x3d avx2 neon/;
+ specialize qw/aom_highbd_sad16x64x3d avx2 neon/;
+ specialize qw/aom_highbd_sad16x4x3d avx2 neon/;
+ specialize qw/aom_highbd_sad8x32x3d neon/;
+ specialize qw/aom_highbd_sad4x16x3d neon/;
+ }
+ #
+ # Avg
+ #
+ add_proto qw/unsigned int aom_avg_8x8/, "const uint8_t *, int p";
+ specialize qw/aom_avg_8x8 sse2 neon/;
+
+ add_proto qw/unsigned int aom_avg_4x4/, "const uint8_t *, int p";
+ specialize qw/aom_avg_4x4 sse2 neon/;
+
+ add_proto qw/void aom_avg_8x8_quad/, "const uint8_t *s, int p, int x16_idx, int y16_idx, int *avg";
+ specialize qw/aom_avg_8x8_quad avx2 sse2 neon/;
+
+ add_proto qw/void aom_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+ specialize qw/aom_minmax_8x8 sse2 neon/;
+
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/unsigned int aom_highbd_avg_8x8/, "const uint8_t *, int p";
+ specialize qw/aom_highbd_avg_8x8 neon/;
+ add_proto qw/unsigned int aom_highbd_avg_4x4/, "const uint8_t *, int p";
+ specialize qw/aom_highbd_avg_4x4 neon/;
+ add_proto qw/void aom_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+ specialize qw/aom_highbd_minmax_8x8 neon/;
+ }
+
+ add_proto qw/void aom_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor";
+ specialize qw/aom_int_pro_row avx2 sse2 neon/;
+
+ add_proto qw/void aom_int_pro_col/, "int16_t *vbuf, const uint8_t *ref, const int ref_stride, const int width, const int height, int norm_factor";
+ specialize qw/aom_int_pro_col avx2 sse2 neon/;
+
+ add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, int bwl";
+ specialize qw/aom_vector_var avx2 sse4_1 neon sve/;
+ # TODO(kyslov@) bring back SSE2 by extending it to 128 block size
+ #specialize qw/aom_vector_var neon sse2/;
+
+ #
+  # Hadamard transform and SATD for implementing the temporal dependency model
+ #
+ add_proto qw/void aom_hadamard_4x4/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+ specialize qw/aom_hadamard_4x4 sse2 neon/;
+
+ add_proto qw/void aom_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+ specialize qw/aom_hadamard_8x8 sse2 neon/;
+
+ add_proto qw/void aom_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+ specialize qw/aom_hadamard_16x16 avx2 sse2 neon/;
+
+ add_proto qw/void aom_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+ specialize qw/aom_hadamard_32x32 avx2 sse2 neon/;
+
+ add_proto qw/void aom_hadamard_lp_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
+ specialize qw/aom_hadamard_lp_8x8 sse2 neon/;
+
+ add_proto qw/void aom_hadamard_lp_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
+ specialize qw/aom_hadamard_lp_16x16 sse2 avx2 neon/;
+
+ add_proto qw/void aom_hadamard_lp_8x8_dual/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
+ specialize qw/aom_hadamard_lp_8x8_dual sse2 avx2 neon/;
+
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_highbd_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+ specialize qw/aom_highbd_hadamard_8x8 avx2 neon/;
+
+ add_proto qw/void aom_highbd_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+ specialize qw/aom_highbd_hadamard_16x16 avx2 neon/;
+
+ add_proto qw/void aom_highbd_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
+ specialize qw/aom_highbd_hadamard_32x32 avx2 neon/;
+ }
+ add_proto qw/int aom_satd/, "const tran_low_t *coeff, int length";
+ specialize qw/aom_satd neon sse2 avx2/;
+
+ add_proto qw/int aom_satd_lp/, "const int16_t *coeff, int length";
+ specialize qw/aom_satd_lp sse2 avx2 neon/;
+
+
+ #
+ # Structured Similarity (SSIM)
+ #
+ add_proto qw/void aom_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+ specialize qw/aom_ssim_parms_8x8/, "$sse2_x86_64";
+
+ if (aom_config("CONFIG_INTERNAL_STATS") eq "yes") {
+ add_proto qw/void aom_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+ specialize qw/aom_ssim_parms_16x16/, "$sse2_x86_64";
+ }
+
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+ }
+} # CONFIG_AV1_ENCODER
+
+if (aom_config("CONFIG_AV1_ENCODER") eq "yes") {
+
+ #
+ # Specialty Variance
+ #
+ add_proto qw/void aom_get_var_sse_sum_8x8_quad/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse8x8, int *sum8x8, unsigned int *tot_sse, int *tot_sum, uint32_t *var8x8";
+ specialize qw/aom_get_var_sse_sum_8x8_quad avx2 sse2 neon neon_dotprod/;
+
+ add_proto qw/void aom_get_var_sse_sum_16x16_dual/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse16x16, unsigned int *tot_sse, int *tot_sum, uint32_t *var16x16";
+ specialize qw/aom_get_var_sse_sum_16x16_dual avx2 sse2 neon neon_dotprod/;
+
+ add_proto qw/unsigned int aom_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int aom_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+
+ specialize qw/aom_mse16x16 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_mse16x8 sse2 neon neon_dotprod/;
+ specialize qw/aom_mse8x16 sse2 neon neon_dotprod/;
+ specialize qw/aom_mse8x8 sse2 neon neon_dotprod/;
+
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ foreach $bd (8, 10, 12) {
+ add_proto qw/unsigned int/, "aom_highbd_${bd}_mse16x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int/, "aom_highbd_${bd}_mse16x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+ add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
+
+ specialize "aom_highbd_${bd}_mse16x16", qw/sse2 neon sve/;
+ specialize "aom_highbd_${bd}_mse16x8", qw/neon sve/;
+ specialize "aom_highbd_${bd}_mse8x16", qw/neon sve/;
+ specialize "aom_highbd_${bd}_mse8x8", qw/sse2 neon sve/;
+ }
+
+ specialize "aom_highbd_8_mse16x16", qw/neon_dotprod/;
+ specialize "aom_highbd_8_mse16x8", qw/neon_dotprod/;
+ specialize "aom_highbd_8_mse8x16", qw/neon_dotprod/;
+ specialize "aom_highbd_8_mse8x8", qw/neon_dotprod/;
+ }
+
+  #
+  # Sum of squares
+  #
+ add_proto qw/unsigned int aom_get_mb_ss/, "const int16_t *";
+ specialize qw/aom_get_mb_ss sse2 neon/;
+
+ #
+ # Variance / Subpixel Variance / Subpixel Avg Variance
+ #
+  add_proto qw/uint64_t/, "aom_mse_wxh_16bit", "uint8_t *dst, int dstride, uint16_t *src, int sstride, int w, int h";
+ specialize qw/aom_mse_wxh_16bit sse2 avx2 neon/;
+
+  add_proto qw/uint64_t/, "aom_mse_16xh_16bit", "uint8_t *dst, int dstride, uint16_t *src, int w, int h";
+ specialize qw/aom_mse_16xh_16bit sse2 avx2 neon/;
+
+ foreach (@encoder_block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+ add_proto qw/uint32_t/, "aom_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ add_proto qw/uint32_t/, "aom_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ add_proto qw/uint32_t/, "aom_dist_wtd_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param";
+ }
+ specialize qw/aom_variance128x128 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance128x64 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance64x128 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance64x64 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance64x32 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance32x64 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance32x32 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance32x16 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance16x32 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance16x16 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance16x8 sse2 avx2 neon neon_dotprod/;
+ specialize qw/aom_variance8x16 sse2 neon neon_dotprod/;
+ specialize qw/aom_variance8x8 sse2 neon neon_dotprod/;
+ specialize qw/aom_variance8x4 sse2 neon neon_dotprod/;
+ specialize qw/aom_variance4x8 sse2 neon neon_dotprod/;
+ specialize qw/aom_variance4x4 sse2 neon neon_dotprod/;
+
+ specialize qw/aom_sub_pixel_variance128x128 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance128x64 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance64x128 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance64x64 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance64x32 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance32x64 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance32x32 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance32x16 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance16x32 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance16x16 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance16x8 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance8x16 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance8x8 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance8x4 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance4x8 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance4x4 neon sse2 ssse3/;
+
+ specialize qw/aom_sub_pixel_avg_variance128x128 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance128x64 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance64x128 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance64x64 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance64x32 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance32x64 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance32x32 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance32x16 avx2 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance16x32 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance16x16 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance16x8 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance8x16 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance8x8 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance8x4 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance4x8 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance4x4 neon sse2 ssse3/;
+
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ specialize qw/aom_variance4x16 neon neon_dotprod sse2/;
+ specialize qw/aom_variance16x4 neon neon_dotprod sse2 avx2/;
+ specialize qw/aom_variance8x32 neon neon_dotprod sse2/;
+ specialize qw/aom_variance32x8 neon neon_dotprod sse2 avx2/;
+ specialize qw/aom_variance16x64 neon neon_dotprod sse2 avx2/;
+ specialize qw/aom_variance64x16 neon neon_dotprod sse2 avx2/;
+
+ specialize qw/aom_sub_pixel_variance4x16 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance16x4 neon avx2 sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance8x32 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance32x8 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance16x64 neon avx2 sse2 ssse3/;
+ specialize qw/aom_sub_pixel_variance64x16 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance4x16 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance16x4 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance8x32 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance32x8 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance16x64 neon sse2 ssse3/;
+ specialize qw/aom_sub_pixel_avg_variance64x16 neon sse2 ssse3/;
+
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x16 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x4 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x32 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x8 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x64 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x16 neon ssse3/;
+ }
+
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x64 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x32 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x64 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x32 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance32x16 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x32 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x16 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x8 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x16 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x8 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance8x4 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x8 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x4 neon ssse3/;
+
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x128 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance128x64 neon ssse3/;
+ specialize qw/aom_dist_wtd_sub_pixel_avg_variance64x128 neon ssse3/;
+
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ foreach $bd (8, 10, 12) {
+ foreach (@encoder_block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_highbd_${bd}_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+ add_proto qw/uint32_t/, "aom_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+ add_proto qw/uint32_t/, "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS* jcp_param";
+ }
+ }
+
+ specialize qw/aom_highbd_12_variance128x128 sse2 neon sve/;
+ specialize qw/aom_highbd_12_variance128x64 sse2 neon sve/;
+ specialize qw/aom_highbd_12_variance64x128 sse2 neon sve/;
+ specialize qw/aom_highbd_12_variance64x64 sse2 neon sve/;
+ specialize qw/aom_highbd_12_variance64x32 sse2 neon sve/;
+ specialize qw/aom_highbd_12_variance32x64 sse2 neon sve/;
+ specialize qw/aom_highbd_12_variance32x32 sse2 neon sve/;
+ specialize qw/aom_highbd_12_variance32x16 sse2 neon sve/;
+ specialize qw/aom_highbd_12_variance16x32 sse2 neon sve/;
+ specialize qw/aom_highbd_12_variance16x16 sse2 neon sve/;
+ specialize qw/aom_highbd_12_variance16x8 sse2 neon sve/;
+ specialize qw/aom_highbd_12_variance8x16 sse2 neon sve/;
+ specialize qw/aom_highbd_12_variance8x8 sse2 neon sve/;
+ specialize qw/aom_highbd_12_variance8x4 neon sve/;
+ specialize qw/aom_highbd_12_variance4x8 neon sve/;
+ specialize qw/aom_highbd_12_variance4x4 sse4_1 neon sve/;
+
+ specialize qw/aom_highbd_10_variance128x128 sse2 avx2 neon sve/;
+ specialize qw/aom_highbd_10_variance128x64 sse2 avx2 neon sve/;
+ specialize qw/aom_highbd_10_variance64x128 sse2 avx2 neon sve/;
+ specialize qw/aom_highbd_10_variance64x64 sse2 avx2 neon sve/;
+ specialize qw/aom_highbd_10_variance64x32 sse2 avx2 neon sve/;
+ specialize qw/aom_highbd_10_variance32x64 sse2 avx2 neon sve/;
+ specialize qw/aom_highbd_10_variance32x32 sse2 avx2 neon sve/;
+ specialize qw/aom_highbd_10_variance32x16 sse2 avx2 neon sve/;
+ specialize qw/aom_highbd_10_variance16x32 sse2 avx2 neon sve/;
+ specialize qw/aom_highbd_10_variance16x16 sse2 avx2 neon sve/;
+ specialize qw/aom_highbd_10_variance16x8 sse2 avx2 neon sve/;
+ specialize qw/aom_highbd_10_variance8x16 sse2 avx2 neon sve/;
+ specialize qw/aom_highbd_10_variance8x8 sse2 avx2 neon sve/;
+ specialize qw/aom_highbd_10_variance8x4 neon sve/;
+ specialize qw/aom_highbd_10_variance4x8 neon sve/;
+ specialize qw/aom_highbd_10_variance4x4 sse4_1 neon sve/;
+
+ specialize qw/aom_highbd_8_variance128x128 sse2 neon sve/;
+ specialize qw/aom_highbd_8_variance128x64 sse2 neon sve/;
+ specialize qw/aom_highbd_8_variance64x128 sse2 neon sve/;
+ specialize qw/aom_highbd_8_variance64x64 sse2 neon sve/;
+ specialize qw/aom_highbd_8_variance64x32 sse2 neon sve/;
+ specialize qw/aom_highbd_8_variance32x64 sse2 neon sve/;
+ specialize qw/aom_highbd_8_variance32x32 sse2 neon sve/;
+ specialize qw/aom_highbd_8_variance32x16 sse2 neon sve/;
+ specialize qw/aom_highbd_8_variance16x32 sse2 neon sve/;
+ specialize qw/aom_highbd_8_variance16x16 sse2 neon sve/;
+ specialize qw/aom_highbd_8_variance16x8 sse2 neon sve/;
+ specialize qw/aom_highbd_8_variance8x16 sse2 neon sve/;
+ specialize qw/aom_highbd_8_variance8x8 sse2 neon sve/;
+ specialize qw/aom_highbd_8_variance8x4 neon sve/;
+ specialize qw/aom_highbd_8_variance4x8 neon sve/;
+ specialize qw/aom_highbd_8_variance4x4 sse4_1 neon sve/;
+
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ foreach $bd (8, 10, 12) {
+ my $avx2 = ($bd == 10) ? "avx2" : "";
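+      # Only the 10-bit variants of these rectangular sizes have an AVX2
+      # implementation.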
+ specialize "aom_highbd_${bd}_variance64x16" , $avx2, qw/sse2 neon sve/;
+ specialize "aom_highbd_${bd}_variance32x8" , $avx2, qw/sse2 neon sve/;
+ specialize "aom_highbd_${bd}_variance16x64" , $avx2, qw/sse2 neon sve/;
+ specialize "aom_highbd_${bd}_variance16x4" , qw/neon sve/;
+ specialize "aom_highbd_${bd}_variance8x32" , $avx2, qw/sse2 neon sve/;
+ specialize "aom_highbd_${bd}_variance4x16" , qw/neon sve/;
+ }
+ }
+
+ specialize qw/aom_highbd_12_sub_pixel_variance128x128 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance128x64 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance64x128 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance64x64 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance64x32 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance32x64 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance32x32 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance32x16 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance16x32 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance16x16 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance16x8 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance8x16 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance8x8 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance8x4 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance4x8 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_variance4x4 sse4_1 neon/;
+
+ specialize qw/aom_highbd_10_sub_pixel_variance128x128 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance128x64 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance64x128 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance64x64 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance64x32 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance32x64 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance32x32 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance32x16 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance16x32 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance16x16 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance16x8 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance8x16 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance8x8 sse2 avx2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance8x4 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance4x8 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_variance4x4 sse4_1 neon/;
+
+ specialize qw/aom_highbd_8_sub_pixel_variance128x128 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance128x64 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance64x128 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance64x64 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance64x32 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance32x64 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance32x32 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance32x16 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance16x32 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance16x16 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance16x8 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance8x16 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance8x8 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance8x4 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance4x8 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_variance4x4 sse4_1 neon/;
+
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ foreach $bd (8, 10, 12) {
+ specialize "aom_highbd_${bd}_sub_pixel_variance64x16" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_variance32x8" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_variance16x64" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_variance16x4" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_variance8x32" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_variance4x16" , qw/neon/;
+ }
+ }
+
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance128x128 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance128x64 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance64x128 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance64x64 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance64x32 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance32x64 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance32x32 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance32x16 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance16x32 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance16x16 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance16x8 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance8x16 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance8x8 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance8x4 sse2 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance4x8 neon/;
+ specialize qw/aom_highbd_12_sub_pixel_avg_variance4x4 sse4_1 neon/;
+
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance128x128 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance128x64 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance64x128 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance64x64 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance64x32 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance32x64 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance32x32 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance32x16 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance16x32 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance16x16 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance16x8 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance8x16 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance8x8 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance8x4 sse2 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance4x8 neon/;
+ specialize qw/aom_highbd_10_sub_pixel_avg_variance4x4 sse4_1 neon/;
+
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance128x128 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance128x64 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance64x128 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance64x64 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance64x32 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance32x64 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance32x32 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance32x16 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance16x32 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance16x16 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance16x8 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance8x16 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance8x8 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance8x4 sse2 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance4x8 neon/;
+ specialize qw/aom_highbd_8_sub_pixel_avg_variance4x4 sse4_1 neon/;
+
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ foreach $bd (8, 10, 12) {
+ specialize "aom_highbd_${bd}_sub_pixel_avg_variance64x16" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_avg_variance32x8" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_avg_variance16x64" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_avg_variance16x4" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_avg_variance8x32" , qw/sse2 neon/;
+ specialize "aom_highbd_${bd}_sub_pixel_avg_variance4x16" , qw/neon/;
+ }
+ }
+
+ foreach $bd (8, 10, 12) {
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance128x128", qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance128x64" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance64x128" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance64x64" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance64x32" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance32x64" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance32x32" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance32x16" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x32" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x16" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x8" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance8x16" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance8x8" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance8x4" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance4x8" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance4x4" , qw/neon/;
+ }
+
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ foreach $bd (8, 10, 12) {
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance64x16", qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance32x8" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x64", qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance16x4" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance8x32" , qw/neon/;
+ specialize "aom_highbd_${bd}_dist_wtd_sub_pixel_avg_variance4x16" , qw/neon/;
+ }
+ }
+ }
+ #
+ # Masked Variance / Masked Subpixel Variance
+ #
+ foreach (@encoder_block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse";
+ specialize "aom_masked_sub_pixel_variance${w}x${h}", qw/ssse3 neon/;
+ }
+
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ foreach $bd ("_8_", "_10_", "_12_") {
+ foreach (@encoder_block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse";
+ specialize "aom_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3 neon/;
+ }
+ }
+ }
+
+ #
+ # OBMC Variance / OBMC Subpixel Variance
+ #
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ foreach (@encoder_block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+ add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+ specialize "aom_obmc_variance${w}x${h}", qw/sse4_1 avx2 neon/;
+ specialize "aom_obmc_sub_pixel_variance${w}x${h}", qw/sse4_1 neon/;
+ }
+
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ foreach $bd ("_8_", "_10_", "_12_") {
+ foreach (@encoder_block_sizes) {
+ ($w, $h) = @$_;
+ add_proto qw/unsigned int/, "aom_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+ add_proto qw/unsigned int/, "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+ specialize "aom_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1 neon/;
+ specialize "aom_highbd${bd}obmc_sub_pixel_variance${w}x${h}", qw/neon/;
+ }
+ }
+ }
+ }
+
+ #
+ # Comp Avg
+ #
+ add_proto qw/void aom_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
+ specialize qw/aom_comp_avg_pred avx2 neon/;
+
+ add_proto qw/void aom_dist_wtd_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param";
+ specialize qw/aom_dist_wtd_comp_avg_pred ssse3 neon/;
+
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_highbd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
+ specialize qw/aom_highbd_comp_avg_pred neon/;
+
+ add_proto qw/void aom_highbd_dist_wtd_comp_avg_pred/, "uint8_t *comp_pred8, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const DIST_WTD_COMP_PARAMS *jcp_param";
+ specialize qw/aom_highbd_dist_wtd_comp_avg_pred sse2 neon/;
+
+    add_proto qw/uint64_t/, "aom_mse_wxh_16bit_highbd", "uint16_t *dst, int dstride, uint16_t *src, int sstride, int w, int h";
+ specialize qw/aom_mse_wxh_16bit_highbd sse2 avx2 neon sve/;
+ }
+
+ add_proto qw/void aom_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
+ specialize qw/aom_comp_mask_pred ssse3 avx2 neon/;
+
+ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void aom_highbd_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask";
+ specialize qw/aom_highbd_comp_mask_pred sse2 avx2 neon/;
+ }
+
+ # Flow estimation library
+ if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
+ add_proto qw/double av1_compute_cross_correlation/, "const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2";
+ specialize qw/av1_compute_cross_correlation sse4_1 avx2/;
+
+ add_proto qw/void aom_compute_flow_at_point/, "const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v";
+ specialize qw/aom_compute_flow_at_point sse4_1 neon/;
+ }
+
+} # CONFIG_AV1_ENCODER
+
+1;
diff --git a/third_party/aom/aom_dsp/aom_filter.h b/third_party/aom/aom_dsp/aom_filter.h
new file mode 100644
index 0000000000..00686ac388
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_filter.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_AOM_FILTER_H_
+#define AOM_AOM_DSP_AOM_FILTER_H_
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define FILTER_BITS 7
+
+#define SUBPEL_BITS 4
+#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1)
+#define SUBPEL_SHIFTS (1 << SUBPEL_BITS)
+#define SUBPEL_TAPS 8
+
+#define SCALE_SUBPEL_BITS 10
+#define SCALE_SUBPEL_SHIFTS (1 << SCALE_SUBPEL_BITS)
+#define SCALE_SUBPEL_MASK (SCALE_SUBPEL_SHIFTS - 1)
+#define SCALE_EXTRA_BITS (SCALE_SUBPEL_BITS - SUBPEL_BITS)
+#define SCALE_EXTRA_OFF ((1 << SCALE_EXTRA_BITS) / 2)
+
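+// Higher-precision subpel parameters used by the resize (superres) scaling
+// filters.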
+#define RS_SUBPEL_BITS 6
+#define RS_SUBPEL_MASK ((1 << RS_SUBPEL_BITS) - 1)
+#define RS_SCALE_SUBPEL_BITS 14
+#define RS_SCALE_SUBPEL_MASK ((1 << RS_SCALE_SUBPEL_BITS) - 1)
+#define RS_SCALE_EXTRA_BITS (RS_SCALE_SUBPEL_BITS - RS_SUBPEL_BITS)
+#define RS_SCALE_EXTRA_OFF (1 << (RS_SCALE_EXTRA_BITS - 1))
+
+typedef int16_t InterpKernel[SUBPEL_TAPS];
+
+#define BIL_SUBPEL_BITS 3
+#define BIL_SUBPEL_SHIFTS (1 << BIL_SUBPEL_BITS)
+
+// 2 tap bilinear filters
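+// Each pair of taps sums to 128 (1 << FILTER_BITS), i.e. each entry is a
+// fixed-point weighted average of two adjacent pixels.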
+static const uint8_t bilinear_filters_2t[BIL_SUBPEL_SHIFTS][2] = {
+ { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+ { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 },
+};
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_AOM_FILTER_H_
diff --git a/third_party/aom/aom_dsp/aom_simd.h b/third_party/aom/aom_dsp/aom_simd.h
new file mode 100644
index 0000000000..69da8f21b4
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_simd.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_AOM_SIMD_H_
+#define AOM_AOM_DSP_AOM_SIMD_H_
+
+#include <stdint.h>
+
+#if defined(_WIN32)
+#include <intrin.h>
+#endif
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/aom_simd_inline.h"
+
+#define SIMD_CHECK 1 // Sanity checks in C equivalents
+
+// VS compiling for 32 bit targets does not support vector types in
+// structs as arguments, which makes the v256 type of the intrinsics
+// hard to support, so optimizations for this target are disabled.
+#if HAVE_SSE2 && (defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__))
+#include "simd/v256_intrinsics_x86.h"
+#else
+#include "simd/v256_intrinsics.h"
+#endif
+
+#endif // AOM_AOM_DSP_AOM_SIMD_H_
diff --git a/third_party/aom/aom_dsp/aom_simd_inline.h b/third_party/aom/aom_dsp/aom_simd_inline.h
new file mode 100644
index 0000000000..b4b1b35637
--- /dev/null
+++ b/third_party/aom/aom_dsp/aom_simd_inline.h
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_AOM_SIMD_INLINE_H_
+#define AOM_AOM_DSP_AOM_SIMD_INLINE_H_
+
+#include "aom/aom_integer.h"
+
+#ifndef SIMD_INLINE
+#define SIMD_INLINE static AOM_FORCE_INLINE
+#endif
+
+#define SIMD_CLAMP(value, min, max) \
+ ((value) > (max) ? (max) : (value) < (min) ? (min) : (value))
+
+#endif // AOM_AOM_DSP_AOM_SIMD_INLINE_H_
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c
new file mode 100644
index 0000000000..7441108b01
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/aom_convolve8_neon.c
@@ -0,0 +1,349 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+
+static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t s2, const int16x4_t s3,
+ const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7,
+ const int16x8_t filter) {
+ const int16x4_t filter_lo = vget_low_s16(filter);
+ const int16x4_t filter_hi = vget_high_s16(filter);
+ int16x4_t sum;
+
+ sum = vmul_lane_s16(s0, filter_lo, 0);
+ sum = vmla_lane_s16(sum, s1, filter_lo, 1);
+ sum = vmla_lane_s16(sum, s2, filter_lo, 2);
+ sum = vmla_lane_s16(sum, s5, filter_hi, 1);
+ sum = vmla_lane_s16(sum, s6, filter_hi, 2);
+ sum = vmla_lane_s16(sum, s7, filter_hi, 3);
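+  // The two centre taps have the largest magnitude, so accumulate them last
+  // with saturating adds to guard against overflow of the 16-bit total.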
+ sum = vqadd_s16(sum, vmul_lane_s16(s3, filter_lo, 3));
+ sum = vqadd_s16(sum, vmul_lane_s16(s4, filter_hi, 0));
+ return sum;
+}
+
+static INLINE uint8x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1,
+ const int16x8_t s2, const int16x8_t s3,
+ const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7,
+ const int16x8_t filter) {
+ const int16x4_t filter_lo = vget_low_s16(filter);
+ const int16x4_t filter_hi = vget_high_s16(filter);
+ int16x8_t sum;
+
+ sum = vmulq_lane_s16(s0, filter_lo, 0);
+ sum = vmlaq_lane_s16(sum, s1, filter_lo, 1);
+ sum = vmlaq_lane_s16(sum, s2, filter_lo, 2);
+ sum = vmlaq_lane_s16(sum, s5, filter_hi, 1);
+ sum = vmlaq_lane_s16(sum, s6, filter_hi, 2);
+ sum = vmlaq_lane_s16(sum, s7, filter_hi, 3);
+ sum = vqaddq_s16(sum, vmulq_lane_s16(s3, filter_lo, 3));
+ sum = vqaddq_s16(sum, vmulq_lane_s16(s4, filter_hi, 0));
+ return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ const int16x8_t filter = vld1q_s16(filter_x);
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ (void)x_step_q4;
+ (void)filter_y;
+ (void)y_step_q4;
+
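+  // Start filtering 3 pixels to the left so the 8-tap window is centred on
+  // the output position ((SUBPEL_TAPS / 2) - 1 == 3).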
+ src -= ((SUBPEL_TAPS / 2) - 1);
+
+ if (h == 4) {
+ uint8x8_t t0, t1, t2, t3, d01, d23;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+
+ load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+ transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
+ s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
+ src += 7;
+
+ do {
+ load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+ transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3);
+ s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+ d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+ d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+ d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+ transpose_elems_inplace_u8_4x4(&d01, &d23);
+
+ store_u8x4_strided_x2(dst + 0 * dst_stride, 2 * dst_stride, d01);
+ store_u8x4_strided_x2(dst + 1 * dst_stride, 2 * dst_stride, d23);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ src += 4;
+ dst += 4;
+ w -= 4;
+ } while (w != 0);
+ } else {
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, d0, d1, d2, d3;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+
+ if (w == 4) {
+ do {
+ load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ load_u8_8x8(src + 7, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
+ &t7);
+ transpose_elems_u8_4x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2,
+ &t3);
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+ d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+ d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+ d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+
+ transpose_elems_inplace_u8_8x4(&d0, &d1, &d2, &d3);
+
+ store_u8x4_strided_x2(dst + 0 * dst_stride, 4 * dst_stride, d0);
+ store_u8x4_strided_x2(dst + 1 * dst_stride, 4 * dst_stride, d1);
+ store_u8x4_strided_x2(dst + 2 * dst_stride, 4 * dst_stride, d2);
+ store_u8x4_strided_x2(dst + 3 * dst_stride, 4 * dst_stride, d3);
+
+ src += 8 * src_stride;
+ dst += 8 * dst_stride;
+ h -= 8;
+ } while (h > 0);
+ } else {
+ uint8x8_t d4, d5, d6, d7;
+ int16x8_t s11, s12, s13, s14;
+ int width;
+ const uint8_t *s;
+ uint8_t *d;
+
+ do {
+ load_u8_8x8(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ width = w;
+ s = src + 7;
+ d = dst;
+
+ do {
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6,
+ &t7);
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+ d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+ d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+ d4 = convolve8_8(s4, s5, s6, s7, s8, s9, s10, s11, filter);
+ d5 = convolve8_8(s5, s6, s7, s8, s9, s10, s11, s12, filter);
+ d6 = convolve8_8(s6, s7, s8, s9, s10, s11, s12, s13, filter);
+ d7 = convolve8_8(s7, s8, s9, s10, s11, s12, s13, s14, filter);
+
+ transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6,
+ &d7);
+
+ store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);
+
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 8 * src_stride;
+ dst += 8 * dst_stride;
+ h -= 8;
+ } while (h > 0);
+ }
+ }
+}
+
+void aom_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ const int16x8_t filter = vld1q_s16(filter_y);
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ (void)filter_x;
+ (void)x_step_q4;
+ (void)y_step_q4;
+
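+  // Start filtering 3 rows above so the 8-tap window is centred vertically
+  // on the output row ((SUBPEL_TAPS / 2) - 1 == 3).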
+ src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
+
+ if (w == 4) {
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, d01, d23;
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
+
+ load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4)));
+ s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5)));
+ s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6)));
+
+ src += 7 * src_stride;
+
+ do {
+ load_u8_8x4(src, src_stride, &t0, &t1, &t2, &t3);
+ s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+ d0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ d1 = convolve8_4(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+ d2 = convolve8_4(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+ d3 = convolve8_4(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+ store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, d0, d1, d2, d3;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ int height;
+ const uint8_t *s;
+ uint8_t *d;
+
+ do {
+ load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+
+ height = h;
+ s = src + 7 * src_stride;
+ d = dst;
+
+ do {
+ load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+
+ d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter);
+ d1 = convolve8_8(s1, s2, s3, s4, s5, s6, s7, s8, filter);
+ d2 = convolve8_8(s2, s3, s4, s5, s6, s7, s8, s9, filter);
+ d3 = convolve8_8(s3, s4, s5, s6, s7, s8, s9, s10, filter);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon_dotprod.c b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_dotprod.c
new file mode 100644
index 0000000000..ac0a6efd00
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_dotprod.c
@@ -0,0 +1,460 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+
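+/* Each 16-byte row of this table gathers four overlapping 4-sample windows
+ * (each shifted by one sample), so one TBL lookup feeds four dot products. */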
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
+ 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+ 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
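+/* Interleaves the bytes of four concatenated 8-byte rows (a 4x8 byte
+ * transpose), yielding column-ordered samples for the dot products. */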
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
+ 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+ 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
+ /* Shift left and insert new last column in transposed 4x4 block. */
+ 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
+ /* Shift left and insert two new columns in transposed 4x4 block. */
+ 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
+ /* Shift left and insert three new columns in transposed 4x4 block. */
+ 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
+};
+
+static INLINE int16x4_t convolve8_4_sdot(uint8x16_t samples,
+ const int8x8_t filter,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x2_t permute_tbl) {
+ int8x16_t clamped_samples, permuted_samples[2];
+ int32x4_t sum;
+
+ /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+ clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+
+ /* Accumulate dot product into 'correction' to account for range clamp. */
+ sum = vdotq_lane_s32(correction, permuted_samples[0], filter, 0);
+ sum = vdotq_lane_s32(sum, permuted_samples[1], filter, 1);
+
+ /* Further narrowing and packing is performed by the caller. */
+ return vqmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples,
+ const int8x8_t filter,
+ const int32x4_t correction,
+ const uint8x16_t range_limit,
+ const uint8x16x3_t permute_tbl) {
+ int8x16_t clamped_samples, permuted_samples[3];
+ int32x4_t sum0, sum1;
+ int16x8_t sum;
+
+ /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+ clamped_samples = vreinterpretq_s8_u8(vsubq_u8(samples, range_limit));
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_s8(clamped_samples, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_s8(clamped_samples, permute_tbl.val[1]);
+ /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
+ permuted_samples[2] = vqtbl1q_s8(clamped_samples, permute_tbl.val[2]);
+
+ /* Accumulate dot product into 'correction' to account for range clamp. */
+ /* First 4 output values. */
+ sum0 = vdotq_lane_s32(correction, permuted_samples[0], filter, 0);
+ sum0 = vdotq_lane_s32(sum0, permuted_samples[1], filter, 1);
+ /* Second 4 output values. */
+ sum1 = vdotq_lane_s32(correction, permuted_samples[1], filter, 0);
+ sum1 = vdotq_lane_s32(sum1, permuted_samples[2], filter, 1);
+
+ /* Narrow and re-pack. */
+ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+ return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+void aom_convolve8_horiz_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x));
+ const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter_x), 128);
+ const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+ const uint8x16_t range_limit = vdupq_n_u8(128);
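+  /* The kernels below bias each sample by -128 (via 'range_limit') so that
+   * the unsigned input fits the signed 8-bit dot product. Since
+   *   sum(f[k] * (s[k] - 128)) + 128 * sum(f[k]) == sum(f[k] * s[k]),
+   * seeding the accumulator with 'correction' (128 * the sum of the filter
+   * taps, computed above) restores the unsigned result.
+   */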
+ uint8x16_t s0, s1, s2, s3;
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ (void)x_step_q4;
+ (void)filter_y;
+ (void)y_step_q4;
+
+ src -= ((SUBPEL_TAPS / 2) - 1);
+
+ if (w == 4) {
+ const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ int16x4_t t0, t1, t2, t3;
+ uint8x8_t d01, d23;
+
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ t0 = convolve8_4_sdot(s0, filter, correction, range_limit, perm_tbl);
+ t1 = convolve8_4_sdot(s1, filter, correction, range_limit, perm_tbl);
+ t2 = convolve8_4_sdot(s2, filter, correction, range_limit, perm_tbl);
+ t3 = convolve8_4_sdot(s3, filter, correction, range_limit, perm_tbl);
+ d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+
+ store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ const uint8_t *s;
+ uint8_t *d;
+ int width;
+ uint8x8_t d0, d1, d2, d3;
+
+ do {
+ width = w;
+ s = src;
+ d = dst;
+ do {
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ d0 = convolve8_8_sdot(s0, filter, correction, range_limit, perm_tbl);
+ d1 = convolve8_8_sdot(s1, filter, correction, range_limit, perm_tbl);
+ d2 = convolve8_8_sdot(s2, filter, correction, range_limit, perm_tbl);
+ d3 = convolve8_8_sdot(s3, filter, correction, range_limit, perm_tbl);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ }
+}
+
+static INLINE void transpose_concat_4x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
+ int8x8_t a3, int8x16_t *b,
+ const uint8x16_t permute_tbl) {
+ /* Transpose 8-bit elements and concatenate result rows as follows:
+ * a0: 00, 01, 02, 03, XX, XX, XX, XX
+ * a1: 10, 11, 12, 13, XX, XX, XX, XX
+ * a2: 20, 21, 22, 23, XX, XX, XX, XX
+ * a3: 30, 31, 32, 33, XX, XX, XX, XX
+ *
+ * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ *
+ * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+ * as an argument is preferable to loading it directly from memory as this
+ * inline helper is called many times from the same parent function.
+ */
+
+ int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
+ *b = vqtbl2q_s8(samples, permute_tbl);
+}
+
+static INLINE void transpose_concat_8x4(int8x8_t a0, int8x8_t a1, int8x8_t a2,
+ int8x8_t a3, int8x16_t *b0,
+ int8x16_t *b1,
+ const uint8x16x2_t permute_tbl) {
+ /* Transpose 8-bit elements and concatenate result rows as follows:
+ * a0: 00, 01, 02, 03, 04, 05, 06, 07
+ * a1: 10, 11, 12, 13, 14, 15, 16, 17
+ * a2: 20, 21, 22, 23, 24, 25, 26, 27
+ * a3: 30, 31, 32, 33, 34, 35, 36, 37
+ *
+ * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+ *
+ * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+ * as an argument is preferable to loading it directly from memory as this
+ * inline helper is called many times from the same parent function.
+ */
+
+ int8x16x2_t samples = { { vcombine_s8(a0, a1), vcombine_s8(a2, a3) } };
+ *b0 = vqtbl2q_s8(samples, permute_tbl.val[0]);
+ *b1 = vqtbl2q_s8(samples, permute_tbl.val[1]);
+}
+
+static INLINE int16x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo,
+ const int8x16_t samples_hi,
+ const int32x4_t correction,
+ const int8x8_t filter) {
+ /* Sample range-clamping and permutation are performed by the caller. */
+ int32x4_t sum;
+
+ /* Accumulate dot product into 'correction' to account for range clamp. */
+ sum = vdotq_lane_s32(correction, samples_lo, filter, 0);
+ sum = vdotq_lane_s32(sum, samples_hi, filter, 1);
+
+ /* Further narrowing and packing is performed by the caller. */
+ return vqmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve8_8_sdot_partial(const int8x16_t samples0_lo,
+ const int8x16_t samples0_hi,
+ const int8x16_t samples1_lo,
+ const int8x16_t samples1_hi,
+ const int32x4_t correction,
+ const int8x8_t filter) {
+ /* Sample range-clamping and permutation are performed by the caller. */
+ int32x4_t sum0, sum1;
+ int16x8_t sum;
+
+ /* Accumulate dot product into 'correction' to account for range clamp. */
+ /* First 4 output values. */
+ sum0 = vdotq_lane_s32(correction, samples0_lo, filter, 0);
+ sum0 = vdotq_lane_s32(sum0, samples0_hi, filter, 1);
+ /* Second 4 output values. */
+ sum1 = vdotq_lane_s32(correction, samples1_lo, filter, 0);
+ sum1 = vdotq_lane_s32(sum1, samples1_hi, filter, 1);
+
+ /* Narrow and re-pack. */
+ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+ return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+void aom_convolve8_vert_neon_dotprod(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
+ const int16x8_t correct_tmp = vmulq_n_s16(vld1q_s16(filter_y), 128);
+ const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp));
+ const uint8x8_t range_limit = vdup_n_u8(128);
+ const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6;
+ int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ int8x16x2_t samples_LUT;
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ (void)filter_x;
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
+
+ if (w == 4) {
+ const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+ int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+ int16x4_t d0, d1, d2, d3;
+ uint8x8_t d01, d23;
+
+ load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ src += 7 * src_stride;
+
+ /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+ s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+ s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+ s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+ s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+ s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+ s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+ s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+ s7 = vdup_n_s8(0);
+ s8 = vdup_n_s8(0);
+ s9 = vdup_n_s8(0);
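+  /* Rows 7-9 have not been loaded yet; they are zeroed here only so that the
+   * transpose_concat_4x4() calls below have defined inputs. The s4567, s5678
+   * and s6789 blocks they feed are rebuilt from real samples by the merge
+   * step at the start of the loop before being used in any convolution.
+   */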
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+ transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+ transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
+
+ do {
+ uint8x8_t t7, t8, t9, t10;
+
+ load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10);
+
+ s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+ s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+ s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+ s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456;
+ samples_LUT.val[1] = s78910;
+ s4567 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filter);
+ d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filter);
+ d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filter);
+ d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filter);
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+ store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123 = s4567;
+ s1234 = s5678;
+ s2345 = s6789;
+ s3456 = s78910;
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+ int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+ s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+ s6789_hi, s78910_lo, s78910_hi;
+ uint8x8_t d0, d1, d2, d3;
+ const uint8_t *s;
+ uint8_t *d;
+ int height;
+
+ do {
+ height = h;
+ s = src;
+ d = dst;
+
+ load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6);
+ s += 7 * src_stride;
+
+ /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */
+ s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit));
+ s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit));
+ s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit));
+ s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit));
+ s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit));
+ s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit));
+ s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit));
+ s7 = vdup_n_s8(0);
+ s8 = vdup_n_s8(0);
+ s9 = vdup_n_s8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
+ tran_concat_tbl);
+
+ do {
+ uint8x8_t t7, t8, t9, t10;
+
+ load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10);
+
+ s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit));
+ s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit));
+ s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit));
+ s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit));
+
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+ tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456_lo;
+ samples_LUT.val[1] = s78910_lo;
+ s4567_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_lo = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+ samples_LUT.val[0] = s3456_hi;
+ samples_LUT.val[1] = s78910_hi;
+ s4567_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+ correction, filter);
+ d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+ correction, filter);
+ d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+ correction, filter);
+ d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+ correction, filter);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123_lo = s4567_lo;
+ s0123_hi = s4567_hi;
+ s1234_lo = s5678_lo;
+ s1234_hi = s5678_hi;
+ s2345_lo = s6789_lo;
+ s2345_hi = s6789_hi;
+ s3456_lo = s78910_lo;
+ s3456_hi = s78910_hi;
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c
new file mode 100644
index 0000000000..c314c0a192
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/aom_convolve8_neon_i8mm.c
@@ -0,0 +1,408 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6,
+ 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+ 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_tran_concat_tbl[32]) = {
+ 0, 8, 16, 24, 1, 9, 17, 25, 2, 10, 18, 26, 3, 11, 19, 27,
+ 4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31
+};
+
+DECLARE_ALIGNED(16, static const uint8_t, dot_prod_merge_block_tbl[48]) = {
+ /* Shift left and insert new last column in transposed 4x4 block. */
+ 1, 2, 3, 16, 5, 6, 7, 20, 9, 10, 11, 24, 13, 14, 15, 28,
+ /* Shift left and insert two new columns in transposed 4x4 block. */
+ 2, 3, 16, 17, 6, 7, 20, 21, 10, 11, 24, 25, 14, 15, 28, 29,
+ /* Shift left and insert three new columns in transposed 4x4 block. */
+ 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30
+};
+
+static INLINE int16x4_t convolve8_4_usdot(const uint8x16_t samples,
+ const int8x8_t filter,
+ const uint8x16x2_t permute_tbl) {
+ uint8x16_t permuted_samples[2];
+ int32x4_t sum;
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+
+ sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filter, 0);
+ sum = vusdotq_lane_s32(sum, permuted_samples[1], filter, 1);
+
+ /* Further narrowing and packing is performed by the caller. */
+ return vqmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve8_8_usdot(const uint8x16_t samples,
+ const int8x8_t filter,
+ const uint8x16x3_t permute_tbl) {
+ uint8x16_t permuted_samples[3];
+ int32x4_t sum0, sum1;
+ int16x8_t sum;
+
+ /* Permute samples ready for dot product. */
+ /* { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 } */
+ permuted_samples[0] = vqtbl1q_u8(samples, permute_tbl.val[0]);
+ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */
+ permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]);
+ /* { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 } */
+ permuted_samples[2] = vqtbl1q_u8(samples, permute_tbl.val[2]);
+
+ /* First 4 output values. */
+ sum0 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filter, 0);
+ sum0 = vusdotq_lane_s32(sum0, permuted_samples[1], filter, 1);
+ /* Second 4 output values. */
+ sum1 = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[1], filter, 0);
+ sum1 = vusdotq_lane_s32(sum1, permuted_samples[2], filter, 1);
+
+ /* Narrow and re-pack. */
+ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+ return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+void aom_convolve8_horiz_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ const int8x8_t filter = vmovn_s16(vld1q_s16(filter_x));
+ uint8x16_t s0, s1, s2, s3;
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ (void)x_step_q4;
+ (void)filter_y;
+ (void)y_step_q4;
+
+ src -= ((SUBPEL_TAPS / 2) - 1);
+
+ if (w == 4) {
+ const uint8x16x2_t perm_tbl = vld1q_u8_x2(dot_prod_permute_tbl);
+ do {
+ int16x4_t t0, t1, t2, t3;
+ uint8x8_t d01, d23;
+
+ load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+ t0 = convolve8_4_usdot(s0, filter, perm_tbl);
+ t1 = convolve8_4_usdot(s1, filter, perm_tbl);
+ t2 = convolve8_4_usdot(s2, filter, perm_tbl);
+ t3 = convolve8_4_usdot(s3, filter, perm_tbl);
+ d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS);
+
+ store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ const uint8x16x3_t perm_tbl = vld1q_u8_x3(dot_prod_permute_tbl);
+ const uint8_t *s;
+ uint8_t *d;
+ int width;
+ uint8x8_t d0, d1, d2, d3;
+
+ do {
+ width = w;
+ s = src;
+ d = dst;
+ do {
+ load_u8_16x4(s, src_stride, &s0, &s1, &s2, &s3);
+
+ d0 = convolve8_8_usdot(s0, filter, perm_tbl);
+ d1 = convolve8_8_usdot(s1, filter, perm_tbl);
+ d2 = convolve8_8_usdot(s2, filter, perm_tbl);
+ d3 = convolve8_8_usdot(s3, filter, perm_tbl);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s += 8;
+ d += 8;
+ width -= 8;
+ } while (width != 0);
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ }
+}
+
+static INLINE void transpose_concat_4x4(uint8x8_t a0, uint8x8_t a1,
+ uint8x8_t a2, uint8x8_t a3,
+ uint8x16_t *b,
+ const uint8x16_t permute_tbl) {
+ /* Transpose 8-bit elements and concatenate result rows as follows:
+ * a0: 00, 01, 02, 03, XX, XX, XX, XX
+ * a1: 10, 11, 12, 13, XX, XX, XX, XX
+ * a2: 20, 21, 22, 23, XX, XX, XX, XX
+ * a3: 30, 31, 32, 33, XX, XX, XX, XX
+ *
+ * b: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ *
+ * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+ * as an argument is preferable to loading it directly from memory as this
+ * inline helper is called many times from the same parent function.
+ */
+
+ uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
+ *b = vqtbl2q_u8(samples, permute_tbl);
+}
+
+static INLINE void transpose_concat_8x4(uint8x8_t a0, uint8x8_t a1,
+ uint8x8_t a2, uint8x8_t a3,
+ uint8x16_t *b0, uint8x16_t *b1,
+ const uint8x16x2_t permute_tbl) {
+ /* Transpose 8-bit elements and concatenate result rows as follows:
+ * a0: 00, 01, 02, 03, 04, 05, 06, 07
+ * a1: 10, 11, 12, 13, 14, 15, 16, 17
+ * a2: 20, 21, 22, 23, 24, 25, 26, 27
+ * a3: 30, 31, 32, 33, 34, 35, 36, 37
+ *
+ * b0: 00, 10, 20, 30, 01, 11, 21, 31, 02, 12, 22, 32, 03, 13, 23, 33
+ * b1: 04, 14, 24, 34, 05, 15, 25, 35, 06, 16, 26, 36, 07, 17, 27, 37
+ *
+ * The 'permute_tbl' is always 'dot_prod_tran_concat_tbl' above. Passing it
+ * as an argument is preferable to loading it directly from memory as this
+ * inline helper is called many times from the same parent function.
+ */
+
+ uint8x16x2_t samples = { { vcombine_u8(a0, a1), vcombine_u8(a2, a3) } };
+ *b0 = vqtbl2q_u8(samples, permute_tbl.val[0]);
+ *b1 = vqtbl2q_u8(samples, permute_tbl.val[1]);
+}
+
+static INLINE int16x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo,
+ const uint8x16_t samples_hi,
+ const int8x8_t filter) {
+ /* Sample permutation is performed by the caller. */
+ int32x4_t sum;
+
+ sum = vusdotq_lane_s32(vdupq_n_s32(0), samples_lo, filter, 0);
+ sum = vusdotq_lane_s32(sum, samples_hi, filter, 1);
+
+ /* Further narrowing and packing is performed by the caller. */
+ return vqmovn_s32(sum);
+}
+
+static INLINE uint8x8_t convolve8_8_usdot_partial(const uint8x16_t samples0_lo,
+ const uint8x16_t samples0_hi,
+ const uint8x16_t samples1_lo,
+ const uint8x16_t samples1_hi,
+ const int8x8_t filter) {
+ /* Sample permutation is performed by the caller. */
+ int32x4_t sum0, sum1;
+ int16x8_t sum;
+
+ /* First 4 output values. */
+ sum0 = vusdotq_lane_s32(vdupq_n_s32(0), samples0_lo, filter, 0);
+ sum0 = vusdotq_lane_s32(sum0, samples0_hi, filter, 1);
+ /* Second 4 output values. */
+ sum1 = vusdotq_lane_s32(vdupq_n_s32(0), samples1_lo, filter, 0);
+ sum1 = vusdotq_lane_s32(sum1, samples1_hi, filter, 1);
+
+ /* Narrow and re-pack. */
+ sum = vcombine_s16(vqmovn_s32(sum0), vqmovn_s32(sum1));
+ return vqrshrun_n_s16(sum, FILTER_BITS);
+}
+
+void aom_convolve8_vert_neon_i8mm(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4, int w,
+ int h) {
+ const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y));
+ const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl);
+ uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
+ uint8x16x2_t samples_LUT;
+
+ assert((intptr_t)dst % 4 == 0);
+ assert(dst_stride % 4 == 0);
+
+ (void)filter_x;
+ (void)x_step_q4;
+ (void)y_step_q4;
+
+ src -= ((SUBPEL_TAPS / 2) - 1) * src_stride;
+
+ if (w == 4) {
+ const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl);
+ uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910;
+ int16x4_t d0, d1, d2, d3;
+ uint8x8_t d01, d23;
+
+ load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ src += 7 * src_stride;
+
+ s7 = vdup_n_u8(0);
+ s8 = vdup_n_u8(0);
+ s9 = vdup_n_u8(0);
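+  /* As in the dot-product path, rows 7-9 are zeroed only to give the initial
+   * transpose_concat calls defined inputs; the merge step inside the loop
+   * replaces them with real samples before they contribute to any output.
+   */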
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl);
+ transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl);
+ transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl);
+ transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl);
+ transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl);
+ transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl);
+ transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl);
+
+ do {
+ load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10);
+
+ transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456;
+ samples_LUT.val[1] = s78910;
+ s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_4_usdot_partial(s0123, s4567, filter);
+ d1 = convolve8_4_usdot_partial(s1234, s5678, filter);
+ d2 = convolve8_4_usdot_partial(s2345, s6789, filter);
+ d3 = convolve8_4_usdot_partial(s3456, s78910, filter);
+ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+ d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+ store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
+ store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123 = s4567;
+ s1234 = s5678;
+ s2345 = s6789;
+ s3456 = s78910;
+
+ src += 4 * src_stride;
+ dst += 4 * dst_stride;
+ h -= 4;
+ } while (h != 0);
+ } else {
+ const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl);
+ uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+ s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+ s6789_hi, s78910_lo, s78910_hi;
+ uint8x8_t d0, d1, d2, d3;
+ const uint8_t *s;
+ uint8_t *d;
+ int height;
+
+ do {
+ height = h;
+ s = src;
+ d = dst;
+
+ load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ s7 = vdup_n_u8(0);
+ s8 = vdup_n_u8(0);
+ s9 = vdup_n_u8(0);
+
+ /* This operation combines a conventional transpose and the sample permute
+ * (see horizontal case) required before computing the dot product.
+ */
+ transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi,
+ tran_concat_tbl);
+ transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi,
+ tran_concat_tbl);
+
+ do {
+ load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi,
+ tran_concat_tbl);
+
+ /* Merge new data into block from previous iteration. */
+ samples_LUT.val[0] = s3456_lo;
+ samples_LUT.val[1] = s78910_lo;
+ s4567_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_lo = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ samples_LUT.val[0] = s3456_hi;
+ samples_LUT.val[1] = s78910_hi;
+ s4567_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+ s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+ s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+ d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi,
+ filter);
+ d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi,
+ filter);
+ d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi,
+ filter);
+ d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi,
+ filter);
+
+ store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ /* Prepare block for next iteration - re-using as much as possible. */
+ /* Shuffle everything up four rows. */
+ s0123_lo = s4567_lo;
+ s0123_hi = s4567_hi;
+ s1234_lo = s5678_lo;
+ s1234_hi = s5678_hi;
+ s2345_lo = s6789_lo;
+ s2345_hi = s6789_hi;
+ s3456_lo = s78910_lo;
+ s3456_hi = s78910_hi;
+
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height != 0);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ } while (w != 0);
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c b/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c
new file mode 100644
index 0000000000..325d6f29ff
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/aom_convolve_copy_neon.c
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <string.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+void aom_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
+ const uint8_t *src1;
+ uint8_t *dst1;
+ int y;
+
+ if (!(w & 0x0F)) {
+ for (y = 0; y < h; ++y) {
+ src1 = src;
+ dst1 = dst;
+ for (int x = 0; x < (w >> 4); ++x) {
+ vst1q_u8(dst1, vld1q_u8(src1));
+ src1 += 16;
+ dst1 += 16;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+ } else if (!(w & 0x07)) {
+ for (y = 0; y < h; ++y) {
+ vst1_u8(dst, vld1_u8(src));
+ src += src_stride;
+ dst += dst_stride;
+ }
+ } else if (!(w & 0x03)) {
+ for (y = 0; y < h; ++y) {
+ memcpy(dst, src, sizeof(uint32_t));
+ src += src_stride;
+ dst += dst_stride;
+ }
+ } else if (!(w & 0x01)) {
+ for (y = 0; y < h; ++y) {
+ memcpy(dst, src, sizeof(uint16_t));
+ src += src_stride;
+ dst += dst_stride;
+ }
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride, int w,
+ int h) {
+ if (w < 8) { // copy4
+ uint16x4_t s0, s1;
+ do {
+ s0 = vld1_u16(src);
+ src += src_stride;
+ s1 = vld1_u16(src);
+ src += src_stride;
+
+ vst1_u16(dst, s0);
+ dst += dst_stride;
+ vst1_u16(dst, s1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w == 8) { // copy8
+ uint16x8_t s0, s1;
+ do {
+ s0 = vld1q_u16(src);
+ src += src_stride;
+ s1 = vld1q_u16(src);
+ src += src_stride;
+
+ vst1q_u16(dst, s0);
+ dst += dst_stride;
+ vst1q_u16(dst, s1);
+ dst += dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w < 32) { // copy16
+ uint16x8_t s0, s1, s2, s3;
+ do {
+ s0 = vld1q_u16(src);
+ s1 = vld1q_u16(src + 8);
+ src += src_stride;
+ s2 = vld1q_u16(src);
+ s3 = vld1q_u16(src + 8);
+ src += src_stride;
+
+ vst1q_u16(dst, s0);
+ vst1q_u16(dst + 8, s1);
+ dst += dst_stride;
+ vst1q_u16(dst, s2);
+ vst1q_u16(dst + 8, s3);
+ dst += dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w == 32) { // copy32
+ uint16x8_t s0, s1, s2, s3;
+ do {
+ s0 = vld1q_u16(src);
+ s1 = vld1q_u16(src + 8);
+ s2 = vld1q_u16(src + 16);
+ s3 = vld1q_u16(src + 24);
+ src += src_stride;
+
+ vst1q_u16(dst, s0);
+ vst1q_u16(dst + 8, s1);
+ vst1q_u16(dst + 16, s2);
+ vst1q_u16(dst + 24, s3);
+ dst += dst_stride;
+ } while (--h != 0);
+ } else { // copy64
+ uint16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ do {
+ const uint16_t *s = src;
+ uint16_t *d = dst;
+ int width = w;
+ do {
+ s0 = vld1q_u16(s);
+ s1 = vld1q_u16(s + 8);
+ s2 = vld1q_u16(s + 16);
+ s3 = vld1q_u16(s + 24);
+ s4 = vld1q_u16(s + 32);
+ s5 = vld1q_u16(s + 40);
+ s6 = vld1q_u16(s + 48);
+ s7 = vld1q_u16(s + 56);
+
+ vst1q_u16(d, s0);
+ vst1q_u16(d + 8, s1);
+ vst1q_u16(d + 16, s2);
+ vst1q_u16(d + 24, s3);
+ vst1q_u16(d + 32, s4);
+ vst1q_u16(d + 40, s5);
+ vst1q_u16(d + 48, s6);
+ vst1q_u16(d + 56, s7);
+ s += 64;
+ d += 64;
+ width -= 64;
+ } while (width > 0);
+ src += src_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ }
+}
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/arm/avg_neon.c b/third_party/aom/aom_dsp/arm/avg_neon.c
new file mode 100644
index 0000000000..2e79b2ef69
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/avg_neon.c
@@ -0,0 +1,309 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+
+unsigned int aom_avg_4x4_neon(const uint8_t *p, int stride) {
+ const uint8x8_t s0 = load_unaligned_u8(p, stride);
+ const uint8x8_t s1 = load_unaligned_u8(p + 2 * stride, stride);
+
+ const uint32_t sum = horizontal_add_u16x8(vaddl_u8(s0, s1));
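+  // Rounded average of the 16 pixels: (sum + 8) >> 4.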
+ return (sum + (1 << 3)) >> 4;
+}
+
+unsigned int aom_avg_8x8_neon(const uint8_t *p, int stride) {
+ uint8x8_t s0 = vld1_u8(p);
+ p += stride;
+ uint8x8_t s1 = vld1_u8(p);
+ p += stride;
+ uint16x8_t acc = vaddl_u8(s0, s1);
+
+ int i = 0;
+ do {
+ const uint8x8_t si = vld1_u8(p);
+ p += stride;
+ acc = vaddw_u8(acc, si);
+ } while (++i < 6);
+
+ const uint32_t sum = horizontal_add_u16x8(acc);
+ return (sum + (1 << 5)) >> 6;
+}
+
+void aom_avg_8x8_quad_neon(const uint8_t *s, int p, int x16_idx, int y16_idx,
+ int *avg) {
+ avg[0] = aom_avg_8x8_neon(s + y16_idx * p + x16_idx, p);
+ avg[1] = aom_avg_8x8_neon(s + y16_idx * p + (x16_idx + 8), p);
+ avg[2] = aom_avg_8x8_neon(s + (y16_idx + 8) * p + x16_idx, p);
+ avg[3] = aom_avg_8x8_neon(s + (y16_idx + 8) * p + (x16_idx + 8), p);
+}
+
+int aom_satd_lp_neon(const int16_t *coeff, int length) {
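+  // Sum of absolute values of the 16-bit coefficients, accumulated pairwise
+  // into 32-bit lanes to avoid overflow.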
+ int16x8_t s0 = vld1q_s16(coeff);
+ int16x8_t s1 = vld1q_s16(coeff + 8);
+
+ int16x8_t abs0 = vabsq_s16(s0);
+ int16x8_t abs1 = vabsq_s16(s1);
+
+ int32x4_t acc0 = vpaddlq_s16(abs0);
+ int32x4_t acc1 = vpaddlq_s16(abs1);
+
+ length -= 16;
+ coeff += 16;
+
+ while (length != 0) {
+ s0 = vld1q_s16(coeff);
+ s1 = vld1q_s16(coeff + 8);
+
+ abs0 = vabsq_s16(s0);
+ abs1 = vabsq_s16(s1);
+
+ acc0 = vpadalq_s16(acc0, abs0);
+ acc1 = vpadalq_s16(acc1, abs1);
+
+ length -= 16;
+ coeff += 16;
+ }
+
+ int32x4_t accum = vaddq_s32(acc0, acc1);
+ return horizontal_add_s32x4(accum);
+}
+
+void aom_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor) {
+ assert(width % 16 == 0);
+ assert(height % 4 == 0);
+
+ const int16x8_t neg_norm_factor = vdupq_n_s16(-norm_factor);
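+  // vshlq_s16 with a negative shift count performs a right shift, dividing
+  // each accumulated sum by 1 << norm_factor.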
+ uint16x8_t sum_lo[2], sum_hi[2];
+
+ int w = 0;
+ do {
+ const uint8_t *r = ref + w;
+ uint8x16_t r0 = vld1q_u8(r + 0 * ref_stride);
+ uint8x16_t r1 = vld1q_u8(r + 1 * ref_stride);
+ uint8x16_t r2 = vld1q_u8(r + 2 * ref_stride);
+ uint8x16_t r3 = vld1q_u8(r + 3 * ref_stride);
+
+ sum_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1));
+ sum_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1));
+ sum_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3));
+ sum_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3));
+
+ r += 4 * ref_stride;
+
+ for (int h = height - 4; h != 0; h -= 4) {
+ r0 = vld1q_u8(r + 0 * ref_stride);
+ r1 = vld1q_u8(r + 1 * ref_stride);
+ r2 = vld1q_u8(r + 2 * ref_stride);
+ r3 = vld1q_u8(r + 3 * ref_stride);
+
+ uint16x8_t tmp0_lo = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1));
+ uint16x8_t tmp0_hi = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1));
+ uint16x8_t tmp1_lo = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3));
+ uint16x8_t tmp1_hi = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3));
+
+ sum_lo[0] = vaddq_u16(sum_lo[0], tmp0_lo);
+ sum_hi[0] = vaddq_u16(sum_hi[0], tmp0_hi);
+ sum_lo[1] = vaddq_u16(sum_lo[1], tmp1_lo);
+ sum_hi[1] = vaddq_u16(sum_hi[1], tmp1_hi);
+
+ r += 4 * ref_stride;
+ }
+
+ sum_lo[0] = vaddq_u16(sum_lo[0], sum_lo[1]);
+ sum_hi[0] = vaddq_u16(sum_hi[0], sum_hi[1]);
+
+ const int16x8_t avg0 =
+ vshlq_s16(vreinterpretq_s16_u16(sum_lo[0]), neg_norm_factor);
+ const int16x8_t avg1 =
+ vshlq_s16(vreinterpretq_s16_u16(sum_hi[0]), neg_norm_factor);
+
+ vst1q_s16(hbuf + w, avg0);
+ vst1q_s16(hbuf + w + 8, avg1);
+ w += 16;
+ } while (w < width);
+}
+
+void aom_int_pro_col_neon(int16_t *vbuf, const uint8_t *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor) {
+ assert(width % 16 == 0);
+ assert(height % 4 == 0);
+
+ const int16x4_t neg_norm_factor = vdup_n_s16(-norm_factor);
+ uint16x8_t sum[4];
+
+ int h = 0;
+ do {
+ sum[0] = vpaddlq_u8(vld1q_u8(ref + 0 * ref_stride));
+ sum[1] = vpaddlq_u8(vld1q_u8(ref + 1 * ref_stride));
+ sum[2] = vpaddlq_u8(vld1q_u8(ref + 2 * ref_stride));
+ sum[3] = vpaddlq_u8(vld1q_u8(ref + 3 * ref_stride));
+
+ for (int w = 16; w < width; w += 16) {
+ sum[0] = vpadalq_u8(sum[0], vld1q_u8(ref + 0 * ref_stride + w));
+ sum[1] = vpadalq_u8(sum[1], vld1q_u8(ref + 1 * ref_stride + w));
+ sum[2] = vpadalq_u8(sum[2], vld1q_u8(ref + 2 * ref_stride + w));
+ sum[3] = vpadalq_u8(sum[3], vld1q_u8(ref + 3 * ref_stride + w));
+ }
+
+ uint16x4_t sum_4d = vmovn_u32(horizontal_add_4d_u16x8(sum));
+ int16x4_t avg = vshl_s16(vreinterpret_s16_u16(sum_4d), neg_norm_factor);
+ vst1_s16(vbuf + h, avg);
+
+ ref += 4 * ref_stride;
+ h += 4;
+ } while (h < height);
+}
+
+// coeff: 20 bits, dynamic range [-524287, 524287].
+// length: value range {16, 32, 64, 128, 256, 512, 1024}.
+int aom_satd_neon(const tran_low_t *coeff, int length) {
+ const int32x4_t zero = vdupq_n_s32(0);
+
+ int32x4_t s0 = vld1q_s32(&coeff[0]);
+ int32x4_t s1 = vld1q_s32(&coeff[4]);
+ int32x4_t s2 = vld1q_s32(&coeff[8]);
+ int32x4_t s3 = vld1q_s32(&coeff[12]);
+
+ int32x4_t accum0 = vabsq_s32(s0);
+ int32x4_t accum1 = vabsq_s32(s2);
+ accum0 = vabaq_s32(accum0, s1, zero);
+ accum1 = vabaq_s32(accum1, s3, zero);
+
+ length -= 16;
+ coeff += 16;
+
+ while (length != 0) {
+ s0 = vld1q_s32(&coeff[0]);
+ s1 = vld1q_s32(&coeff[4]);
+ s2 = vld1q_s32(&coeff[8]);
+ s3 = vld1q_s32(&coeff[12]);
+
+ accum0 = vabaq_s32(accum0, s0, zero);
+ accum1 = vabaq_s32(accum1, s1, zero);
+ accum0 = vabaq_s32(accum0, s2, zero);
+ accum1 = vabaq_s32(accum1, s3, zero);
+
+ length -= 16;
+ coeff += 16;
+ }
+
+ // satd: 30 bits, dynamic range [-524287 * 1024, 524287 * 1024]
+ return horizontal_add_s32x4(vaddq_s32(accum0, accum1));
+}
+
+int aom_vector_var_neon(const int16_t *ref, const int16_t *src, int bwl) {
+ assert(bwl >= 2 && bwl <= 5);
+ int width = 4 << bwl;
+
+ int16x8_t r = vld1q_s16(ref);
+ int16x8_t s = vld1q_s16(src);
+
+ // diff: dynamic range [-510, 510] 10 (signed) bits.
+ int16x8_t diff = vsubq_s16(r, s);
+ // v_mean: dynamic range 16 * diff -> [-8160, 8160], 14 (signed) bits.
+ int16x8_t v_mean = diff;
+ // v_sse: dynamic range 2 * 16 * diff^2 -> [0, 8,323,200], 24 (signed) bits.
+ int32x4_t v_sse[2];
+ v_sse[0] = vmull_s16(vget_low_s16(diff), vget_low_s16(diff));
+ v_sse[1] = vmull_s16(vget_high_s16(diff), vget_high_s16(diff));
+
+ ref += 8;
+ src += 8;
+ width -= 8;
+
+ do {
+ r = vld1q_s16(ref);
+ s = vld1q_s16(src);
+
+ diff = vsubq_s16(r, s);
+ v_mean = vaddq_s16(v_mean, diff);
+
+ v_sse[0] = vmlal_s16(v_sse[0], vget_low_s16(diff), vget_low_s16(diff));
+ v_sse[1] = vmlal_s16(v_sse[1], vget_high_s16(diff), vget_high_s16(diff));
+
+ ref += 8;
+ src += 8;
+ width -= 8;
+ } while (width != 0);
+
+ // Dynamic range [0, 65280], 16 (unsigned) bits.
+ const uint32_t mean_abs = abs(horizontal_add_s16x8(v_mean));
+ const int32_t sse = horizontal_add_s32x4(vaddq_s32(v_sse[0], v_sse[1]));
+
+ // (mean_abs * mean_abs): dynamic range 32 (unsigned) bits.
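+  // The result is sse - sum^2 / N, i.e. N times the variance of the
+  // differences, where N = 4 << bwl elements.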
+ return sse - ((mean_abs * mean_abs) >> (bwl + 2));
+}
+
+void aom_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int *min, int *max) {
+ // Load and concatenate.
+ const uint8x16_t a01 = load_u8_8x2(a + 0 * a_stride, a_stride);
+ const uint8x16_t a23 = load_u8_8x2(a + 2 * a_stride, a_stride);
+ const uint8x16_t a45 = load_u8_8x2(a + 4 * a_stride, a_stride);
+ const uint8x16_t a67 = load_u8_8x2(a + 6 * a_stride, a_stride);
+
+ const uint8x16_t b01 = load_u8_8x2(b + 0 * b_stride, b_stride);
+ const uint8x16_t b23 = load_u8_8x2(b + 2 * b_stride, b_stride);
+ const uint8x16_t b45 = load_u8_8x2(b + 4 * b_stride, b_stride);
+ const uint8x16_t b67 = load_u8_8x2(b + 6 * b_stride, b_stride);
+
+ // Absolute difference.
+ const uint8x16_t ab01_diff = vabdq_u8(a01, b01);
+ const uint8x16_t ab23_diff = vabdq_u8(a23, b23);
+ const uint8x16_t ab45_diff = vabdq_u8(a45, b45);
+ const uint8x16_t ab67_diff = vabdq_u8(a67, b67);
+
+ // Max values between the Q vectors.
+ const uint8x16_t ab0123_max = vmaxq_u8(ab01_diff, ab23_diff);
+ const uint8x16_t ab4567_max = vmaxq_u8(ab45_diff, ab67_diff);
+ const uint8x16_t ab0123_min = vminq_u8(ab01_diff, ab23_diff);
+ const uint8x16_t ab4567_min = vminq_u8(ab45_diff, ab67_diff);
+
+ const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max);
+ const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min);
+
+#if AOM_ARCH_AARCH64
+ *min = *max = 0; // Clear high bits
+ *((uint8_t *)max) = vmaxvq_u8(ab07_max);
+ *((uint8_t *)min) = vminvq_u8(ab07_min);
+#else
+ // Split into 64-bit vectors and execute pairwise min/max.
+ uint8x8_t ab_max = vmax_u8(vget_high_u8(ab07_max), vget_low_u8(ab07_max));
+ uint8x8_t ab_min = vmin_u8(vget_high_u8(ab07_min), vget_low_u8(ab07_min));
+
+  // Three rounds of pairwise max/min leave the overall max/min in lane 0.
+ ab_max = vpmax_u8(ab_max, ab_max);
+ ab_min = vpmin_u8(ab_min, ab_min);
+
+ ab_max = vpmax_u8(ab_max, ab_max);
+ ab_min = vpmin_u8(ab_min, ab_min);
+
+ ab_max = vpmax_u8(ab_max, ab_max);
+ ab_min = vpmin_u8(ab_min, ab_min);
+
+ *min = *max = 0; // Clear high bits
+ // Store directly to avoid costly neon->gpr transfer.
+ vst1_lane_u8((uint8_t *)max, ab_max, 0);
+ vst1_lane_u8((uint8_t *)min, ab_min, 0);
+#endif
+}
diff --git a/third_party/aom/aom_dsp/arm/avg_pred_neon.c b/third_party/aom/aom_dsp/arm/avg_pred_neon.c
new file mode 100644
index 0000000000..b17f7fca7f
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/avg_pred_neon.c
@@ -0,0 +1,221 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
+void aom_comp_avg_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride) {
+ if (width > 8) {
+ do {
+ const uint8_t *pred_ptr = pred;
+ const uint8_t *ref_ptr = ref;
+ uint8_t *comp_pred_ptr = comp_pred;
+ int w = width;
+
+ do {
+ const uint8x16_t p = vld1q_u8(pred_ptr);
+ const uint8x16_t r = vld1q_u8(ref_ptr);
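+        // Rounding average per byte: (p + r + 1) >> 1.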
+ const uint8x16_t avg = vrhaddq_u8(p, r);
+
+ vst1q_u8(comp_pred_ptr, avg);
+
+ ref_ptr += 16;
+ pred_ptr += 16;
+ comp_pred_ptr += 16;
+ w -= 16;
+ } while (w != 0);
+
+ ref += ref_stride;
+ pred += width;
+ comp_pred += width;
+ } while (--height != 0);
+ } else if (width == 8) {
+ int h = height / 2;
+
+ do {
+ const uint8x16_t p = vld1q_u8(pred);
+ const uint8x16_t r = load_u8_8x2(ref, ref_stride);
+ const uint8x16_t avg = vrhaddq_u8(p, r);
+
+ vst1q_u8(comp_pred, avg);
+
+ ref += 2 * ref_stride;
+ pred += 16;
+ comp_pred += 16;
+ } while (--h != 0);
+ } else {
+ int h = height / 4;
+ assert(width == 4);
+
+ do {
+ const uint8x16_t p = vld1q_u8(pred);
+ const uint8x16_t r = load_unaligned_u8q(ref, ref_stride);
+ const uint8x16_t avg = vrhaddq_u8(p, r);
+
+ vst1q_u8(comp_pred, avg);
+
+ ref += 4 * ref_stride;
+ pred += 16;
+ comp_pred += 16;
+ } while (--h != 0);
+ }
+}
+
+void aom_dist_wtd_comp_avg_pred_neon(uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, const uint8_t *ref,
+ int ref_stride,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+
+ if (width > 8) {
+ do {
+ const uint8_t *pred_ptr = pred;
+ const uint8_t *ref_ptr = ref;
+ uint8_t *comp_pred_ptr = comp_pred;
+ int w = width;
+
+ do {
+ const uint8x16_t p = vld1q_u8(pred_ptr);
+ const uint8x16_t r = vld1q_u8(ref_ptr);
+
+ const uint8x16_t wtd_avg =
+ dist_wtd_avg_u8x16(r, p, fwd_offset, bck_offset);
+
+ vst1q_u8(comp_pred_ptr, wtd_avg);
+
+ ref_ptr += 16;
+ pred_ptr += 16;
+ comp_pred_ptr += 16;
+ w -= 16;
+ } while (w != 0);
+
+ ref += ref_stride;
+ pred += width;
+ comp_pred += width;
+ } while (--height != 0);
+ } else if (width == 8) {
+ int h = height / 2;
+
+ do {
+ const uint8x16_t p = vld1q_u8(pred);
+ const uint8x16_t r = load_u8_8x2(ref, ref_stride);
+
+ const uint8x16_t wtd_avg =
+ dist_wtd_avg_u8x16(r, p, fwd_offset, bck_offset);
+
+ vst1q_u8(comp_pred, wtd_avg);
+
+ ref += 2 * ref_stride;
+ pred += 16;
+ comp_pred += 16;
+ } while (--h != 0);
+ } else {
+ int h = height / 2;
+ assert(width == 4);
+
+ do {
+ const uint8x8_t p = vld1_u8(pred);
+ const uint8x8_t r = load_unaligned_u8_4x2(ref, ref_stride);
+
+ const uint8x8_t wtd_avg = dist_wtd_avg_u8x8(r, p, vget_low_u8(fwd_offset),
+ vget_low_u8(bck_offset));
+
+ vst1_u8(comp_pred, wtd_avg);
+
+ ref += 2 * ref_stride;
+ pred += 8;
+ comp_pred += 8;
+ } while (--h != 0);
+ }
+}
+
+void aom_comp_mask_pred_neon(uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride,
+ const uint8_t *mask, int mask_stride,
+ int invert_mask) {
+ const uint8_t *src0 = invert_mask ? pred : ref;
+ const uint8_t *src1 = invert_mask ? ref : pred;
+ const int src_stride0 = invert_mask ? width : ref_stride;
+ const int src_stride1 = invert_mask ? ref_stride : width;
+
+ if (width > 8) {
+ do {
+ const uint8_t *src0_ptr = src0;
+ const uint8_t *src1_ptr = src1;
+ const uint8_t *mask_ptr = mask;
+ uint8_t *comp_pred_ptr = comp_pred;
+ int w = width;
+
+ do {
+ const uint8x16_t s0 = vld1q_u8(src0_ptr);
+ const uint8x16_t s1 = vld1q_u8(src1_ptr);
+ const uint8x16_t m0 = vld1q_u8(mask_ptr);
+
+ uint8x16_t blend_u8 = alpha_blend_a64_u8x16(m0, s0, s1);
+
+ vst1q_u8(comp_pred_ptr, blend_u8);
+
+ src0_ptr += 16;
+ src1_ptr += 16;
+ mask_ptr += 16;
+ comp_pred_ptr += 16;
+ w -= 16;
+ } while (w != 0);
+
+ src0 += src_stride0;
+ src1 += src_stride1;
+ mask += mask_stride;
+ comp_pred += width;
+ } while (--height != 0);
+ } else if (width == 8) {
+ do {
+ const uint8x8_t s0 = vld1_u8(src0);
+ const uint8x8_t s1 = vld1_u8(src1);
+ const uint8x8_t m0 = vld1_u8(mask);
+
+ uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, s0, s1);
+
+ vst1_u8(comp_pred, blend_u8);
+
+ src0 += src_stride0;
+ src1 += src_stride1;
+ mask += mask_stride;
+ comp_pred += 8;
+ } while (--height != 0);
+ } else {
+ int h = height / 2;
+ assert(width == 4);
+
+ do {
+ const uint8x8_t s0 = load_unaligned_u8(src0, src_stride0);
+ const uint8x8_t s1 = load_unaligned_u8(src1, src_stride1);
+ const uint8x8_t m0 = load_unaligned_u8(mask, mask_stride);
+
+ uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, s0, s1);
+
+ vst1_u8(comp_pred, blend_u8);
+
+ src0 += 2 * src_stride0;
+ src1 += 2 * src_stride1;
+ mask += 2 * mask_stride;
+ comp_pred += 8;
+ } while (--h != 0);
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/avg_sve.c b/third_party/aom/aom_dsp/arm/avg_sve.c
new file mode 100644
index 0000000000..bbf5a9447c
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/avg_sve.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_ports/mem.h"
+
+int aom_vector_var_sve(const int16_t *ref, const int16_t *src, int bwl) {
+ assert(bwl >= 2 && bwl <= 5);
+ int width = 4 << bwl;
+
+ int64x2_t sse_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+ int16x8_t v_mean[2] = { vdupq_n_s16(0), vdupq_n_s16(0) };
+
+ do {
+ int16x8_t r0 = vld1q_s16(ref);
+ int16x8_t s0 = vld1q_s16(src);
+
+ // diff: dynamic range [-510, 510] 10 (signed) bits.
+ int16x8_t diff0 = vsubq_s16(r0, s0);
+ // v_mean: dynamic range 16 * diff -> [-8160, 8160], 14 (signed) bits.
+ v_mean[0] = vaddq_s16(v_mean[0], diff0);
+
+ // v_sse: dynamic range 2 * 16 * diff^2 -> [0, 8,323,200], 24 (signed) bits.
+ sse_s64[0] = aom_sdotq_s16(sse_s64[0], diff0, diff0);
+
+ int16x8_t r1 = vld1q_s16(ref + 8);
+ int16x8_t s1 = vld1q_s16(src + 8);
+
+ // diff: dynamic range [-510, 510] 10 (signed) bits.
+ int16x8_t diff1 = vsubq_s16(r1, s1);
+ // v_mean: dynamic range 16 * diff -> [-8160, 8160], 14 (signed) bits.
+ v_mean[1] = vaddq_s16(v_mean[1], diff1);
+
+ // v_sse: dynamic range 2 * 16 * diff^2 -> [0, 8,323,200], 24 (signed) bits.
+ sse_s64[1] = aom_sdotq_s16(sse_s64[1], diff1, diff1);
+
+ ref += 16;
+ src += 16;
+ width -= 16;
+ } while (width != 0);
+
+ // Dynamic range [0, 65280], 16 (unsigned) bits.
+ const uint32_t mean_abs = abs(vaddlvq_s16(vaddq_s16(v_mean[0], v_mean[1])));
+ const int64_t sse = vaddvq_s64(vaddq_s64(sse_s64[0], sse_s64[1]));
+
+ // (mean_abs * mean_abs): dynamic range 32 (unsigned) bits.
+ return (int)(sse - ((mean_abs * mean_abs) >> (bwl + 2)));
+}
diff --git a/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c b/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c
new file mode 100644
index 0000000000..1bc3b80310
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/blend_a64_mask_neon.c
@@ -0,0 +1,492 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
+uint8x8_t alpha_blend_a64_d16_u16x8(uint16x8_t m, uint16x8_t a, uint16x8_t b,
+ uint16x8_t round_offset) {
+ const uint16x8_t m_inv = vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m);
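+  /* The weighted blend below computes
+   *   (m * a + (64 - m) * b) >> AOM_BLEND_A64_ROUND_BITS;
+   * the compound round offset is then removed and the result narrowed back
+   * to 8 bits with rounding.
+   */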
+
+ uint32x4_t blend_u32_lo = vmull_u16(vget_low_u16(m), vget_low_u16(a));
+ uint32x4_t blend_u32_hi = vmull_u16(vget_high_u16(m), vget_high_u16(a));
+
+ blend_u32_lo = vmlal_u16(blend_u32_lo, vget_low_u16(m_inv), vget_low_u16(b));
+ blend_u32_hi =
+ vmlal_u16(blend_u32_hi, vget_high_u16(m_inv), vget_high_u16(b));
+
+ uint16x4_t blend_u16_lo = vshrn_n_u32(blend_u32_lo, AOM_BLEND_A64_ROUND_BITS);
+ uint16x4_t blend_u16_hi = vshrn_n_u32(blend_u32_hi, AOM_BLEND_A64_ROUND_BITS);
+
+ uint16x8_t res = vcombine_u16(blend_u16_lo, blend_u16_hi);
+
+ res = vqsubq_u16(res, round_offset);
+
+ return vqrshrn_n_u16(res,
+ 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS);
+}
+
+void aom_lowbd_blend_a64_d16_mask_neon(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+ ConvolveParams *conv_params) {
+ (void)conv_params;
+
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - ROUND0_BITS;
+ const int round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) +
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1));
+ const uint16x8_t offset_vec = vdupq_n_u16(round_offset);
+
+ assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 4);
+ assert(w >= 4);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ if (subw == 0 && subh == 0) {
+ if (w >= 8) {
+ do {
+ int i = 0;
+ do {
+ uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i));
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint8x8_t blend = alpha_blend_a64_d16_u16x8(m0, s0, s1, offset_vec);
+
+ vst1_u8(dst + i, blend);
+ i += 8;
+ } while (i < w);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint16x8_t m0 = vmovl_u8(load_unaligned_u8_4x2(mask, mask_stride));
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint8x8_t blend = alpha_blend_a64_d16_u16x8(m0, s0, s1, offset_vec);
+
+ store_u8x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 2 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else if (subw == 1 && subh == 1) {
+ if (w >= 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + 2 * i);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + 2 * i);
+ uint8x8_t m2 = vld1_u8(mask + 0 * mask_stride + 2 * i + 8);
+ uint8x8_t m3 = vld1_u8(mask + 1 * mask_stride + 2 * i + 8);
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t m_avg =
+ vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
+
+ uint8x8_t blend =
+ alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
+
+ vst1_u8(dst + i, blend);
+ i += 8;
+ } while (i < w);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride);
+ uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
+ uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
+
+ store_u8x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 4 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else if (subw == 1 && subh == 0) {
+ if (w >= 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 2 * i);
+ uint8x8_t m1 = vld1_u8(mask + 2 * i + 8);
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
+ uint8x8_t blend =
+ alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
+
+ vst1_u8(dst + i, blend);
+ i += 8;
+ } while (i < w);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
+ uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
+
+ store_u8x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 2 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else {
+ if (w >= 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + i);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + i);
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0, m1));
+ uint8x8_t blend =
+ alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
+
+ vst1_u8(dst + i, blend);
+ i += 8;
+ } while (i < w);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0_2 =
+ load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride);
+ uint8x8_t m1_3 =
+ load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3));
+ uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec);
+
+ store_u8x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 4 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ }
+}
+
+void aom_blend_a64_mask_neon(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w,
+ int h, int subw, int subh) {
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ if ((subw | subh) == 0) {
+ if (w > 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x16_t m0 = vld1q_u8(mask + i);
+ uint8x16_t s0 = vld1q_u8(src0 + i);
+ uint8x16_t s1 = vld1q_u8(src1 + i);
+
+ uint8x16_t blend = alpha_blend_a64_u8x16(m0, s0, s1);
+
+ vst1q_u8(dst + i, blend);
+ i += 16;
+ } while (i < w);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else if (w == 8) {
+ do {
+ uint8x8_t m0 = vld1_u8(mask);
+ uint8x8_t s0 = vld1_u8(src0);
+ uint8x8_t s1 = vld1_u8(src1);
+
+ uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
+
+ vst1_u8(dst, blend);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0 = load_unaligned_u8_4x2(mask, mask_stride);
+ uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+ uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+ uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1);
+
+ store_u8x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 2 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else if ((subw & subh) == 1) {
+ if (w > 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x16_t m0 = vld1q_u8(mask + 0 * mask_stride + 2 * i);
+ uint8x16_t m1 = vld1q_u8(mask + 1 * mask_stride + 2 * i);
+ uint8x16_t m2 = vld1q_u8(mask + 0 * mask_stride + 2 * i + 16);
+ uint8x16_t m3 = vld1q_u8(mask + 1 * mask_stride + 2 * i + 16);
+ uint8x16_t s0 = vld1q_u8(src0 + i);
+ uint8x16_t s1 = vld1q_u8(src1 + i);
+
+ uint8x16_t m_avg = avg_blend_pairwise_u8x16_4(m0, m1, m2, m3);
+ uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1);
+
+ vst1q_u8(dst + i, blend);
+
+ i += 16;
+ } while (i < w);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else if (w == 8) {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint8x8_t m2 = vld1_u8(mask + 0 * mask_stride + 8);
+ uint8x8_t m3 = vld1_u8(mask + 1 * mask_stride + 8);
+ uint8x8_t s0 = vld1_u8(src0);
+ uint8x8_t s1 = vld1_u8(src1);
+
+ uint8x8_t m_avg = avg_blend_pairwise_u8x8_4(m0, m1, m2, m3);
+ uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+ vst1_u8(dst, blend);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride);
+ uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride);
+ uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+ uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+ uint8x8_t m_avg = avg_blend_pairwise_u8x8_4(m0, m1, m2, m3);
+ uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+ store_u8x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 4 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else if (subw == 1 && subh == 0) {
+ if (w > 8) {
+ do {
+ int i = 0;
+
+ do {
+ uint8x16_t m0 = vld1q_u8(mask + 2 * i);
+ uint8x16_t m1 = vld1q_u8(mask + 2 * i + 16);
+ uint8x16_t s0 = vld1q_u8(src0 + i);
+ uint8x16_t s1 = vld1q_u8(src1 + i);
+
+ uint8x16_t m_avg = avg_blend_pairwise_u8x16(m0, m1);
+ uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1);
+
+ vst1q_u8(dst + i, blend);
+
+ i += 16;
+ } while (i < w);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else if (w == 8) {
+ do {
+ uint8x8_t m0 = vld1_u8(mask);
+ uint8x8_t m1 = vld1_u8(mask + 8);
+ uint8x8_t s0 = vld1_u8(src0);
+ uint8x8_t s1 = vld1_u8(src1);
+
+ uint8x8_t m_avg = avg_blend_pairwise_u8x8(m0, m1);
+ uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+ vst1_u8(dst, blend);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+ uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+ uint8x8_t m_avg = avg_blend_pairwise_u8x8(m0, m1);
+ uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+ store_u8x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 2 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else {
+ if (w > 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x16_t m0 = vld1q_u8(mask + 0 * mask_stride + i);
+ uint8x16_t m1 = vld1q_u8(mask + 1 * mask_stride + i);
+ uint8x16_t s0 = vld1q_u8(src0 + i);
+ uint8x16_t s1 = vld1q_u8(src1 + i);
+
+ uint8x16_t m_avg = avg_blend_u8x16(m0, m1);
+ uint8x16_t blend = alpha_blend_a64_u8x16(m_avg, s0, s1);
+
+ vst1q_u8(dst + i, blend);
+
+ i += 16;
+ } while (i < w);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else if (w == 8) {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint8x8_t s0 = vld1_u8(src0);
+ uint8x8_t s1 = vld1_u8(src1);
+
+ uint8x8_t m_avg = avg_blend_u8x8(m0, m1);
+ uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+ vst1_u8(dst, blend);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0_2 =
+ load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride);
+ uint8x8_t m1_3 =
+ load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride);
+ uint8x8_t s0 = load_unaligned_u8_4x2(src0, src0_stride);
+ uint8x8_t s1 = load_unaligned_u8_4x2(src1, src1_stride);
+
+ uint8x8_t m_avg = avg_blend_u8x8(m0_2, m1_3);
+ uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1);
+
+ store_u8x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 4 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ }
+}
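
For reference, the per-pixel computation that alpha_blend_a64_d16_u16x8 and
aom_lowbd_blend_a64_d16_mask_neon vectorize can be sketched in scalar C as
below. This is an illustration only: the helper name is invented, and the
numeric values of FILTER_BITS, ROUND0_BITS and COMPOUND_ROUND1_BITS (7, 3 and
7) are assumptions matching the usual library configuration for the 8-bit path.

#include <stdint.h>

// Scalar sketch of one 8-bit output pixel of the d16 (compound-domain) blend:
// a 64-point alpha blend with a truncating shift, saturating removal of the
// compound round_offset, then a rounding, saturating narrow to 8 bits.
static uint8_t blend_a64_d16_pixel(uint8_t m, uint16_t s0, uint16_t s1) {
  const int filter_bits = 7;           // assumed FILTER_BITS
  const int round0_bits = 3;           // assumed ROUND0_BITS
  const int compound_round1_bits = 7;  // assumed COMPOUND_ROUND1_BITS
  const int bd = 8;
  const int offset_bits = bd + 2 * filter_bits - round0_bits;
  const int round_offset = (1 << (offset_bits - compound_round1_bits)) +
                           (1 << (offset_bits - compound_round1_bits - 1));
  const int shift = 2 * filter_bits - round0_bits - compound_round1_bits;

  int32_t blend = (m * s0 + (64 - m) * s1) >> 6;  // vshrn by A64 round bits
  int32_t res = blend - round_offset;             // vqsubq_u16 clamps at zero
  if (res < 0) res = 0;
  res = (res + (1 << (shift - 1))) >> shift;      // vqrshrn_n_u16
  return (uint8_t)(res > 255 ? 255 : res);
}
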
diff --git a/third_party/aom/aom_dsp/arm/blend_neon.h b/third_party/aom/aom_dsp/arm/blend_neon.h
new file mode 100644
index 0000000000..c8a03224e4
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/blend_neon.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_BLEND_NEON_H_
+#define AOM_AOM_DSP_ARM_BLEND_NEON_H_
+
+#include <arm_neon.h>
+
+#include "aom_dsp/blend.h"
+
+static INLINE uint8x16_t alpha_blend_a64_u8x16(uint8x16_t m, uint8x16_t a,
+ uint8x16_t b) {
+ const uint8x16_t m_inv = vsubq_u8(vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA), m);
+
+ uint16x8_t blend_u16_lo = vmull_u8(vget_low_u8(m), vget_low_u8(a));
+ uint16x8_t blend_u16_hi = vmull_u8(vget_high_u8(m), vget_high_u8(a));
+
+ blend_u16_lo = vmlal_u8(blend_u16_lo, vget_low_u8(m_inv), vget_low_u8(b));
+ blend_u16_hi = vmlal_u8(blend_u16_hi, vget_high_u8(m_inv), vget_high_u8(b));
+
+ uint8x8_t blend_u8_lo = vrshrn_n_u16(blend_u16_lo, AOM_BLEND_A64_ROUND_BITS);
+ uint8x8_t blend_u8_hi = vrshrn_n_u16(blend_u16_hi, AOM_BLEND_A64_ROUND_BITS);
+
+ return vcombine_u8(blend_u8_lo, blend_u8_hi);
+}
+
+static INLINE uint8x8_t alpha_blend_a64_u8x8(uint8x8_t m, uint8x8_t a,
+ uint8x8_t b) {
+ const uint8x8_t m_inv = vsub_u8(vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA), m);
+
+ uint16x8_t blend_u16 = vmull_u8(m, a);
+
+ blend_u16 = vmlal_u8(blend_u16, m_inv, b);
+
+ return vrshrn_n_u16(blend_u16, AOM_BLEND_A64_ROUND_BITS);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE uint16x8_t alpha_blend_a64_u16x8(uint16x8_t m, uint16x8_t a,
+ uint16x8_t b) {
+ uint16x8_t m_inv = vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m);
+
+ uint32x4_t blend_u32_lo = vmull_u16(vget_low_u16(a), vget_low_u16(m));
+ uint32x4_t blend_u32_hi = vmull_u16(vget_high_u16(a), vget_high_u16(m));
+
+ blend_u32_lo = vmlal_u16(blend_u32_lo, vget_low_u16(b), vget_low_u16(m_inv));
+ blend_u32_hi =
+ vmlal_u16(blend_u32_hi, vget_high_u16(b), vget_high_u16(m_inv));
+
+ uint16x4_t blend_u16_lo =
+ vrshrn_n_u32(blend_u32_lo, AOM_BLEND_A64_ROUND_BITS);
+ uint16x4_t blend_u16_hi =
+ vrshrn_n_u32(blend_u32_hi, AOM_BLEND_A64_ROUND_BITS);
+
+ return vcombine_u16(blend_u16_lo, blend_u16_hi);
+}
+
+static INLINE uint16x4_t alpha_blend_a64_u16x4(uint16x4_t m, uint16x4_t a,
+ uint16x4_t b) {
+ const uint16x4_t m_inv = vsub_u16(vdup_n_u16(AOM_BLEND_A64_MAX_ALPHA), m);
+
+ uint32x4_t blend_u16 = vmull_u16(m, a);
+
+ blend_u16 = vmlal_u16(blend_u16, m_inv, b);
+
+ return vrshrn_n_u32(blend_u16, AOM_BLEND_A64_ROUND_BITS);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+static INLINE uint8x8_t avg_blend_u8x8(uint8x8_t a, uint8x8_t b) {
+ return vrhadd_u8(a, b);
+}
+
+static INLINE uint8x16_t avg_blend_u8x16(uint8x16_t a, uint8x16_t b) {
+ return vrhaddq_u8(a, b);
+}
+
+static INLINE uint8x8_t avg_blend_pairwise_u8x8(uint8x8_t a, uint8x8_t b) {
+ return vrshr_n_u8(vpadd_u8(a, b), 1);
+}
+
+static INLINE uint8x16_t avg_blend_pairwise_u8x16(uint8x16_t a, uint8x16_t b) {
+#if AOM_ARCH_AARCH64
+ return vrshrq_n_u8(vpaddq_u8(a, b), 1);
+#else
+ uint8x8_t sum_pairwise_a = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
+ uint8x8_t sum_pairwise_b = vpadd_u8(vget_low_u8(b), vget_high_u8(b));
+ return vrshrq_n_u8(vcombine_u8(sum_pairwise_a, sum_pairwise_b), 1);
+#endif // AOM_ARCH_AARCH64
+}
+
+static INLINE uint8x8_t avg_blend_pairwise_u8x8_4(uint8x8_t a, uint8x8_t b,
+ uint8x8_t c, uint8x8_t d) {
+ uint8x8_t a_c = vpadd_u8(a, c);
+ uint8x8_t b_d = vpadd_u8(b, d);
+ return vrshr_n_u8(vqadd_u8(a_c, b_d), 2);
+}
+
+static INLINE uint8x16_t avg_blend_pairwise_u8x16_4(uint8x16_t a, uint8x16_t b,
+ uint8x16_t c,
+ uint8x16_t d) {
+#if AOM_ARCH_AARCH64
+ uint8x16_t a_c = vpaddq_u8(a, c);
+ uint8x16_t b_d = vpaddq_u8(b, d);
+ return vrshrq_n_u8(vqaddq_u8(a_c, b_d), 2);
+#else
+ uint8x8_t sum_pairwise_a = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
+ uint8x8_t sum_pairwise_b = vpadd_u8(vget_low_u8(b), vget_high_u8(b));
+ uint8x8_t sum_pairwise_c = vpadd_u8(vget_low_u8(c), vget_high_u8(c));
+ uint8x8_t sum_pairwise_d = vpadd_u8(vget_low_u8(d), vget_high_u8(d));
+ uint8x16_t a_c = vcombine_u8(sum_pairwise_a, sum_pairwise_c);
+ uint8x16_t b_d = vcombine_u8(sum_pairwise_b, sum_pairwise_d);
+ return vrshrq_n_u8(vqaddq_u8(a_c, b_d), 2);
+#endif // AOM_ARCH_AARCH64
+}
+
+#endif // AOM_AOM_DSP_ARM_BLEND_NEON_H_
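
The helpers in this header reduce to the same scalar operation: a 6-bit
fixed-point alpha blend with mask weights in [0, 64]. A minimal scalar sketch,
assuming the usual definitions AOM_BLEND_A64_MAX_ALPHA = 64 and
AOM_BLEND_A64_ROUND_BITS = 6 (the function name is illustrative only):

#include <stdint.h>
#include <stdio.h>

// out = (m * a + (64 - m) * b + 32) >> 6, i.e. a rounded weighted average
// where m = 64 selects a, m = 0 selects b and m = 32 averages the two.
static uint8_t blend_a64_scalar(uint8_t m, uint8_t a, uint8_t b) {
  return (uint8_t)((m * a + (64 - m) * b + 32) >> 6);
}

int main(void) {
  // Prints "200 10 105" for the three mask values described above.
  printf("%d %d %d\n", blend_a64_scalar(64, 200, 10),
         blend_a64_scalar(0, 200, 10), blend_a64_scalar(32, 200, 10));
  return 0;
}

The avg_blend_* and avg_blend_pairwise_* helpers above only differ in how they
downsample the mask (vertically, horizontally, or both) before this blend is
applied.
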
diff --git a/third_party/aom/aom_dsp/arm/blk_sse_sum_neon.c b/third_party/aom/aom_dsp/arm/blk_sse_sum_neon.c
new file mode 100644
index 0000000000..f2ada93e95
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/blk_sse_sum_neon.c
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE void get_blk_sse_sum_4xh_neon(const int16_t *data, int stride,
+ int bh, int *x_sum,
+ int64_t *x2_sum) {
+ int i = bh;
+ int32x4_t sum = vdupq_n_s32(0);
+ int32x4_t sse = vdupq_n_s32(0);
+
+ do {
+ int16x8_t d = vcombine_s16(vld1_s16(data), vld1_s16(data + stride));
+
+ sum = vpadalq_s16(sum, d);
+
+ sse = vmlal_s16(sse, vget_low_s16(d), vget_low_s16(d));
+ sse = vmlal_s16(sse, vget_high_s16(d), vget_high_s16(d));
+
+ data += 2 * stride;
+ i -= 2;
+ } while (i != 0);
+
+ *x_sum = horizontal_add_s32x4(sum);
+ *x2_sum = horizontal_long_add_s32x4(sse);
+}
+
+static INLINE void get_blk_sse_sum_8xh_neon(const int16_t *data, int stride,
+ int bh, int *x_sum,
+ int64_t *x2_sum) {
+ int i = bh;
+ int32x4_t sum = vdupq_n_s32(0);
+ int32x4_t sse = vdupq_n_s32(0);
+
+  // The input is 12 bits wide, so we can accumulate up to 127 squared elements
+  // in a signed 32-bit lane. Since we accumulate into an int32x4_t and the
+  // maximum value for bh is 32, we don't have to worry about sse overflowing.
+
+ do {
+ int16x8_t d = vld1q_s16(data);
+
+ sum = vpadalq_s16(sum, d);
+
+ sse = vmlal_s16(sse, vget_low_s16(d), vget_low_s16(d));
+ sse = vmlal_s16(sse, vget_high_s16(d), vget_high_s16(d));
+
+ data += stride;
+ } while (--i != 0);
+
+ *x_sum = horizontal_add_s32x4(sum);
+ *x2_sum = horizontal_long_add_s32x4(sse);
+}
+
+static INLINE void get_blk_sse_sum_large_neon(const int16_t *data, int stride,
+ int bw, int bh, int *x_sum,
+ int64_t *x2_sum) {
+ int32x4_t sum = vdupq_n_s32(0);
+ int64x2_t sse = vdupq_n_s64(0);
+
+  // The input is 12 bits wide, so we can accumulate up to 127 squared elements
+  // in a signed 32-bit lane. Since we accumulate into an int32x4_t vector,
+  // that means we can process up to (127 * 4) / bw rows before we need to
+  // widen to 64 bits.
+
+ int i_limit = (127 * 4) / bw;
+ int i_tmp = bh > i_limit ? i_limit : bh;
+
+ int i = 0;
+ do {
+ int32x4_t sse_s32 = vdupq_n_s32(0);
+ do {
+ int j = bw;
+ const int16_t *data_ptr = data;
+ do {
+ int16x8_t d = vld1q_s16(data_ptr);
+
+ sum = vpadalq_s16(sum, d);
+
+ sse_s32 = vmlal_s16(sse_s32, vget_low_s16(d), vget_low_s16(d));
+ sse_s32 = vmlal_s16(sse_s32, vget_high_s16(d), vget_high_s16(d));
+
+ data_ptr += 8;
+ j -= 8;
+ } while (j != 0);
+
+ data += stride;
+ i++;
+ } while (i < i_tmp && i < bh);
+
+ sse = vpadalq_s32(sse, sse_s32);
+ i_tmp += i_limit;
+ } while (i < bh);
+
+ *x_sum = horizontal_add_s32x4(sum);
+ *x2_sum = horizontal_add_s64x2(sse);
+}
+
+void aom_get_blk_sse_sum_neon(const int16_t *data, int stride, int bw, int bh,
+ int *x_sum, int64_t *x2_sum) {
+ if (bw == 4) {
+ get_blk_sse_sum_4xh_neon(data, stride, bh, x_sum, x2_sum);
+ } else if (bw == 8) {
+ get_blk_sse_sum_8xh_neon(data, stride, bh, x_sum, x2_sum);
+ } else {
+ assert(bw % 8 == 0);
+ get_blk_sse_sum_large_neon(data, stride, bw, bh, x_sum, x2_sum);
+ }
+}
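
For reference, a scalar model of what these kernels compute (a sketch that
mirrors the vpadalq_s16 / vmlal_s16 accumulation above; the function name is
invented):

#include <stdint.h>

// Accumulate the sum and the sum of squares of a bw x bh block of 16-bit
// residuals, returned through x_sum and x2_sum respectively.
static void get_blk_sse_sum_scalar(const int16_t *data, int stride, int bw,
                                   int bh, int *x_sum, int64_t *x2_sum) {
  int sum = 0;
  int64_t sse = 0;
  for (int r = 0; r < bh; ++r) {
    for (int c = 0; c < bw; ++c) {
      const int v = data[r * stride + c];
      sum += v;
      sse += (int64_t)v * v;
    }
  }
  *x_sum = sum;
  *x2_sum = sse;
}
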
diff --git a/third_party/aom/aom_dsp/arm/blk_sse_sum_sve.c b/third_party/aom/aom_dsp/arm/blk_sse_sum_sve.c
new file mode 100644
index 0000000000..18bdc5dbfe
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/blk_sse_sum_sve.c
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/mem_neon.h"
+
+static INLINE void get_blk_sse_sum_4xh_sve(const int16_t *data, int stride,
+ int bh, int *x_sum,
+ int64_t *x2_sum) {
+ int32x4_t sum = vdupq_n_s32(0);
+ int64x2_t sse = vdupq_n_s64(0);
+
+ do {
+ int16x8_t d = vcombine_s16(vld1_s16(data), vld1_s16(data + stride));
+
+ sum = vpadalq_s16(sum, d);
+
+ sse = aom_sdotq_s16(sse, d, d);
+
+ data += 2 * stride;
+ bh -= 2;
+ } while (bh != 0);
+
+ *x_sum = vaddvq_s32(sum);
+ *x2_sum = vaddvq_s64(sse);
+}
+
+static INLINE void get_blk_sse_sum_8xh_sve(const int16_t *data, int stride,
+ int bh, int *x_sum,
+ int64_t *x2_sum) {
+ int32x4_t sum[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+ do {
+ int16x8_t d0 = vld1q_s16(data);
+ int16x8_t d1 = vld1q_s16(data + stride);
+
+ sum[0] = vpadalq_s16(sum[0], d0);
+ sum[1] = vpadalq_s16(sum[1], d1);
+
+ sse[0] = aom_sdotq_s16(sse[0], d0, d0);
+ sse[1] = aom_sdotq_s16(sse[1], d1, d1);
+
+ data += 2 * stride;
+ bh -= 2;
+ } while (bh != 0);
+
+ *x_sum = vaddvq_s32(vaddq_s32(sum[0], sum[1]));
+ *x2_sum = vaddvq_s64(vaddq_s64(sse[0], sse[1]));
+}
+
+static INLINE void get_blk_sse_sum_large_sve(const int16_t *data, int stride,
+ int bw, int bh, int *x_sum,
+ int64_t *x2_sum) {
+ int32x4_t sum[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+ do {
+ int j = bw;
+ const int16_t *data_ptr = data;
+ do {
+ int16x8_t d0 = vld1q_s16(data_ptr);
+ int16x8_t d1 = vld1q_s16(data_ptr + 8);
+
+ sum[0] = vpadalq_s16(sum[0], d0);
+ sum[1] = vpadalq_s16(sum[1], d1);
+
+ sse[0] = aom_sdotq_s16(sse[0], d0, d0);
+ sse[1] = aom_sdotq_s16(sse[1], d1, d1);
+
+ data_ptr += 16;
+ j -= 16;
+ } while (j != 0);
+
+ data += stride;
+ } while (--bh != 0);
+
+ *x_sum = vaddvq_s32(vaddq_s32(sum[0], sum[1]));
+ *x2_sum = vaddvq_s64(vaddq_s64(sse[0], sse[1]));
+}
+
+void aom_get_blk_sse_sum_sve(const int16_t *data, int stride, int bw, int bh,
+ int *x_sum, int64_t *x2_sum) {
+ if (bw == 4) {
+ get_blk_sse_sum_4xh_sve(data, stride, bh, x_sum, x2_sum);
+ } else if (bw == 8) {
+ get_blk_sse_sum_8xh_sve(data, stride, bh, x_sum, x2_sum);
+ } else {
+ assert(bw % 16 == 0);
+ get_blk_sse_sum_large_sve(data, stride, bw, bh, x_sum, x2_sum);
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/dist_wtd_avg_neon.h b/third_party/aom/aom_dsp/arm/dist_wtd_avg_neon.h
new file mode 100644
index 0000000000..19c9b04c57
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/dist_wtd_avg_neon.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_DIST_WTD_AVG_NEON_H_
+#define AOM_AOM_DSP_ARM_DIST_WTD_AVG_NEON_H_
+
+#include <arm_neon.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "av1/common/enums.h"
+
+static INLINE uint8x8_t dist_wtd_avg_u8x8(uint8x8_t a, uint8x8_t b,
+ uint8x8_t wta, uint8x8_t wtb) {
+ uint16x8_t wtd_sum = vmull_u8(a, wta);
+
+ wtd_sum = vmlal_u8(wtd_sum, b, wtb);
+
+ return vrshrn_n_u16(wtd_sum, DIST_PRECISION_BITS);
+}
+
+static INLINE uint16x4_t dist_wtd_avg_u16x4(uint16x4_t a, uint16x4_t b,
+ uint16x4_t wta, uint16x4_t wtb) {
+ uint32x4_t wtd_sum = vmull_u16(a, wta);
+
+ wtd_sum = vmlal_u16(wtd_sum, b, wtb);
+
+ return vrshrn_n_u32(wtd_sum, DIST_PRECISION_BITS);
+}
+
+static INLINE uint8x16_t dist_wtd_avg_u8x16(uint8x16_t a, uint8x16_t b,
+ uint8x16_t wta, uint8x16_t wtb) {
+ uint16x8_t wtd_sum_lo = vmull_u8(vget_low_u8(a), vget_low_u8(wta));
+ uint16x8_t wtd_sum_hi = vmull_u8(vget_high_u8(a), vget_high_u8(wta));
+
+ wtd_sum_lo = vmlal_u8(wtd_sum_lo, vget_low_u8(b), vget_low_u8(wtb));
+ wtd_sum_hi = vmlal_u8(wtd_sum_hi, vget_high_u8(b), vget_high_u8(wtb));
+
+ uint8x8_t wtd_avg_lo = vrshrn_n_u16(wtd_sum_lo, DIST_PRECISION_BITS);
+ uint8x8_t wtd_avg_hi = vrshrn_n_u16(wtd_sum_hi, DIST_PRECISION_BITS);
+
+ return vcombine_u8(wtd_avg_lo, wtd_avg_hi);
+}
+
+static INLINE uint16x8_t dist_wtd_avg_u16x8(uint16x8_t a, uint16x8_t b,
+ uint16x8_t wta, uint16x8_t wtb) {
+ uint32x4_t wtd_sum_lo = vmull_u16(vget_low_u16(a), vget_low_u16(wta));
+ uint32x4_t wtd_sum_hi = vmull_u16(vget_high_u16(a), vget_high_u16(wta));
+
+ wtd_sum_lo = vmlal_u16(wtd_sum_lo, vget_low_u16(b), vget_low_u16(wtb));
+ wtd_sum_hi = vmlal_u16(wtd_sum_hi, vget_high_u16(b), vget_high_u16(wtb));
+
+ uint16x4_t wtd_avg_lo = vrshrn_n_u32(wtd_sum_lo, DIST_PRECISION_BITS);
+ uint16x4_t wtd_avg_hi = vrshrn_n_u32(wtd_sum_hi, DIST_PRECISION_BITS);
+
+ return vcombine_u16(wtd_avg_lo, wtd_avg_hi);
+}
+
+#endif // AOM_AOM_DSP_ARM_DIST_WTD_AVG_NEON_H_
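
In scalar terms each helper above computes
out = (a * wta + b * wtb + (1 << (DIST_PRECISION_BITS - 1))) >> DIST_PRECISION_BITS,
a rounded weighted average where the two weights are expected to sum to
1 << DIST_PRECISION_BITS. A minimal sketch; the value 4 used below for
DIST_PRECISION_BITS is an assumption about the library configuration, not
something stated in this patch:

#include <stdint.h>

static uint8_t dist_wtd_avg_scalar(uint8_t a, uint8_t b, unsigned wta,
                                   unsigned wtb) {
  const unsigned bits = 4;  // assumed DIST_PRECISION_BITS
  // Rounding shift matches the vrshrn_n_* narrowing in the Neon helpers.
  return (uint8_t)((a * wta + b * wtb + (1u << (bits - 1))) >> bits);
}
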
diff --git a/third_party/aom/aom_dsp/arm/dot_sve.h b/third_party/aom/aom_dsp/arm/dot_sve.h
new file mode 100644
index 0000000000..cf49f23606
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/dot_sve.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_DOT_SVE_H_
+#define AOM_AOM_DSP_ARM_DOT_SVE_H_
+
+#include <arm_neon_sve_bridge.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+// Dot product instructions operating on 16-bit input elements are exclusive to
+// the SVE instruction set. However, we can access these instructions from a
+// predominantly Neon context by making use of the Neon-SVE bridge intrinsics
+// to reinterpret Neon vectors as SVE vectors - with the high part of the SVE
+// vector (if it's longer than 128 bits) being "don't care".
+
+// While sub-optimal on machines that have SVE vector length > 128-bit - as the
+// remainder of the vector is unused - this approach is still beneficial when
+// compared to a Neon-only solution.
+
+static INLINE uint64x2_t aom_udotq_u16(uint64x2_t acc, uint16x8_t x,
+ uint16x8_t y) {
+ return svget_neonq_u64(svdot_u64(svset_neonq_u64(svundef_u64(), acc),
+ svset_neonq_u16(svundef_u16(), x),
+ svset_neonq_u16(svundef_u16(), y)));
+}
+
+static INLINE int64x2_t aom_sdotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y) {
+ return svget_neonq_s64(svdot_s64(svset_neonq_s64(svundef_s64(), acc),
+ svset_neonq_s16(svundef_s16(), x),
+ svset_neonq_s16(svundef_s16(), y)));
+}
+
+#endif // AOM_AOM_DSP_ARM_DOT_SVE_H_
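
A usage sketch of these bridge helpers (not part of the library): accumulating
a 64-bit sum of squares of 16-bit samples with aom_sdotq_s16, much as the SVE
kernels elsewhere in this patch do. The function name is invented, n is
assumed to be a multiple of 8, and the translation unit must be built for an
SVE-capable target so that dot_sve.h and the config headers it includes
resolve.

#include <arm_neon.h>

#include "aom_dsp/arm/dot_sve.h"

static int64_t sum_of_squares_sve(const int16_t *src, int n) {
  int64x2_t acc = vdupq_n_s64(0);
  for (int i = 0; i < n; i += 8) {
    const int16x8_t s = vld1q_s16(src + i);
    // Each call adds four 16x16-bit products into each 64-bit accumulator
    // lane, replacing a chain of widening multiply-accumulates.
    acc = aom_sdotq_s16(acc, s, s);
  }
  return vaddvq_s64(acc);
}
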
diff --git a/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c b/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c
new file mode 100644
index 0000000000..a4d6322f24
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/fwd_txfm_neon.c
@@ -0,0 +1,304 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+
+static void aom_fdct4x4_helper(const int16_t *input, int stride,
+ int16x4_t *input_0, int16x4_t *input_1,
+ int16x4_t *input_2, int16x4_t *input_3) {
+ *input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
+ *input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4);
+ *input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4);
+ *input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4);
+ // If the very first value != 0, then add 1.
+ if (input[0] != 0) {
+ const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1));
+ *input_0 = vadd_s16(*input_0, one);
+ }
+
+ for (int i = 0; i < 2; ++i) {
+ const int16x8_t input_01 = vcombine_s16(*input_0, *input_1);
+ const int16x8_t input_32 = vcombine_s16(*input_3, *input_2);
+
+ // in_0 +/- in_3, in_1 +/- in_2
+ const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+ const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+ // step_0 +/- step_1, step_2 +/- step_3
+ const int16x4_t s_0 = vget_low_s16(s_01);
+ const int16x4_t s_1 = vget_high_s16(s_01);
+ const int16x4_t s_2 = vget_high_s16(s_32);
+ const int16x4_t s_3 = vget_low_s16(s_32);
+
+ // (s_0 +/- s_1) * cospi_16_64
+ // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
+ const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
+ const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
+ const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, (int32_t)cospi_16_64);
+ const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, (int32_t)cospi_16_64);
+
+ // fdct_round_shift
+ int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
+ int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS);
+
+ // s_3 * cospi_8_64 + s_2 * cospi_24_64
+ // s_3 * cospi_24_64 - s_2 * cospi_8_64
+ const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, (int32_t)cospi_8_64);
+ const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, (int32_t)cospi_24_64);
+
+ const int32x4_t temp3 =
+ vmlal_n_s16(s_3_cospi_8_64, s_2, (int32_t)cospi_24_64);
+ const int32x4_t temp4 =
+ vmlsl_n_s16(s_3_cospi_24_64, s_2, (int32_t)cospi_8_64);
+
+ // fdct_round_shift
+ int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
+ int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS);
+
+ // Only transpose the first pass
+ if (i == 0) {
+ transpose_elems_inplace_s16_4x4(&out_0, &out_1, &out_2, &out_3);
+ }
+
+ *input_0 = out_0;
+ *input_1 = out_1;
+ *input_2 = out_2;
+ *input_3 = out_3;
+ }
+}
+
+void aom_fdct4x4_neon(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+ // input[M * stride] * 16
+ int16x4_t input_0, input_1, input_2, input_3;
+
+ aom_fdct4x4_helper(input, stride, &input_0, &input_1, &input_2, &input_3);
+
+ // Not quite a rounding shift. Only add 1 despite shifting by 2.
+ const int16x8_t one = vdupq_n_s16(1);
+ int16x8_t out_01 = vcombine_s16(input_0, input_1);
+ int16x8_t out_23 = vcombine_s16(input_2, input_3);
+ out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2);
+ out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2);
+ store_s16q_to_tran_low(final_output + 0 * 8, out_01);
+ store_s16q_to_tran_low(final_output + 1 * 8, out_23);
+}
+
+void aom_fdct4x4_lp_neon(const int16_t *input, int16_t *final_output,
+ int stride) {
+ // input[M * stride] * 16
+ int16x4_t input_0, input_1, input_2, input_3;
+
+ aom_fdct4x4_helper(input, stride, &input_0, &input_1, &input_2, &input_3);
+
+ // Not quite a rounding shift. Only add 1 despite shifting by 2.
+ const int16x8_t one = vdupq_n_s16(1);
+ int16x8_t out_01 = vcombine_s16(input_0, input_1);
+ int16x8_t out_23 = vcombine_s16(input_2, input_3);
+ out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2);
+ out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2);
+ vst1q_s16(final_output + 0 * 8, out_01);
+ vst1q_s16(final_output + 1 * 8, out_23);
+}
+
+void aom_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
+ // stage 1
+ int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
+ int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
+ int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
+ int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
+ int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
+ int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
+ int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
+ int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
+ for (int i = 0; i < 2; ++i) {
+ int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7;
+ const int16x8_t v_s0 = vaddq_s16(input_0, input_7);
+ const int16x8_t v_s1 = vaddq_s16(input_1, input_6);
+ const int16x8_t v_s2 = vaddq_s16(input_2, input_5);
+ const int16x8_t v_s3 = vaddq_s16(input_3, input_4);
+ const int16x8_t v_s4 = vsubq_s16(input_3, input_4);
+ const int16x8_t v_s5 = vsubq_s16(input_2, input_5);
+ const int16x8_t v_s6 = vsubq_s16(input_1, input_6);
+ const int16x8_t v_s7 = vsubq_s16(input_0, input_7);
+ // fdct4(step, step);
+ int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
+ int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
+ int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
+ int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
+ // fdct4(step, step);
+ int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+ int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+ int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+ int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+ int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64);
+ int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64);
+ int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64);
+ int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64);
+ v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64);
+ v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64);
+ v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64);
+ v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64);
+ v_t0_lo = vmulq_n_s32(v_t0_lo, (int32_t)cospi_16_64);
+ v_t0_hi = vmulq_n_s32(v_t0_hi, (int32_t)cospi_16_64);
+ v_t1_lo = vmulq_n_s32(v_t1_lo, (int32_t)cospi_16_64);
+ v_t1_hi = vmulq_n_s32(v_t1_hi, (int32_t)cospi_16_64);
+ {
+ const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+ const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+ const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+ const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+ const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+ const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+ const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+ const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+ out_0 = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43
+ out_2 = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63
+ out_4 = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47
+ out_6 = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67
+ }
+ // Stage 2
+ v_x0 = vsubq_s16(v_s6, v_s5);
+ v_x1 = vaddq_s16(v_s6, v_s5);
+ v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64);
+ v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64);
+ v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64);
+ v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64);
+ {
+ const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+ const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+ const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+ const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+ const int16x8_t ab = vcombine_s16(a, b);
+ const int16x8_t cd = vcombine_s16(c, d);
+ // Stage 3
+ v_x0 = vaddq_s16(v_s4, ab);
+ v_x1 = vsubq_s16(v_s4, ab);
+ v_x2 = vsubq_s16(v_s7, cd);
+ v_x3 = vaddq_s16(v_s7, cd);
+ }
+ // Stage 4
+ v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64);
+ v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64);
+ v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64);
+ v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64);
+ v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64);
+ v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64);
+ v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64);
+ v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64);
+ v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64);
+ v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64);
+ v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64);
+ v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64);
+ v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64);
+ v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64);
+ v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64);
+ v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64);
+ {
+ const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+ const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+ const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+ const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+ const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+ const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+ const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+ const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+ out_1 = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53
+ out_3 = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73
+ out_5 = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57
+ out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77
+ }
+ // transpose 8x8
+ {
+ // 00 01 02 03 40 41 42 43
+ // 10 11 12 13 50 51 52 53
+ // 20 21 22 23 60 61 62 63
+ // 30 31 32 33 70 71 72 73
+ // 04 05 06 07 44 45 46 47
+ // 14 15 16 17 54 55 56 57
+ // 24 25 26 27 64 65 66 67
+ // 34 35 36 37 74 75 76 77
+ const int32x4x2_t r02_s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2));
+ const int32x4x2_t r13_s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3));
+ const int32x4x2_t r46_s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6));
+ const int32x4x2_t r57_s32 =
+ vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7));
+ const int16x8x2_t r01_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
+ vreinterpretq_s16_s32(r13_s32.val[0]));
+ const int16x8x2_t r23_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
+ vreinterpretq_s16_s32(r13_s32.val[1]));
+ const int16x8x2_t r45_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
+ vreinterpretq_s16_s32(r57_s32.val[0]));
+ const int16x8x2_t r67_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
+ vreinterpretq_s16_s32(r57_s32.val[1]));
+ input_0 = r01_s16.val[0];
+ input_1 = r01_s16.val[1];
+ input_2 = r23_s16.val[0];
+ input_3 = r23_s16.val[1];
+ input_4 = r45_s16.val[0];
+ input_5 = r45_s16.val[1];
+ input_6 = r67_s16.val[0];
+ input_7 = r67_s16.val[1];
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ }
+ } // for
+ {
+ // from aom_dct_sse2.c
+ // Post-condition (division by two)
+    // Division of a 16-bit signed number by two using shifts:
+ // n / 2 = (n - (n >> 15)) >> 1
+ const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15);
+ const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15);
+ const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15);
+ const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15);
+ const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15);
+ const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15);
+ const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15);
+ const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15);
+ input_0 = vhsubq_s16(input_0, sign_in0);
+ input_1 = vhsubq_s16(input_1, sign_in1);
+ input_2 = vhsubq_s16(input_2, sign_in2);
+ input_3 = vhsubq_s16(input_3, sign_in3);
+ input_4 = vhsubq_s16(input_4, sign_in4);
+ input_5 = vhsubq_s16(input_5, sign_in5);
+ input_6 = vhsubq_s16(input_6, sign_in6);
+ input_7 = vhsubq_s16(input_7, sign_in7);
+ // store results
+ vst1q_s16(&final_output[0 * 8], input_0);
+ vst1q_s16(&final_output[1 * 8], input_1);
+ vst1q_s16(&final_output[2 * 8], input_2);
+ vst1q_s16(&final_output[3 * 8], input_3);
+ vst1q_s16(&final_output[4 * 8], input_4);
+ vst1q_s16(&final_output[5 * 8], input_5);
+ vst1q_s16(&final_output[6 * 8], input_6);
+ vst1q_s16(&final_output[7 * 8], input_7);
+ }
+}
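
The post-conditioning step above halves each 16-bit result using
n / 2 = (n - (n >> 15)) >> 1, which vhsubq_s16 evaluates without intermediate
overflow. A scalar check of that identity (assuming arithmetic right shift of
negative values, as on the targets this code supports):

#include <assert.h>
#include <stdint.h>

static int16_t div2_shift(int16_t n) {
  const int16_t sign = (int16_t)(n >> 15);  // 0 for n >= 0, -1 for n < 0
  return (int16_t)((n - sign) >> 1);        // round-toward-zero division by 2
}

int main(void) {
  // Exhaustively verify the identity over the full int16_t range.
  for (int32_t n = INT16_MIN; n <= INT16_MAX; ++n) {
    assert(div2_shift((int16_t)n) == (int16_t)(n / 2));
  }
  return 0;
}
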
diff --git a/third_party/aom/aom_dsp/arm/hadamard_neon.c b/third_party/aom/aom_dsp/arm/hadamard_neon.c
new file mode 100644
index 0000000000..d0f59227db
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/hadamard_neon.c
@@ -0,0 +1,325 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+
+static INLINE void hadamard_4x4_one_pass(int16x4_t *a0, int16x4_t *a1,
+ int16x4_t *a2, int16x4_t *a3) {
+ const int16x4_t b0 = vhadd_s16(*a0, *a1);
+ const int16x4_t b1 = vhsub_s16(*a0, *a1);
+ const int16x4_t b2 = vhadd_s16(*a2, *a3);
+ const int16x4_t b3 = vhsub_s16(*a2, *a3);
+
+ *a0 = vadd_s16(b0, b2);
+ *a1 = vadd_s16(b1, b3);
+ *a2 = vsub_s16(b0, b2);
+ *a3 = vsub_s16(b1, b3);
+}
+
+void aom_hadamard_4x4_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ int16x4_t a0 = vld1_s16(src_diff);
+ int16x4_t a1 = vld1_s16(src_diff + src_stride);
+ int16x4_t a2 = vld1_s16(src_diff + 2 * src_stride);
+ int16x4_t a3 = vld1_s16(src_diff + 3 * src_stride);
+
+ hadamard_4x4_one_pass(&a0, &a1, &a2, &a3);
+
+ transpose_elems_inplace_s16_4x4(&a0, &a1, &a2, &a3);
+
+ hadamard_4x4_one_pass(&a0, &a1, &a2, &a3);
+
+ store_s16_to_tran_low(coeff, a0);
+ store_s16_to_tran_low(coeff + 4, a1);
+ store_s16_to_tran_low(coeff + 8, a2);
+ store_s16_to_tran_low(coeff + 12, a3);
+}
+
+static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
+ int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
+ int16x8_t *a6, int16x8_t *a7) {
+ const int16x8_t b0 = vaddq_s16(*a0, *a1);
+ const int16x8_t b1 = vsubq_s16(*a0, *a1);
+ const int16x8_t b2 = vaddq_s16(*a2, *a3);
+ const int16x8_t b3 = vsubq_s16(*a2, *a3);
+ const int16x8_t b4 = vaddq_s16(*a4, *a5);
+ const int16x8_t b5 = vsubq_s16(*a4, *a5);
+ const int16x8_t b6 = vaddq_s16(*a6, *a7);
+ const int16x8_t b7 = vsubq_s16(*a6, *a7);
+
+ const int16x8_t c0 = vaddq_s16(b0, b2);
+ const int16x8_t c1 = vaddq_s16(b1, b3);
+ const int16x8_t c2 = vsubq_s16(b0, b2);
+ const int16x8_t c3 = vsubq_s16(b1, b3);
+ const int16x8_t c4 = vaddq_s16(b4, b6);
+ const int16x8_t c5 = vaddq_s16(b5, b7);
+ const int16x8_t c6 = vsubq_s16(b4, b6);
+ const int16x8_t c7 = vsubq_s16(b5, b7);
+
+ *a0 = vaddq_s16(c0, c4);
+ *a1 = vsubq_s16(c2, c6);
+ *a2 = vsubq_s16(c0, c4);
+ *a3 = vaddq_s16(c2, c6);
+ *a4 = vaddq_s16(c3, c7);
+ *a5 = vsubq_s16(c3, c7);
+ *a6 = vsubq_s16(c1, c5);
+ *a7 = vaddq_s16(c1, c5);
+}
+
+void aom_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ int16x8_t a0 = vld1q_s16(src_diff);
+ int16x8_t a1 = vld1q_s16(src_diff + src_stride);
+ int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride);
+ int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride);
+ int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride);
+ int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride);
+ int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride);
+ int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride);
+
+ hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+ transpose_elems_inplace_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+ hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+ // Skip the second transpose because it is not required.
+
+ store_s16q_to_tran_low(coeff + 0, a0);
+ store_s16q_to_tran_low(coeff + 8, a1);
+ store_s16q_to_tran_low(coeff + 16, a2);
+ store_s16q_to_tran_low(coeff + 24, a3);
+ store_s16q_to_tran_low(coeff + 32, a4);
+ store_s16q_to_tran_low(coeff + 40, a5);
+ store_s16q_to_tran_low(coeff + 48, a6);
+ store_s16q_to_tran_low(coeff + 56, a7);
+}
+
+void aom_hadamard_lp_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+ int16_t *coeff) {
+ int16x8_t a0 = vld1q_s16(src_diff);
+ int16x8_t a1 = vld1q_s16(src_diff + src_stride);
+ int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride);
+ int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride);
+ int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride);
+ int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride);
+ int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride);
+ int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride);
+
+ hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+ transpose_elems_inplace_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+ hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+ // Skip the second transpose because it is not required.
+
+ vst1q_s16(coeff + 0, a0);
+ vst1q_s16(coeff + 8, a1);
+ vst1q_s16(coeff + 16, a2);
+ vst1q_s16(coeff + 24, a3);
+ vst1q_s16(coeff + 32, a4);
+ vst1q_s16(coeff + 40, a5);
+ vst1q_s16(coeff + 48, a6);
+ vst1q_s16(coeff + 56, a7);
+}
+
+void aom_hadamard_lp_8x8_dual_neon(const int16_t *src_diff,
+ ptrdiff_t src_stride, int16_t *coeff) {
+ for (int i = 0; i < 2; i++) {
+ aom_hadamard_lp_8x8_neon(src_diff + (i * 8), src_stride, coeff + (i * 64));
+ }
+}
+
+void aom_hadamard_lp_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+ int16_t *coeff) {
+ /* Rearrange 16x16 to 8x32 and remove stride.
+ * Top left first. */
+ aom_hadamard_lp_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride,
+ coeff + 0);
+ /* Top right. */
+ aom_hadamard_lp_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride,
+ coeff + 64);
+ /* Bottom left. */
+ aom_hadamard_lp_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride,
+ coeff + 128);
+ /* Bottom right. */
+ aom_hadamard_lp_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride,
+ coeff + 192);
+
+ for (int i = 0; i < 64; i += 8) {
+ const int16x8_t a0 = vld1q_s16(coeff + 0);
+ const int16x8_t a1 = vld1q_s16(coeff + 64);
+ const int16x8_t a2 = vld1q_s16(coeff + 128);
+ const int16x8_t a3 = vld1q_s16(coeff + 192);
+
+ const int16x8_t b0 = vhaddq_s16(a0, a1);
+ const int16x8_t b1 = vhsubq_s16(a0, a1);
+ const int16x8_t b2 = vhaddq_s16(a2, a3);
+ const int16x8_t b3 = vhsubq_s16(a2, a3);
+
+ const int16x8_t c0 = vaddq_s16(b0, b2);
+ const int16x8_t c1 = vaddq_s16(b1, b3);
+ const int16x8_t c2 = vsubq_s16(b0, b2);
+ const int16x8_t c3 = vsubq_s16(b1, b3);
+
+ vst1q_s16(coeff + 0, c0);
+ vst1q_s16(coeff + 64, c1);
+ vst1q_s16(coeff + 128, c2);
+ vst1q_s16(coeff + 192, c3);
+
+ coeff += 8;
+ }
+}
+
+void aom_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ /* Rearrange 16x16 to 8x32 and remove stride.
+ * Top left first. */
+ aom_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
+ /* Top right. */
+ aom_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64);
+ /* Bottom left. */
+ aom_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128);
+ /* Bottom right. */
+ aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);
+
+ // Each iteration of the loop operates on entire rows (16 samples each)
+ // because we need to swap the second and third quarters of every row in the
+ // output to match AVX2 output (i.e., aom_hadamard_16x16_avx2). See the for
+ // loop at the end of aom_hadamard_16x16_c.
+ for (int i = 0; i < 64; i += 16) {
+ const int32x4_t a00 = vld1q_s32(coeff + 0);
+ const int32x4_t a01 = vld1q_s32(coeff + 64);
+ const int32x4_t a02 = vld1q_s32(coeff + 128);
+ const int32x4_t a03 = vld1q_s32(coeff + 192);
+
+ const int32x4_t b00 = vhaddq_s32(a00, a01);
+ const int32x4_t b01 = vhsubq_s32(a00, a01);
+ const int32x4_t b02 = vhaddq_s32(a02, a03);
+ const int32x4_t b03 = vhsubq_s32(a02, a03);
+
+ const int32x4_t c00 = vaddq_s32(b00, b02);
+ const int32x4_t c01 = vaddq_s32(b01, b03);
+ const int32x4_t c02 = vsubq_s32(b00, b02);
+ const int32x4_t c03 = vsubq_s32(b01, b03);
+
+ const int32x4_t a10 = vld1q_s32(coeff + 4 + 0);
+ const int32x4_t a11 = vld1q_s32(coeff + 4 + 64);
+ const int32x4_t a12 = vld1q_s32(coeff + 4 + 128);
+ const int32x4_t a13 = vld1q_s32(coeff + 4 + 192);
+
+ const int32x4_t b10 = vhaddq_s32(a10, a11);
+ const int32x4_t b11 = vhsubq_s32(a10, a11);
+ const int32x4_t b12 = vhaddq_s32(a12, a13);
+ const int32x4_t b13 = vhsubq_s32(a12, a13);
+
+ const int32x4_t c10 = vaddq_s32(b10, b12);
+ const int32x4_t c11 = vaddq_s32(b11, b13);
+ const int32x4_t c12 = vsubq_s32(b10, b12);
+ const int32x4_t c13 = vsubq_s32(b11, b13);
+
+ const int32x4_t a20 = vld1q_s32(coeff + 8 + 0);
+ const int32x4_t a21 = vld1q_s32(coeff + 8 + 64);
+ const int32x4_t a22 = vld1q_s32(coeff + 8 + 128);
+ const int32x4_t a23 = vld1q_s32(coeff + 8 + 192);
+
+ const int32x4_t b20 = vhaddq_s32(a20, a21);
+ const int32x4_t b21 = vhsubq_s32(a20, a21);
+ const int32x4_t b22 = vhaddq_s32(a22, a23);
+ const int32x4_t b23 = vhsubq_s32(a22, a23);
+
+ const int32x4_t c20 = vaddq_s32(b20, b22);
+ const int32x4_t c21 = vaddq_s32(b21, b23);
+ const int32x4_t c22 = vsubq_s32(b20, b22);
+ const int32x4_t c23 = vsubq_s32(b21, b23);
+
+ const int32x4_t a30 = vld1q_s32(coeff + 12 + 0);
+ const int32x4_t a31 = vld1q_s32(coeff + 12 + 64);
+ const int32x4_t a32 = vld1q_s32(coeff + 12 + 128);
+ const int32x4_t a33 = vld1q_s32(coeff + 12 + 192);
+
+ const int32x4_t b30 = vhaddq_s32(a30, a31);
+ const int32x4_t b31 = vhsubq_s32(a30, a31);
+ const int32x4_t b32 = vhaddq_s32(a32, a33);
+ const int32x4_t b33 = vhsubq_s32(a32, a33);
+
+ const int32x4_t c30 = vaddq_s32(b30, b32);
+ const int32x4_t c31 = vaddq_s32(b31, b33);
+ const int32x4_t c32 = vsubq_s32(b30, b32);
+ const int32x4_t c33 = vsubq_s32(b31, b33);
+
+ vst1q_s32(coeff + 0 + 0, c00);
+ vst1q_s32(coeff + 0 + 4, c20);
+ vst1q_s32(coeff + 0 + 8, c10);
+ vst1q_s32(coeff + 0 + 12, c30);
+
+ vst1q_s32(coeff + 64 + 0, c01);
+ vst1q_s32(coeff + 64 + 4, c21);
+ vst1q_s32(coeff + 64 + 8, c11);
+ vst1q_s32(coeff + 64 + 12, c31);
+
+ vst1q_s32(coeff + 128 + 0, c02);
+ vst1q_s32(coeff + 128 + 4, c22);
+ vst1q_s32(coeff + 128 + 8, c12);
+ vst1q_s32(coeff + 128 + 12, c32);
+
+ vst1q_s32(coeff + 192 + 0, c03);
+ vst1q_s32(coeff + 192 + 4, c23);
+ vst1q_s32(coeff + 192 + 8, c13);
+ vst1q_s32(coeff + 192 + 12, c33);
+
+ coeff += 16;
+ }
+}
+
+void aom_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ /* Top left first. */
+ aom_hadamard_16x16_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
+ /* Top right. */
+ aom_hadamard_16x16_neon(src_diff + 16 + 0 * src_stride, src_stride,
+ coeff + 256);
+ /* Bottom left. */
+ aom_hadamard_16x16_neon(src_diff + 0 + 16 * src_stride, src_stride,
+ coeff + 512);
+ /* Bottom right. */
+ aom_hadamard_16x16_neon(src_diff + 16 + 16 * src_stride, src_stride,
+ coeff + 768);
+
+ for (int i = 0; i < 256; i += 4) {
+ const int32x4_t a0 = vld1q_s32(coeff);
+ const int32x4_t a1 = vld1q_s32(coeff + 256);
+ const int32x4_t a2 = vld1q_s32(coeff + 512);
+ const int32x4_t a3 = vld1q_s32(coeff + 768);
+
+ const int32x4_t b0 = vshrq_n_s32(vaddq_s32(a0, a1), 2);
+ const int32x4_t b1 = vshrq_n_s32(vsubq_s32(a0, a1), 2);
+ const int32x4_t b2 = vshrq_n_s32(vaddq_s32(a2, a3), 2);
+ const int32x4_t b3 = vshrq_n_s32(vsubq_s32(a2, a3), 2);
+
+ const int32x4_t c0 = vaddq_s32(b0, b2);
+ const int32x4_t c1 = vaddq_s32(b1, b3);
+ const int32x4_t c2 = vsubq_s32(b0, b2);
+ const int32x4_t c3 = vsubq_s32(b1, b3);
+
+ vst1q_s32(coeff + 0, c0);
+ vst1q_s32(coeff + 256, c1);
+ vst1q_s32(coeff + 512, c2);
+ vst1q_s32(coeff + 768, c3);
+
+ coeff += 4;
+ }
+}
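
For orientation, hadamard_4x4_one_pass is the vector form of the scalar
butterfly below, applied once per column and once per row around the 4x4
transpose. An illustrative sketch only; intermediate sums are widened to int
to mirror the non-overflowing vhadd_s16 / vhsub_s16 halving operations, and an
arithmetic right shift of negative values is assumed.

#include <stdint.h>

static void hadamard4_one_pass_scalar(int16_t v[4]) {
  // Halving butterfly (matches vhadd_s16 / vhsub_s16)...
  const int16_t b0 = (int16_t)(((int)v[0] + v[1]) >> 1);
  const int16_t b1 = (int16_t)(((int)v[0] - v[1]) >> 1);
  const int16_t b2 = (int16_t)(((int)v[2] + v[3]) >> 1);
  const int16_t b3 = (int16_t)(((int)v[2] - v[3]) >> 1);
  // ...followed by a plain butterfly (matches vadd_s16 / vsub_s16).
  v[0] = (int16_t)(b0 + b2);
  v[1] = (int16_t)(b1 + b3);
  v[2] = (int16_t)(b0 - b2);
  v[3] = (int16_t)(b1 - b3);
}
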
diff --git a/third_party/aom/aom_dsp/arm/highbd_avg_neon.c b/third_party/aom/aom_dsp/arm/highbd_avg_neon.c
new file mode 100644
index 0000000000..47d5dae012
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_avg_neon.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_ports/mem.h"
+
+uint32_t aom_highbd_avg_4x4_neon(const uint8_t *a, int a_stride) {
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a);
+ uint16x4_t sum, a0, a1, a2, a3;
+
+ load_u16_4x4(a_ptr, a_stride, &a0, &a1, &a2, &a3);
+
+ sum = vadd_u16(a0, a1);
+ sum = vadd_u16(sum, a2);
+ sum = vadd_u16(sum, a3);
+
+ return (horizontal_add_u16x4(sum) + (1 << 3)) >> 4;
+}
+
+uint32_t aom_highbd_avg_8x8_neon(const uint8_t *a, int a_stride) {
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a);
+ uint16x8_t sum, a0, a1, a2, a3, a4, a5, a6, a7;
+
+ load_u16_8x8(a_ptr, a_stride, &a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);
+
+ sum = vaddq_u16(a0, a1);
+ sum = vaddq_u16(sum, a2);
+ sum = vaddq_u16(sum, a3);
+ sum = vaddq_u16(sum, a4);
+ sum = vaddq_u16(sum, a5);
+ sum = vaddq_u16(sum, a6);
+ sum = vaddq_u16(sum, a7);
+
+ return (horizontal_add_u16x8(sum) + (1 << 5)) >> 6;
+}
+
+void aom_highbd_minmax_8x8_neon(const uint8_t *s8, int p, const uint8_t *d8,
+ int dp, int *min, int *max) {
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(s8);
+ const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(d8);
+
+ const uint16x8_t a0 = vld1q_u16(a_ptr + 0 * p);
+ const uint16x8_t a1 = vld1q_u16(a_ptr + 1 * p);
+ const uint16x8_t a2 = vld1q_u16(a_ptr + 2 * p);
+ const uint16x8_t a3 = vld1q_u16(a_ptr + 3 * p);
+ const uint16x8_t a4 = vld1q_u16(a_ptr + 4 * p);
+ const uint16x8_t a5 = vld1q_u16(a_ptr + 5 * p);
+ const uint16x8_t a6 = vld1q_u16(a_ptr + 6 * p);
+ const uint16x8_t a7 = vld1q_u16(a_ptr + 7 * p);
+
+ const uint16x8_t b0 = vld1q_u16(b_ptr + 0 * dp);
+ const uint16x8_t b1 = vld1q_u16(b_ptr + 1 * dp);
+ const uint16x8_t b2 = vld1q_u16(b_ptr + 2 * dp);
+ const uint16x8_t b3 = vld1q_u16(b_ptr + 3 * dp);
+ const uint16x8_t b4 = vld1q_u16(b_ptr + 4 * dp);
+ const uint16x8_t b5 = vld1q_u16(b_ptr + 5 * dp);
+ const uint16x8_t b6 = vld1q_u16(b_ptr + 6 * dp);
+ const uint16x8_t b7 = vld1q_u16(b_ptr + 7 * dp);
+
+ const uint16x8_t abs_diff0 = vabdq_u16(a0, b0);
+ const uint16x8_t abs_diff1 = vabdq_u16(a1, b1);
+ const uint16x8_t abs_diff2 = vabdq_u16(a2, b2);
+ const uint16x8_t abs_diff3 = vabdq_u16(a3, b3);
+ const uint16x8_t abs_diff4 = vabdq_u16(a4, b4);
+ const uint16x8_t abs_diff5 = vabdq_u16(a5, b5);
+ const uint16x8_t abs_diff6 = vabdq_u16(a6, b6);
+ const uint16x8_t abs_diff7 = vabdq_u16(a7, b7);
+
+ const uint16x8_t max01 = vmaxq_u16(abs_diff0, abs_diff1);
+ const uint16x8_t max23 = vmaxq_u16(abs_diff2, abs_diff3);
+ const uint16x8_t max45 = vmaxq_u16(abs_diff4, abs_diff5);
+ const uint16x8_t max67 = vmaxq_u16(abs_diff6, abs_diff7);
+
+ const uint16x8_t max0123 = vmaxq_u16(max01, max23);
+ const uint16x8_t max4567 = vmaxq_u16(max45, max67);
+ const uint16x8_t max07 = vmaxq_u16(max0123, max4567);
+
+ const uint16x8_t min01 = vminq_u16(abs_diff0, abs_diff1);
+ const uint16x8_t min23 = vminq_u16(abs_diff2, abs_diff3);
+ const uint16x8_t min45 = vminq_u16(abs_diff4, abs_diff5);
+ const uint16x8_t min67 = vminq_u16(abs_diff6, abs_diff7);
+
+ const uint16x8_t min0123 = vminq_u16(min01, min23);
+ const uint16x8_t min4567 = vminq_u16(min45, min67);
+ const uint16x8_t min07 = vminq_u16(min0123, min4567);
+
+#if AOM_ARCH_AARCH64
+ *max = (int)vmaxvq_u16(max07);
+ *min = (int)vminvq_u16(min07);
+#else
+ // Split into 64-bit vectors and execute pairwise min/max.
+ uint16x4_t ab_max = vmax_u16(vget_high_u16(max07), vget_low_u16(max07));
+ uint16x4_t ab_min = vmin_u16(vget_high_u16(min07), vget_low_u16(min07));
+
+ // Enough runs of vpmax/min propagate the max/min values to every position.
+ ab_max = vpmax_u16(ab_max, ab_max);
+ ab_min = vpmin_u16(ab_min, ab_min);
+
+ ab_max = vpmax_u16(ab_max, ab_max);
+ ab_min = vpmin_u16(ab_min, ab_min);
+
+ ab_max = vpmax_u16(ab_max, ab_max);
+ ab_min = vpmin_u16(ab_min, ab_min);
+
+ *min = *max = 0; // Clear high bits
+ // Store directly to avoid costly neon->gpr transfer.
+ vst1_lane_u16((uint16_t *)max, ab_max, 0);
+ vst1_lane_u16((uint16_t *)min, ab_min, 0);
+#endif
+}
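
A scalar model of the 8x8 average above, operating directly on 16-bit samples
rather than the CONVERT_TO_SHORTPTR-wrapped uint8_t pointer used by the
library API (a sketch only; the 4x4 variant is identical with a final
(sum + 8) >> 4 step):

#include <stdint.h>

static uint32_t highbd_avg_8x8_scalar(const uint16_t *a, int a_stride) {
  uint32_t sum = 0;
  for (int r = 0; r < 8; ++r) {
    for (int c = 0; c < 8; ++c) sum += a[r * a_stride + c];
  }
  // Rounded mean of 64 samples: matches the (1 << 5) offset and >> 6 above.
  return (sum + 32) >> 6;
}
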
diff --git a/third_party/aom/aom_dsp/arm/highbd_avg_pred_neon.c b/third_party/aom/aom_dsp/arm/highbd_avg_pred_neon.c
new file mode 100644
index 0000000000..531309b025
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_avg_pred_neon.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
+void aom_highbd_comp_avg_pred_neon(uint8_t *comp_pred8, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride) {
+ const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+
+ int i = height;
+ if (width > 8) {
+ do {
+ int j = 0;
+ do {
+ const uint16x8_t p = vld1q_u16(pred + j);
+ const uint16x8_t r = vld1q_u16(ref + j);
+
+ uint16x8_t avg = vrhaddq_u16(p, r);
+ vst1q_u16(comp_pred + j, avg);
+
+ j += 8;
+ } while (j < width);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--i != 0);
+ } else if (width == 8) {
+ do {
+ const uint16x8_t p = vld1q_u16(pred);
+ const uint16x8_t r = vld1q_u16(ref);
+
+ uint16x8_t avg = vrhaddq_u16(p, r);
+ vst1q_u16(comp_pred, avg);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--i != 0);
+ } else {
+ assert(width == 4);
+ do {
+ const uint16x4_t p = vld1_u16(pred);
+ const uint16x4_t r = vld1_u16(ref);
+
+ uint16x4_t avg = vrhadd_u16(p, r);
+ vst1_u16(comp_pred, avg);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--i != 0);
+ }
+}
+
+void aom_highbd_comp_mask_pred_neon(uint8_t *comp_pred8, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride, const uint8_t *mask,
+ int mask_stride, int invert_mask) {
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+
+ const uint16_t *src0 = invert_mask ? pred : ref;
+ const uint16_t *src1 = invert_mask ? ref : pred;
+ const int src_stride0 = invert_mask ? width : ref_stride;
+ const int src_stride1 = invert_mask ? ref_stride : width;
+
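+  // alpha_blend_a64_u16x8/x4() (from blend_neon.h) are assumed to compute
+  // (m * s0 + (64 - m) * s1 + 32) >> 6 per lane, i.e. AOM_BLEND_A64() with
+  // AOM_BLEND_A64_MAX_ALPHA == 64 and AOM_BLEND_A64_ROUND_BITS == 6.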
+ if (width >= 8) {
+ do {
+ int j = 0;
+
+ do {
+ const uint16x8_t s0 = vld1q_u16(src0 + j);
+ const uint16x8_t s1 = vld1q_u16(src1 + j);
+ const uint16x8_t m0 = vmovl_u8(vld1_u8(mask + j));
+
+ uint16x8_t blend_u16 = alpha_blend_a64_u16x8(m0, s0, s1);
+
+ vst1q_u16(comp_pred + j, blend_u16);
+
+ j += 8;
+ } while (j < width);
+
+ src0 += src_stride0;
+ src1 += src_stride1;
+ mask += mask_stride;
+ comp_pred += width;
+ } while (--height != 0);
+ } else {
+ assert(width == 4);
+
+ do {
+ const uint16x4_t s0 = vld1_u16(src0);
+ const uint16x4_t s1 = vld1_u16(src1);
+ const uint16x4_t m0 = vget_low_u16(vmovl_u8(load_unaligned_u8_4x1(mask)));
+
+ uint16x4_t blend_u16 = alpha_blend_a64_u16x4(m0, s0, s1);
+
+ vst1_u16(comp_pred, blend_u16);
+
+ src0 += src_stride0;
+ src1 += src_stride1;
+ mask += mask_stride;
+ comp_pred += 4;
+ } while (--height != 0);
+ }
+}
+
+void aom_highbd_dist_wtd_comp_avg_pred_neon(
+ uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
+ const uint8_t *ref8, int ref_stride,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint16x8_t fwd_offset_u16 = vdupq_n_u16(jcp_param->fwd_offset);
+ const uint16x8_t bck_offset_u16 = vdupq_n_u16(jcp_param->bck_offset);
+ const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+
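+  // dist_wtd_avg_u16x8/x4() are assumed to compute the weighted rounding
+  // average (fwd_offset * r + bck_offset * p + 8) >> 4, where the two weights
+  // taken from jcp_param sum to 16 (1 << DIST_PRECISION_BITS).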
+ if (width > 8) {
+ do {
+ int j = 0;
+ do {
+ const uint16x8_t p = vld1q_u16(pred + j);
+ const uint16x8_t r = vld1q_u16(ref + j);
+
+ const uint16x8_t avg =
+ dist_wtd_avg_u16x8(r, p, fwd_offset_u16, bck_offset_u16);
+
+ vst1q_u16(comp_pred + j, avg);
+
+ j += 8;
+ } while (j < width);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--height != 0);
+ } else if (width == 8) {
+ do {
+ const uint16x8_t p = vld1q_u16(pred);
+ const uint16x8_t r = vld1q_u16(ref);
+
+ const uint16x8_t avg =
+ dist_wtd_avg_u16x8(r, p, fwd_offset_u16, bck_offset_u16);
+
+ vst1q_u16(comp_pred, avg);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--height != 0);
+ } else {
+ assert(width == 4);
+ do {
+ const uint16x4_t p = vld1_u16(pred);
+ const uint16x4_t r = vld1_u16(ref);
+
+ const uint16x4_t avg = dist_wtd_avg_u16x4(
+ r, p, vget_low_u16(fwd_offset_u16), vget_low_u16(bck_offset_u16));
+
+ vst1_u16(comp_pred, avg);
+
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ } while (--height != 0);
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/highbd_blend_a64_hmask_neon.c b/third_party/aom/aom_dsp/arm/highbd_blend_a64_hmask_neon.c
new file mode 100644
index 0000000000..8b03e91ac3
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_blend_a64_hmask_neon.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
+void aom_highbd_blend_a64_hmask_neon(uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8,
+ uint32_t src0_stride,
+ const uint8_t *src1_8,
+ uint32_t src1_stride, const uint8_t *mask,
+ int w, int h, int bd) {
+ (void)bd;
+
+ const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ if (w >= 8) {
+ do {
+ int i = 0;
+ do {
+ uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i));
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1);
+
+ vst1q_u16(dst + i, blend);
+ i += 8;
+ } while (i < w);
+
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else if (w == 4) {
+ const uint16x8_t m0 = vmovl_u8(load_unaligned_dup_u8_4x2(mask));
+ do {
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1);
+
+ store_u16x4_strided_x2(dst, dst_stride, blend);
+
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w == 2 && h >= 8) {
+ const uint16x4_t m0 =
+ vget_low_u16(vmovl_u8(load_unaligned_dup_u8_2x4(mask)));
+ do {
+ uint16x4_t s0 = load_unaligned_u16_2x2(src0, src0_stride);
+ uint16x4_t s1 = load_unaligned_u16_2x2(src1, src1_stride);
+
+ uint16x4_t blend = alpha_blend_a64_u16x4(m0, s0, s1);
+
+ store_u16x2_strided_x2(dst, dst_stride, blend);
+
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else {
+ aom_highbd_blend_a64_hmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+ src1_stride, mask, w, h, bd);
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/highbd_blend_a64_mask_neon.c b/third_party/aom/aom_dsp/arm/highbd_blend_a64_mask_neon.c
new file mode 100644
index 0000000000..90b44fcc5e
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_blend_a64_mask_neon.c
@@ -0,0 +1,473 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
+#define HBD_BLEND_A64_D16_MASK(bd, round0_bits) \
+ static INLINE uint16x8_t alpha_##bd##_blend_a64_d16_u16x8( \
+ uint16x8_t m, uint16x8_t a, uint16x8_t b, int32x4_t round_offset) { \
+ const uint16x8_t m_inv = \
+ vsubq_u16(vdupq_n_u16(AOM_BLEND_A64_MAX_ALPHA), m); \
+ \
+ uint32x4_t blend_u32_lo = vmlal_u16(vreinterpretq_u32_s32(round_offset), \
+ vget_low_u16(m), vget_low_u16(a)); \
+ uint32x4_t blend_u32_hi = vmlal_u16(vreinterpretq_u32_s32(round_offset), \
+ vget_high_u16(m), vget_high_u16(a)); \
+ \
+ blend_u32_lo = \
+ vmlal_u16(blend_u32_lo, vget_low_u16(m_inv), vget_low_u16(b)); \
+ blend_u32_hi = \
+ vmlal_u16(blend_u32_hi, vget_high_u16(m_inv), vget_high_u16(b)); \
+ \
+ uint16x4_t blend_u16_lo = \
+ vqrshrun_n_s32(vreinterpretq_s32_u32(blend_u32_lo), \
+ AOM_BLEND_A64_ROUND_BITS + 2 * FILTER_BITS - \
+ round0_bits - COMPOUND_ROUND1_BITS); \
+ uint16x4_t blend_u16_hi = \
+ vqrshrun_n_s32(vreinterpretq_s32_u32(blend_u32_hi), \
+ AOM_BLEND_A64_ROUND_BITS + 2 * FILTER_BITS - \
+ round0_bits - COMPOUND_ROUND1_BITS); \
+ \
+ uint16x8_t blend_u16 = vcombine_u16(blend_u16_lo, blend_u16_hi); \
+ blend_u16 = vminq_u16(blend_u16, vdupq_n_u16((1 << bd) - 1)); \
+ \
+ return blend_u16; \
+ } \
+ \
+ static INLINE void highbd_##bd##_blend_a64_d16_mask_neon( \
+ uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, \
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, \
+ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, \
+ int subh) { \
+ const int offset_bits = bd + 2 * FILTER_BITS - round0_bits; \
+ int32_t round_offset = (1 << (offset_bits - COMPOUND_ROUND1_BITS)) + \
+ (1 << (offset_bits - COMPOUND_ROUND1_BITS - 1)); \
+ int32x4_t offset = \
+ vdupq_n_s32(-(round_offset << AOM_BLEND_A64_ROUND_BITS)); \
+ \
+ if ((subw | subh) == 0) { \
+ if (w >= 8) { \
+ do { \
+ int i = 0; \
+ do { \
+ uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i)); \
+ uint16x8_t s0 = vld1q_u16(src0 + i); \
+ uint16x8_t s1 = vld1q_u16(src1 + i); \
+ \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m0, s0, s1, offset); \
+ \
+ vst1q_u16(dst + i, blend); \
+ i += 8; \
+ } while (i < w); \
+ \
+ mask += mask_stride; \
+ src0 += src0_stride; \
+ src1 += src1_stride; \
+ dst += dst_stride; \
+ } while (--h != 0); \
+ } else { \
+ do { \
+ uint16x8_t m0 = vmovl_u8(load_unaligned_u8_4x2(mask, mask_stride)); \
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); \
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); \
+ \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m0, s0, s1, offset); \
+ \
+ store_u16x4_strided_x2(dst, dst_stride, blend); \
+ \
+ mask += 2 * mask_stride; \
+ src0 += 2 * src0_stride; \
+ src1 += 2 * src1_stride; \
+ dst += 2 * dst_stride; \
+ h -= 2; \
+ } while (h != 0); \
+ } \
+ } else if ((subw & subh) == 1) { \
+ if (w >= 8) { \
+ do { \
+ int i = 0; \
+ do { \
+ uint8x16_t m0 = vld1q_u8(mask + 0 * mask_stride + 2 * i); \
+ uint8x16_t m1 = vld1q_u8(mask + 1 * mask_stride + 2 * i); \
+ uint16x8_t s0 = vld1q_u16(src0 + i); \
+ uint16x8_t s1 = vld1q_u16(src1 + i); \
+ \
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4( \
+ vget_low_u8(m0), vget_low_u8(m1), vget_high_u8(m0), \
+ vget_high_u8(m1))); \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
+ \
+ vst1q_u16(dst + i, blend); \
+ i += 8; \
+ } while (i < w); \
+ \
+ mask += 2 * mask_stride; \
+ src0 += src0_stride; \
+ src1 += src1_stride; \
+ dst += dst_stride; \
+ } while (--h != 0); \
+ } else { \
+ do { \
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride); \
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride); \
+ uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride); \
+ uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride); \
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); \
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); \
+ \
+ uint16x8_t m_avg = \
+ vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3)); \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
+ \
+ store_u16x4_strided_x2(dst, dst_stride, blend); \
+ \
+ mask += 4 * mask_stride; \
+ src0 += 2 * src0_stride; \
+ src1 += 2 * src1_stride; \
+ dst += 2 * dst_stride; \
+ h -= 2; \
+ } while (h != 0); \
+ } \
+ } else if (subw == 1 && subh == 0) { \
+ if (w >= 8) { \
+ do { \
+ int i = 0; \
+ do { \
+ uint8x8_t m0 = vld1_u8(mask + 2 * i); \
+ uint8x8_t m1 = vld1_u8(mask + 2 * i + 8); \
+ uint16x8_t s0 = vld1q_u16(src0 + i); \
+ uint16x8_t s1 = vld1q_u16(src1 + i); \
+ \
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1)); \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
+ \
+ vst1q_u16(dst + i, blend); \
+ i += 8; \
+ } while (i < w); \
+ \
+ mask += mask_stride; \
+ src0 += src0_stride; \
+ src1 += src1_stride; \
+ dst += dst_stride; \
+ } while (--h != 0); \
+ } else { \
+ do { \
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride); \
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride); \
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); \
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); \
+ \
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1)); \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
+ \
+ store_u16x4_strided_x2(dst, dst_stride, blend); \
+ \
+ mask += 2 * mask_stride; \
+ src0 += 2 * src0_stride; \
+ src1 += 2 * src1_stride; \
+ dst += 2 * dst_stride; \
+ h -= 2; \
+ } while (h != 0); \
+ } \
+ } else { \
+ if (w >= 8) { \
+ do { \
+ int i = 0; \
+ do { \
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + i); \
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + i); \
+ uint16x8_t s0 = vld1q_u16(src0 + i); \
+ uint16x8_t s1 = vld1q_u16(src1 + i); \
+ \
+ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0, m1)); \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
+ \
+ vst1q_u16(dst + i, blend); \
+ i += 8; \
+ } while (i < w); \
+ \
+ mask += 2 * mask_stride; \
+ src0 += src0_stride; \
+ src1 += src1_stride; \
+ dst += dst_stride; \
+ } while (--h != 0); \
+ } else { \
+ do { \
+ uint8x8_t m0_2 = \
+ load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride); \
+ uint8x8_t m1_3 = \
+ load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride); \
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride); \
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride); \
+ \
+ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3)); \
+ uint16x8_t blend = \
+ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \
+ \
+ store_u16x4_strided_x2(dst, dst_stride, blend); \
+ \
+ mask += 4 * mask_stride; \
+ src0 += 2 * src0_stride; \
+ src1 += 2 * src1_stride; \
+ dst += 2 * dst_stride; \
+ h -= 2; \
+ } while (h != 0); \
+ } \
+ } \
+ }
+
+// 12 bitdepth
+HBD_BLEND_A64_D16_MASK(12, (ROUND0_BITS + 2))
+// 10 bitdepth
+HBD_BLEND_A64_D16_MASK(10, ROUND0_BITS)
+// 8 bitdepth
+HBD_BLEND_A64_D16_MASK(8, ROUND0_BITS)
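+
+// For reference, assuming the usual AV1 constants (FILTER_BITS == 7,
+// ROUND0_BITS == 3, COMPOUND_ROUND1_BITS == 7, AOM_BLEND_A64_ROUND_BITS == 6),
+// the 10-bit variant works with offset_bits = 10 + 14 - 3 = 21, so
+// round_offset = (1 << 14) + (1 << 13) = 24576 and a final narrowing shift of
+// 6 + 14 - 3 - 7 = 10; the 12-bit variant uses round0_bits = 5, giving the
+// same round_offset but a narrowing shift of only 8.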
+
+void aom_highbd_blend_a64_d16_mask_neon(
+ uint8_t *dst_8, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+ ConvolveParams *conv_params, const int bd) {
+ (void)conv_params;
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ if (bd == 12) {
+ highbd_12_blend_a64_d16_mask_neon(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ subw, subh);
+ } else if (bd == 10) {
+ highbd_10_blend_a64_d16_mask_neon(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ subw, subh);
+ } else {
+ highbd_8_blend_a64_d16_mask_neon(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h, subw,
+ subh);
+ }
+}
+
+void aom_highbd_blend_a64_mask_neon(uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8, uint32_t src0_stride,
+ const uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int w, int h, int subw, int subh, int bd) {
+ (void)bd;
+
+ const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ if ((subw | subh) == 0) {
+ if (w >= 8) {
+ do {
+ int i = 0;
+ do {
+ uint16x8_t m0 = vmovl_u8(vld1_u8(mask + i));
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1);
+
+ vst1q_u16(dst + i, blend);
+ i += 8;
+ } while (i < w);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint16x8_t m0 = vmovl_u8(load_unaligned_u8_4x2(mask, mask_stride));
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1);
+
+ store_u16x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 2 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else if ((subw & subh) == 1) {
+ if (w >= 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + 2 * i);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + 2 * i);
+ uint8x8_t m2 = vld1_u8(mask + 0 * mask_stride + 2 * i + 8);
+ uint8x8_t m3 = vld1_u8(mask + 1 * mask_stride + 2 * i + 8);
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t m_avg =
+ vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
+
+ uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+ vst1q_u16(dst + i, blend);
+
+ i += 8;
+ } while (i < w);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint8x8_t m2 = vld1_u8(mask + 2 * mask_stride);
+ uint8x8_t m3 = vld1_u8(mask + 3 * mask_stride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3));
+ uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+ store_u16x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 4 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else if (subw == 1 && subh == 0) {
+ if (w >= 8) {
+ do {
+ int i = 0;
+
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 2 * i);
+ uint8x8_t m1 = vld1_u8(mask + 2 * i + 8);
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
+ uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+ vst1q_u16(dst + i, blend);
+
+ i += 8;
+ } while (i < w);
+
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1));
+ uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+ store_u16x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 2 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ } else {
+ if (w >= 8) {
+ do {
+ int i = 0;
+ do {
+ uint8x8_t m0 = vld1_u8(mask + 0 * mask_stride + i);
+ uint8x8_t m1 = vld1_u8(mask + 1 * mask_stride + i);
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0, m1));
+ uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+ vst1q_u16(dst + i, blend);
+
+ i += 8;
+ } while (i < w);
+
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else {
+ do {
+ uint8x8_t m0_2 =
+ load_unaligned_u8_4x2(mask + 0 * mask_stride, 2 * mask_stride);
+ uint8x8_t m1_3 =
+ load_unaligned_u8_4x2(mask + 1 * mask_stride, 2 * mask_stride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3));
+ uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1);
+
+ store_u16x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 4 * mask_stride;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/highbd_blend_a64_vmask_neon.c b/third_party/aom/aom_dsp/arm/highbd_blend_a64_vmask_neon.c
new file mode 100644
index 0000000000..1292e20342
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_blend_a64_vmask_neon.c
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/blend.h"
+
+void aom_highbd_blend_a64_vmask_neon(uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8,
+ uint32_t src0_stride,
+ const uint8_t *src1_8,
+ uint32_t src1_stride, const uint8_t *mask,
+ int w, int h, int bd) {
+ (void)bd;
+
+ const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ if (w >= 8) {
+ do {
+ uint16x8_t m = vmovl_u8(vdup_n_u8(mask[0]));
+ int i = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src0 + i);
+ uint16x8_t s1 = vld1q_u16(src1 + i);
+
+ uint16x8_t blend = alpha_blend_a64_u16x8(m, s0, s1);
+
+ vst1q_u16(dst + i, blend);
+ i += 8;
+ } while (i < w);
+
+ mask += 1;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ } while (--h != 0);
+ } else if (w == 4) {
+ do {
+ uint16x4_t m1 = vdup_n_u16((uint16_t)mask[0]);
+ uint16x4_t m2 = vdup_n_u16((uint16_t)mask[1]);
+ uint16x8_t m = vcombine_u16(m1, m2);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src0, src0_stride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src1, src1_stride);
+
+ uint16x8_t blend = alpha_blend_a64_u16x8(m, s0, s1);
+
+ store_u16x4_strided_x2(dst, dst_stride, blend);
+
+ mask += 2;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else if (w == 2 && h >= 8) {
+ do {
+ uint16x4_t m0 = vdup_n_u16(0);
+ m0 = vld1_lane_u16((uint16_t *)mask, m0, 0);
+ uint8x8_t m0_zip =
+ vzip_u8(vreinterpret_u8_u16(m0), vreinterpret_u8_u16(m0)).val[0];
+ m0 = vget_low_u16(vmovl_u8(m0_zip));
+ uint16x4_t s0 = load_unaligned_u16_2x2(src0, src0_stride);
+ uint16x4_t s1 = load_unaligned_u16_2x2(src1, src1_stride);
+
+ uint16x4_t blend = alpha_blend_a64_u16x4(m0, s0, s1);
+
+ store_u16x2_strided_x2(dst, dst_stride, blend);
+
+ mask += 2;
+ src0 += 2 * src0_stride;
+ src1 += 2 * src1_stride;
+ dst += 2 * dst_stride;
+ h -= 2;
+ } while (h != 0);
+ } else {
+ aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+ src1_stride, mask, w, h, bd);
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/highbd_convolve8_neon.c b/third_party/aom/aom_dsp/arm/highbd_convolve8_neon.c
new file mode 100644
index 0000000000..e25438c9b4
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_convolve8_neon.c
@@ -0,0 +1,363 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_ports/mem.h"
+
+static INLINE int32x4_t highbd_convolve8_4_s32(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) {
+ const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+ const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+
+ int32x4_t sum = vmull_lane_s16(s0, y_filter_lo, 0);
+ sum = vmlal_lane_s16(sum, s1, y_filter_lo, 1);
+ sum = vmlal_lane_s16(sum, s2, y_filter_lo, 2);
+ sum = vmlal_lane_s16(sum, s3, y_filter_lo, 3);
+ sum = vmlal_lane_s16(sum, s4, y_filter_hi, 0);
+ sum = vmlal_lane_s16(sum, s5, y_filter_hi, 1);
+ sum = vmlal_lane_s16(sum, s6, y_filter_hi, 2);
+ sum = vmlal_lane_s16(sum, s7, y_filter_hi, 3);
+
+ return sum;
+}
+
+static INLINE uint16x4_t highbd_convolve8_4_s32_s16(
+ const int16x4_t s0, const int16x4_t s1, const int16x4_t s2,
+ const int16x4_t s3, const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7, const int16x8_t y_filter) {
+ int32x4_t sum =
+ highbd_convolve8_4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+
+ return vqrshrun_n_s32(sum, FILTER_BITS);
+}
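+
+// Note that vqrshrun_n_s32(sum, FILTER_BITS) above narrows with rounding and
+// unsigned saturation, i.e. clamp((sum + 64) >> 7, 0, 65535) for
+// FILTER_BITS == 7; callers then clamp the result to (1 << bd) - 1.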
+
+static INLINE int32x4_t highbd_convolve8_horiz4_s32(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) {
+ const int16x8_t s2 = vextq_s16(s0, s1, 1);
+ const int16x8_t s3 = vextq_s16(s0, s1, 2);
+ const int16x8_t s4 = vextq_s16(s0, s1, 3);
+ const int16x4_t s0_lo = vget_low_s16(s0);
+ const int16x4_t s1_lo = vget_low_s16(s2);
+ const int16x4_t s2_lo = vget_low_s16(s3);
+ const int16x4_t s3_lo = vget_low_s16(s4);
+ const int16x4_t s4_lo = vget_high_s16(s0);
+ const int16x4_t s5_lo = vget_high_s16(s2);
+ const int16x4_t s6_lo = vget_high_s16(s3);
+ const int16x4_t s7_lo = vget_high_s16(s4);
+
+ return highbd_convolve8_4_s32(s0_lo, s1_lo, s2_lo, s3_lo, s4_lo, s5_lo, s6_lo,
+ s7_lo, x_filter_0_7);
+}
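+
+// The vextq_s16() calls above build the sliding windows for the 8-tap filter:
+// with s0 = { x0, ..., x7 } and s1 = { x8, ..., x15 }, vextq_s16(s0, s1, 1)
+// yields { x1, ..., x8 }, so each output pixel sees eight consecutive samples.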
+
+static INLINE uint16x4_t highbd_convolve8_horiz4_s32_s16(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) {
+ int32x4_t sum = highbd_convolve8_horiz4_s32(s0, s1, x_filter_0_7);
+
+ return vqrshrun_n_s32(sum, FILTER_BITS);
+}
+
+static INLINE void highbd_convolve8_8_s32(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter,
+ int32x4_t *sum0, int32x4_t *sum1) {
+ const int16x4_t y_filter_lo = vget_low_s16(y_filter);
+ const int16x4_t y_filter_hi = vget_high_s16(y_filter);
+
+ *sum0 = vmull_lane_s16(vget_low_s16(s0), y_filter_lo, 0);
+ *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s1), y_filter_lo, 1);
+ *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s2), y_filter_lo, 2);
+ *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s3), y_filter_lo, 3);
+ *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s4), y_filter_hi, 0);
+ *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s5), y_filter_hi, 1);
+ *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s6), y_filter_hi, 2);
+ *sum0 = vmlal_lane_s16(*sum0, vget_low_s16(s7), y_filter_hi, 3);
+
+ *sum1 = vmull_lane_s16(vget_high_s16(s0), y_filter_lo, 0);
+ *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s1), y_filter_lo, 1);
+ *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s2), y_filter_lo, 2);
+ *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s3), y_filter_lo, 3);
+ *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s4), y_filter_hi, 0);
+ *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s5), y_filter_hi, 1);
+ *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s6), y_filter_hi, 2);
+ *sum1 = vmlal_lane_s16(*sum1, vget_high_s16(s7), y_filter_hi, 3);
+}
+
+static INLINE void highbd_convolve8_horiz8_s32(const int16x8_t s0,
+ const int16x8_t s0_hi,
+ const int16x8_t x_filter_0_7,
+ int32x4_t *sum0,
+ int32x4_t *sum1) {
+ const int16x8_t s1 = vextq_s16(s0, s0_hi, 1);
+ const int16x8_t s2 = vextq_s16(s0, s0_hi, 2);
+ const int16x8_t s3 = vextq_s16(s0, s0_hi, 3);
+ const int16x8_t s4 = vextq_s16(s0, s0_hi, 4);
+ const int16x8_t s5 = vextq_s16(s0, s0_hi, 5);
+ const int16x8_t s6 = vextq_s16(s0, s0_hi, 6);
+ const int16x8_t s7 = vextq_s16(s0, s0_hi, 7);
+
+ highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_0_7, sum0,
+ sum1);
+}
+
+static INLINE uint16x8_t highbd_convolve8_horiz8_s32_s16(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t x_filter_0_7) {
+ int32x4_t sum0, sum1;
+ highbd_convolve8_horiz8_s32(s0, s1, x_filter_0_7, &sum0, &sum1);
+
+ return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
+ vqrshrun_n_s32(sum1, FILTER_BITS));
+}
+
+static INLINE uint16x8_t highbd_convolve8_8_s32_s16(
+ const int16x8_t s0, const int16x8_t s1, const int16x8_t s2,
+ const int16x8_t s3, const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7, const int16x8_t y_filter) {
+ int32x4_t sum0;
+ int32x4_t sum1;
+ highbd_convolve8_8_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter, &sum0,
+ &sum1);
+
+ return vcombine_u16(vqrshrun_n_s32(sum0, FILTER_BITS),
+ vqrshrun_n_s32(sum1, FILTER_BITS));
+}
+
+static void highbd_convolve_horiz_neon(const uint16_t *src_ptr,
+ ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride,
+ const int16_t *x_filter_ptr,
+ int x_step_q4, int w, int h, int bd) {
+ assert(w >= 4 && h >= 4);
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+ const int16x8_t x_filter = vld1q_s16(x_filter_ptr);
+
+ if (w == 4) {
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ do {
+ int16x8_t s0, s1, s2, s3;
+ load_s16_8x2(s, src_stride, &s0, &s2);
+ load_s16_8x2(s + 8, src_stride, &s1, &s3);
+
+ uint16x4_t d0 = highbd_convolve8_horiz4_s32_s16(s0, s1, x_filter);
+ uint16x4_t d1 = highbd_convolve8_horiz4_s32_s16(s2, s3, x_filter);
+
+ uint16x8_t d01 = vcombine_u16(d0, d1);
+ d01 = vminq_u16(d01, max);
+
+ vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
+ vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
+
+ s += 2 * src_stride;
+ d += 2 * dst_stride;
+ h -= 2;
+ } while (h > 0);
+ } else {
+ int height = h;
+
+ do {
+ int width = w;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+ int x_q4 = 0;
+
+ const int16_t *src_x = &s[x_q4 >> SUBPEL_BITS];
+ int16x8_t s0, s2, s4, s6;
+ load_s16_8x4(src_x, src_stride, &s0, &s2, &s4, &s6);
+ src_x += 8;
+
+ do {
+ int16x8_t s1, s3, s5, s7;
+ load_s16_8x4(src_x, src_stride, &s1, &s3, &s5, &s7);
+
+ uint16x8_t d0 = highbd_convolve8_horiz8_s32_s16(s0, s1, x_filter);
+ uint16x8_t d1 = highbd_convolve8_horiz8_s32_s16(s2, s3, x_filter);
+ uint16x8_t d2 = highbd_convolve8_horiz8_s32_s16(s4, s5, x_filter);
+ uint16x8_t d3 = highbd_convolve8_horiz8_s32_s16(s6, s7, x_filter);
+
+ d0 = vminq_u16(d0, max);
+ d1 = vminq_u16(d1, max);
+ d2 = vminq_u16(d2, max);
+ d3 = vminq_u16(d3, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s1;
+ s2 = s3;
+ s4 = s5;
+ s6 = s7;
+ src_x += 8;
+ d += 8;
+ width -= 8;
+ x_q4 += 8 * x_step_q4;
+ } while (width > 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * dst_stride;
+ height -= 4;
+ } while (height > 0);
+ }
+}
+
+void aom_highbd_convolve8_horiz_neon(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst8, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h, int bd) {
+ if (x_step_q4 != 16) {
+ aom_highbd_convolve8_horiz_c(src8, src_stride, dst8, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h, bd);
+ } else {
+ (void)filter_y;
+ (void)y_step_q4;
+
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+ src -= SUBPEL_TAPS / 2 - 1;
+ highbd_convolve_horiz_neon(src, src_stride, dst, dst_stride, filter_x,
+ x_step_q4, w, h, bd);
+ }
+}
+
+static void highbd_convolve_vert_neon(const uint16_t *src_ptr,
+ ptrdiff_t src_stride, uint16_t *dst_ptr,
+ ptrdiff_t dst_stride,
+ const int16_t *y_filter_ptr, int w, int h,
+ int bd) {
+ assert(w >= 4 && h >= 4);
+ const int16x8_t y_filter = vld1q_s16(y_filter_ptr);
+ const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);
+
+ if (w == 4) {
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ int16x4_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+ int16x4_t s7, s8, s9, s10;
+ load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x4_t d0 =
+ highbd_convolve8_4_s32_s16(s0, s1, s2, s3, s4, s5, s6, s7, y_filter);
+ uint16x4_t d1 =
+ highbd_convolve8_4_s32_s16(s1, s2, s3, s4, s5, s6, s7, s8, y_filter);
+ uint16x4_t d2 =
+ highbd_convolve8_4_s32_s16(s2, s3, s4, s5, s6, s7, s8, s9, y_filter);
+ uint16x4_t d3 =
+ highbd_convolve8_4_s32_s16(s3, s4, s5, s6, s7, s8, s9, s10, y_filter);
+
+ uint16x8_t d01 = vcombine_u16(d0, d1);
+ uint16x8_t d23 = vcombine_u16(d2, d3);
+
+ d01 = vminq_u16(d01, max);
+ d23 = vminq_u16(d23, max);
+
+ vst1_u16(d + 0 * dst_stride, vget_low_u16(d01));
+ vst1_u16(d + 1 * dst_stride, vget_high_u16(d01));
+ vst1_u16(d + 2 * dst_stride, vget_low_u16(d23));
+ vst1_u16(d + 3 * dst_stride, vget_high_u16(d23));
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ do {
+ int height = h;
+ const int16_t *s = (const int16_t *)src_ptr;
+ uint16_t *d = dst_ptr;
+
+ int16x8_t s0, s1, s2, s3, s4, s5, s6;
+ load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+ s += 7 * src_stride;
+
+ do {
+ int16x8_t s7, s8, s9, s10;
+ load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10);
+
+ uint16x8_t d0 = highbd_convolve8_8_s32_s16(s0, s1, s2, s3, s4, s5, s6,
+ s7, y_filter);
+ uint16x8_t d1 = highbd_convolve8_8_s32_s16(s1, s2, s3, s4, s5, s6, s7,
+ s8, y_filter);
+ uint16x8_t d2 = highbd_convolve8_8_s32_s16(s2, s3, s4, s5, s6, s7, s8,
+ s9, y_filter);
+ uint16x8_t d3 = highbd_convolve8_8_s32_s16(s3, s4, s5, s6, s7, s8, s9,
+ s10, y_filter);
+
+ d0 = vminq_u16(d0, max);
+ d1 = vminq_u16(d1, max);
+ d2 = vminq_u16(d2, max);
+ d3 = vminq_u16(d3, max);
+
+ store_u16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ s += 4 * src_stride;
+ d += 4 * dst_stride;
+ height -= 4;
+ } while (height > 0);
+ src_ptr += 8;
+ dst_ptr += 8;
+ w -= 8;
+ } while (w > 0);
+ }
+}
+
+void aom_highbd_convolve8_vert_neon(const uint8_t *src8, ptrdiff_t src_stride,
+ uint8_t *dst8, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h, int bd) {
+ if (y_step_q4 != 16) {
+ aom_highbd_convolve8_vert_c(src8, src_stride, dst8, dst_stride, filter_x,
+ x_step_q4, filter_y, y_step_q4, w, h, bd);
+ } else {
+ (void)filter_x;
+ (void)x_step_q4;
+
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+
+ src -= (SUBPEL_TAPS / 2 - 1) * src_stride;
+ highbd_convolve_vert_neon(src, src_stride, dst, dst_stride, filter_y, w, h,
+ bd);
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/highbd_hadamard_neon.c b/third_party/aom/aom_dsp/arm/highbd_hadamard_neon.c
new file mode 100644
index 0000000000..d28617c67e
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_hadamard_neon.c
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_ports/mem.h"
+
+static INLINE void hadamard_highbd_col8_first_pass(int16x8_t *a0, int16x8_t *a1,
+ int16x8_t *a2, int16x8_t *a3,
+ int16x8_t *a4, int16x8_t *a5,
+ int16x8_t *a6,
+ int16x8_t *a7) {
+ int16x8_t b0 = vaddq_s16(*a0, *a1);
+ int16x8_t b1 = vsubq_s16(*a0, *a1);
+ int16x8_t b2 = vaddq_s16(*a2, *a3);
+ int16x8_t b3 = vsubq_s16(*a2, *a3);
+ int16x8_t b4 = vaddq_s16(*a4, *a5);
+ int16x8_t b5 = vsubq_s16(*a4, *a5);
+ int16x8_t b6 = vaddq_s16(*a6, *a7);
+ int16x8_t b7 = vsubq_s16(*a6, *a7);
+
+ int16x8_t c0 = vaddq_s16(b0, b2);
+ int16x8_t c2 = vsubq_s16(b0, b2);
+ int16x8_t c1 = vaddq_s16(b1, b3);
+ int16x8_t c3 = vsubq_s16(b1, b3);
+ int16x8_t c4 = vaddq_s16(b4, b6);
+ int16x8_t c6 = vsubq_s16(b4, b6);
+ int16x8_t c5 = vaddq_s16(b5, b7);
+ int16x8_t c7 = vsubq_s16(b5, b7);
+
+ *a0 = vaddq_s16(c0, c4);
+ *a2 = vsubq_s16(c0, c4);
+ *a7 = vaddq_s16(c1, c5);
+ *a6 = vsubq_s16(c1, c5);
+ *a3 = vaddq_s16(c2, c6);
+ *a1 = vsubq_s16(c2, c6);
+ *a4 = vaddq_s16(c3, c7);
+ *a5 = vsubq_s16(c3, c7);
+}
+
+static INLINE void hadamard_highbd_col4_second_pass(int16x4_t a0, int16x4_t a1,
+ int16x4_t a2, int16x4_t a3,
+ int16x4_t a4, int16x4_t a5,
+ int16x4_t a6, int16x4_t a7,
+ tran_low_t *coeff) {
+ int32x4_t b0 = vaddl_s16(a0, a1);
+ int32x4_t b1 = vsubl_s16(a0, a1);
+ int32x4_t b2 = vaddl_s16(a2, a3);
+ int32x4_t b3 = vsubl_s16(a2, a3);
+ int32x4_t b4 = vaddl_s16(a4, a5);
+ int32x4_t b5 = vsubl_s16(a4, a5);
+ int32x4_t b6 = vaddl_s16(a6, a7);
+ int32x4_t b7 = vsubl_s16(a6, a7);
+
+ int32x4_t c0 = vaddq_s32(b0, b2);
+ int32x4_t c2 = vsubq_s32(b0, b2);
+ int32x4_t c1 = vaddq_s32(b1, b3);
+ int32x4_t c3 = vsubq_s32(b1, b3);
+ int32x4_t c4 = vaddq_s32(b4, b6);
+ int32x4_t c6 = vsubq_s32(b4, b6);
+ int32x4_t c5 = vaddq_s32(b5, b7);
+ int32x4_t c7 = vsubq_s32(b5, b7);
+
+ int32x4_t d0 = vaddq_s32(c0, c4);
+ int32x4_t d2 = vsubq_s32(c0, c4);
+ int32x4_t d7 = vaddq_s32(c1, c5);
+ int32x4_t d6 = vsubq_s32(c1, c5);
+ int32x4_t d3 = vaddq_s32(c2, c6);
+ int32x4_t d1 = vsubq_s32(c2, c6);
+ int32x4_t d4 = vaddq_s32(c3, c7);
+ int32x4_t d5 = vsubq_s32(c3, c7);
+
+ vst1q_s32(coeff + 0, d0);
+ vst1q_s32(coeff + 4, d1);
+ vst1q_s32(coeff + 8, d2);
+ vst1q_s32(coeff + 12, d3);
+ vst1q_s32(coeff + 16, d4);
+ vst1q_s32(coeff + 20, d5);
+ vst1q_s32(coeff + 24, d6);
+ vst1q_s32(coeff + 28, d7);
+}
+
+void aom_highbd_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ int16x4_t b0, b1, b2, b3, b4, b5, b6, b7;
+
+ int16x8_t s0 = vld1q_s16(src_diff + 0 * src_stride);
+ int16x8_t s1 = vld1q_s16(src_diff + 1 * src_stride);
+ int16x8_t s2 = vld1q_s16(src_diff + 2 * src_stride);
+ int16x8_t s3 = vld1q_s16(src_diff + 3 * src_stride);
+ int16x8_t s4 = vld1q_s16(src_diff + 4 * src_stride);
+ int16x8_t s5 = vld1q_s16(src_diff + 5 * src_stride);
+ int16x8_t s6 = vld1q_s16(src_diff + 6 * src_stride);
+ int16x8_t s7 = vld1q_s16(src_diff + 7 * src_stride);
+
+ // For the first pass we can stay in 16-bit elements (4095*8 = 32760).
+ hadamard_highbd_col8_first_pass(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+ transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+
+ // For the second pass we need to widen to 32-bit elements, so we're
+ // processing 4 columns at a time.
+ // Skip the second transpose because it is not required.
+
+ b0 = vget_low_s16(s0);
+ b1 = vget_low_s16(s1);
+ b2 = vget_low_s16(s2);
+ b3 = vget_low_s16(s3);
+ b4 = vget_low_s16(s4);
+ b5 = vget_low_s16(s5);
+ b6 = vget_low_s16(s6);
+ b7 = vget_low_s16(s7);
+
+ hadamard_highbd_col4_second_pass(b0, b1, b2, b3, b4, b5, b6, b7, coeff);
+
+ b0 = vget_high_s16(s0);
+ b1 = vget_high_s16(s1);
+ b2 = vget_high_s16(s2);
+ b3 = vget_high_s16(s3);
+ b4 = vget_high_s16(s4);
+ b5 = vget_high_s16(s5);
+ b6 = vget_high_s16(s6);
+ b7 = vget_high_s16(s7);
+
+ hadamard_highbd_col4_second_pass(b0, b1, b2, b3, b4, b5, b6, b7, coeff + 32);
+}
+
+void aom_highbd_hadamard_16x16_neon(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff) {
+ // Rearrange 16x16 to 8x32 and remove stride.
+ // Top left first.
+ aom_highbd_hadamard_8x8_neon(src_diff, src_stride, coeff);
+ // Top right.
+ aom_highbd_hadamard_8x8_neon(src_diff + 8, src_stride, coeff + 64);
+ // Bottom left.
+ aom_highbd_hadamard_8x8_neon(src_diff + 8 * src_stride, src_stride,
+ coeff + 128);
+ // Bottom right.
+ aom_highbd_hadamard_8x8_neon(src_diff + 8 * src_stride + 8, src_stride,
+ coeff + 192);
+
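+  // The cross-block combine below uses halving adds/subs: vhaddq_s32(a0, a1)
+  // computes (a0 + a1) >> 1 per lane, folding the scaling step into the
+  // butterfly (the generic C version is assumed to apply the same >> 1).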
+ for (int i = 0; i < 16; i++) {
+ int32x4_t a0 = vld1q_s32(coeff + 4 * i);
+ int32x4_t a1 = vld1q_s32(coeff + 4 * i + 64);
+ int32x4_t a2 = vld1q_s32(coeff + 4 * i + 128);
+ int32x4_t a3 = vld1q_s32(coeff + 4 * i + 192);
+
+ int32x4_t b0 = vhaddq_s32(a0, a1);
+ int32x4_t b1 = vhsubq_s32(a0, a1);
+ int32x4_t b2 = vhaddq_s32(a2, a3);
+ int32x4_t b3 = vhsubq_s32(a2, a3);
+
+ int32x4_t c0 = vaddq_s32(b0, b2);
+ int32x4_t c1 = vaddq_s32(b1, b3);
+ int32x4_t c2 = vsubq_s32(b0, b2);
+ int32x4_t c3 = vsubq_s32(b1, b3);
+
+ vst1q_s32(coeff + 4 * i, c0);
+ vst1q_s32(coeff + 4 * i + 64, c1);
+ vst1q_s32(coeff + 4 * i + 128, c2);
+ vst1q_s32(coeff + 4 * i + 192, c3);
+ }
+}
+
+void aom_highbd_hadamard_32x32_neon(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff) {
+ // Rearrange 32x32 to 16x64 and remove stride.
+ // Top left first.
+ aom_highbd_hadamard_16x16_neon(src_diff, src_stride, coeff);
+ // Top right.
+ aom_highbd_hadamard_16x16_neon(src_diff + 16, src_stride, coeff + 256);
+ // Bottom left.
+ aom_highbd_hadamard_16x16_neon(src_diff + 16 * src_stride, src_stride,
+ coeff + 512);
+ // Bottom right.
+ aom_highbd_hadamard_16x16_neon(src_diff + 16 * src_stride + 16, src_stride,
+ coeff + 768);
+
+ for (int i = 0; i < 64; i++) {
+ int32x4_t a0 = vld1q_s32(coeff + 4 * i);
+ int32x4_t a1 = vld1q_s32(coeff + 4 * i + 256);
+ int32x4_t a2 = vld1q_s32(coeff + 4 * i + 512);
+ int32x4_t a3 = vld1q_s32(coeff + 4 * i + 768);
+
+ int32x4_t b0 = vshrq_n_s32(vaddq_s32(a0, a1), 2);
+ int32x4_t b1 = vshrq_n_s32(vsubq_s32(a0, a1), 2);
+ int32x4_t b2 = vshrq_n_s32(vaddq_s32(a2, a3), 2);
+ int32x4_t b3 = vshrq_n_s32(vsubq_s32(a2, a3), 2);
+
+ int32x4_t c0 = vaddq_s32(b0, b2);
+ int32x4_t c1 = vaddq_s32(b1, b3);
+ int32x4_t c2 = vsubq_s32(b0, b2);
+ int32x4_t c3 = vsubq_s32(b1, b3);
+
+ vst1q_s32(coeff + 4 * i, c0);
+ vst1q_s32(coeff + 4 * i + 256, c1);
+ vst1q_s32(coeff + 4 * i + 512, c2);
+ vst1q_s32(coeff + 4 * i + 768, c3);
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/highbd_intrapred_neon.c b/third_party/aom/aom_dsp/arm/highbd_intrapred_neon.c
new file mode 100644
index 0000000000..dc47974c68
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_intrapred_neon.c
@@ -0,0 +1,2730 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "config/av1_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_dsp/intrapred_common.h"
+
+// -----------------------------------------------------------------------------
+// DC
+
+static INLINE void highbd_dc_store_4xh(uint16_t *dst, ptrdiff_t stride, int h,
+ uint16x4_t dc) {
+ for (int i = 0; i < h; ++i) {
+ vst1_u16(dst + i * stride, dc);
+ }
+}
+
+static INLINE void highbd_dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int h,
+ uint16x8_t dc) {
+ for (int i = 0; i < h; ++i) {
+ vst1q_u16(dst + i * stride, dc);
+ }
+}
+
+static INLINE void highbd_dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int h,
+ uint16x8_t dc) {
+ for (int i = 0; i < h; ++i) {
+ vst1q_u16(dst + i * stride, dc);
+ vst1q_u16(dst + i * stride + 8, dc);
+ }
+}
+
+static INLINE void highbd_dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int h,
+ uint16x8_t dc) {
+ for (int i = 0; i < h; ++i) {
+ vst1q_u16(dst + i * stride, dc);
+ vst1q_u16(dst + i * stride + 8, dc);
+ vst1q_u16(dst + i * stride + 16, dc);
+ vst1q_u16(dst + i * stride + 24, dc);
+ }
+}
+
+static INLINE void highbd_dc_store_64xh(uint16_t *dst, ptrdiff_t stride, int h,
+ uint16x8_t dc) {
+ for (int i = 0; i < h; ++i) {
+ vst1q_u16(dst + i * stride, dc);
+ vst1q_u16(dst + i * stride + 8, dc);
+ vst1q_u16(dst + i * stride + 16, dc);
+ vst1q_u16(dst + i * stride + 24, dc);
+ vst1q_u16(dst + i * stride + 32, dc);
+ vst1q_u16(dst + i * stride + 40, dc);
+ vst1q_u16(dst + i * stride + 48, dc);
+ vst1q_u16(dst + i * stride + 56, dc);
+ }
+}
+
+static INLINE uint32x4_t horizontal_add_and_broadcast_long_u16x8(uint16x8_t a) {
+  // The input may be up to 16 bits wide (from the dc 64x64 partial sum), so
+  // promote to 32 bits before summing across the vector.
+ const uint32x4_t b = vpaddlq_u16(a);
+#if AOM_ARCH_AARCH64
+ const uint32x4_t c = vpaddq_u32(b, b);
+ return vpaddq_u32(c, c);
+#else
+ const uint32x2_t c = vadd_u32(vget_low_u32(b), vget_high_u32(b));
+ const uint32x2_t d = vpadd_u32(c, c);
+ return vcombine_u32(d, d);
+#endif
+}
+
+static INLINE uint16x8_t highbd_dc_load_partial_sum_4(const uint16_t *left) {
+  // Nothing to do since the sum is already one vector, but this saves having
+  // to special-case w == 4 or h == 4. The combine will be zero cost for a sane
+ // compiler since vld1 already sets the top half of a vector to zero as part
+ // of the operation.
+ return vcombine_u16(vld1_u16(left), vdup_n_u16(0));
+}
+
+static INLINE uint16x8_t highbd_dc_load_partial_sum_8(const uint16_t *left) {
+  // Nothing to do since the sum is already one vector, but this saves having
+  // to special-case w == 8 or h == 8.
+ return vld1q_u16(left);
+}
+
+static INLINE uint16x8_t highbd_dc_load_partial_sum_16(const uint16_t *left) {
+ const uint16x8_t a0 = vld1q_u16(left + 0); // up to 12 bits
+ const uint16x8_t a1 = vld1q_u16(left + 8);
+ return vaddq_u16(a0, a1); // up to 13 bits
+}
+
+static INLINE uint16x8_t highbd_dc_load_partial_sum_32(const uint16_t *left) {
+ const uint16x8_t a0 = vld1q_u16(left + 0); // up to 12 bits
+ const uint16x8_t a1 = vld1q_u16(left + 8);
+ const uint16x8_t a2 = vld1q_u16(left + 16);
+ const uint16x8_t a3 = vld1q_u16(left + 24);
+ const uint16x8_t b0 = vaddq_u16(a0, a1); // up to 13 bits
+ const uint16x8_t b1 = vaddq_u16(a2, a3);
+ return vaddq_u16(b0, b1); // up to 14 bits
+}
+
+static INLINE uint16x8_t highbd_dc_load_partial_sum_64(const uint16_t *left) {
+ const uint16x8_t a0 = vld1q_u16(left + 0); // up to 12 bits
+ const uint16x8_t a1 = vld1q_u16(left + 8);
+ const uint16x8_t a2 = vld1q_u16(left + 16);
+ const uint16x8_t a3 = vld1q_u16(left + 24);
+ const uint16x8_t a4 = vld1q_u16(left + 32);
+ const uint16x8_t a5 = vld1q_u16(left + 40);
+ const uint16x8_t a6 = vld1q_u16(left + 48);
+ const uint16x8_t a7 = vld1q_u16(left + 56);
+ const uint16x8_t b0 = vaddq_u16(a0, a1); // up to 13 bits
+ const uint16x8_t b1 = vaddq_u16(a2, a3);
+ const uint16x8_t b2 = vaddq_u16(a4, a5);
+ const uint16x8_t b3 = vaddq_u16(a6, a7);
+ const uint16x8_t c0 = vaddq_u16(b0, b1); // up to 14 bits
+ const uint16x8_t c1 = vaddq_u16(b2, b3);
+ return vaddq_u16(c0, c1); // up to 15 bits
+}
+
+#define HIGHBD_DC_PREDICTOR(w, h, shift) \
+ void aom_highbd_dc_predictor_##w##x##h##_neon( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)bd; \
+ const uint16x8_t a = highbd_dc_load_partial_sum_##w(above); \
+ const uint16x8_t l = highbd_dc_load_partial_sum_##h(left); \
+ const uint32x4_t sum = \
+ horizontal_add_and_broadcast_long_u16x8(vaddq_u16(a, l)); \
+ const uint16x4_t dc0 = vrshrn_n_u32(sum, shift); \
+ highbd_dc_store_##w##xh(dst, stride, (h), vdupq_lane_u16(dc0, 0)); \
+ }
+
+void aom_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+  // In the rectangular cases we simply extend the shorter vector to uint16x8
+  // in order to accumulate. In the 4x4 case, however, there is no shorter
+  // vector to extend, so it is beneficial to do the whole calculation in
+  // uint16x4 instead.
+ (void)bd;
+ const uint16x4_t a = vld1_u16(above); // up to 12 bits
+ const uint16x4_t l = vld1_u16(left);
+ uint16x4_t sum = vpadd_u16(a, l); // up to 13 bits
+ sum = vpadd_u16(sum, sum); // up to 14 bits
+ sum = vpadd_u16(sum, sum);
+ const uint16x4_t dc = vrshr_n_u16(sum, 3);
+ highbd_dc_store_4xh(dst, stride, 4, dc);
+}
+
+HIGHBD_DC_PREDICTOR(8, 8, 4)
+HIGHBD_DC_PREDICTOR(16, 16, 5)
+HIGHBD_DC_PREDICTOR(32, 32, 6)
+HIGHBD_DC_PREDICTOR(64, 64, 7)
+
+#undef HIGHBD_DC_PREDICTOR
+
+static INLINE int divide_using_multiply_shift(int num, int shift1,
+ int multiplier, int shift2) {
+ const int interm = num >> shift1;
+ return interm * multiplier >> shift2;
+}
+
+#define HIGHBD_DC_MULTIPLIER_1X2 0xAAAB
+#define HIGHBD_DC_MULTIPLIER_1X4 0x6667
+#define HIGHBD_DC_SHIFT2 17
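+
+// 0xAAAB / 2^17 is approximately 1/3 and 0x6667 / 2^17 approximately 1/5, so,
+// for example, a 4x8 block (12 edge pixels) computes
+// ((sum + 6) >> 2) * 0xAAAB >> 17, which is effectively (sum + 6) / 12 over
+// the valid range of sums, i.e. a rounded division without an integer divide.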
+
+static INLINE int highbd_dc_predictor_rect(int bw, int bh, int sum, int shift1,
+ uint32_t multiplier) {
+ return divide_using_multiply_shift(sum + ((bw + bh) >> 1), shift1, multiplier,
+ HIGHBD_DC_SHIFT2);
+}
+
+#undef HIGHBD_DC_SHIFT2
+
+#define HIGHBD_DC_PREDICTOR_RECT(w, h, q, shift, mult) \
+ void aom_highbd_dc_predictor_##w##x##h##_neon( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)bd; \
+ uint16x8_t sum_above = highbd_dc_load_partial_sum_##w(above); \
+ uint16x8_t sum_left = highbd_dc_load_partial_sum_##h(left); \
+ uint16x8_t sum_vec = vaddq_u16(sum_left, sum_above); \
+ int sum = horizontal_add_u16x8(sum_vec); \
+ int dc0 = highbd_dc_predictor_rect((w), (h), sum, (shift), (mult)); \
+ highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_n_u16(dc0)); \
+ }
+
+HIGHBD_DC_PREDICTOR_RECT(4, 8, , 2, HIGHBD_DC_MULTIPLIER_1X2)
+HIGHBD_DC_PREDICTOR_RECT(4, 16, , 2, HIGHBD_DC_MULTIPLIER_1X4)
+HIGHBD_DC_PREDICTOR_RECT(8, 4, q, 2, HIGHBD_DC_MULTIPLIER_1X2)
+HIGHBD_DC_PREDICTOR_RECT(8, 16, q, 3, HIGHBD_DC_MULTIPLIER_1X2)
+HIGHBD_DC_PREDICTOR_RECT(8, 32, q, 3, HIGHBD_DC_MULTIPLIER_1X4)
+HIGHBD_DC_PREDICTOR_RECT(16, 4, q, 2, HIGHBD_DC_MULTIPLIER_1X4)
+HIGHBD_DC_PREDICTOR_RECT(16, 8, q, 3, HIGHBD_DC_MULTIPLIER_1X2)
+HIGHBD_DC_PREDICTOR_RECT(16, 32, q, 4, HIGHBD_DC_MULTIPLIER_1X2)
+HIGHBD_DC_PREDICTOR_RECT(16, 64, q, 4, HIGHBD_DC_MULTIPLIER_1X4)
+HIGHBD_DC_PREDICTOR_RECT(32, 8, q, 3, HIGHBD_DC_MULTIPLIER_1X4)
+HIGHBD_DC_PREDICTOR_RECT(32, 16, q, 4, HIGHBD_DC_MULTIPLIER_1X2)
+HIGHBD_DC_PREDICTOR_RECT(32, 64, q, 5, HIGHBD_DC_MULTIPLIER_1X2)
+HIGHBD_DC_PREDICTOR_RECT(64, 16, q, 4, HIGHBD_DC_MULTIPLIER_1X4)
+HIGHBD_DC_PREDICTOR_RECT(64, 32, q, 5, HIGHBD_DC_MULTIPLIER_1X2)
+
+#undef HIGHBD_DC_PREDICTOR_RECT
+#undef HIGHBD_DC_MULTIPLIER_1X2
+#undef HIGHBD_DC_MULTIPLIER_1X4
+
+// -----------------------------------------------------------------------------
+// DC_128
+
+#define HIGHBD_DC_PREDICTOR_128(w, h, q) \
+ void aom_highbd_dc_128_predictor_##w##x##h##_neon( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)above; \
+ (void)bd; \
+ (void)left; \
+ highbd_dc_store_##w##xh(dst, stride, (h), \
+ vdup##q##_n_u16(0x80 << (bd - 8))); \
+ }
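+
+// 0x80 << (bd - 8) is the mid-grey value for the bit depth: 128 for 8-bit,
+// 512 for 10-bit and 2048 for 12-bit input.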
+
+HIGHBD_DC_PREDICTOR_128(4, 4, )
+HIGHBD_DC_PREDICTOR_128(4, 8, )
+HIGHBD_DC_PREDICTOR_128(4, 16, )
+HIGHBD_DC_PREDICTOR_128(8, 4, q)
+HIGHBD_DC_PREDICTOR_128(8, 8, q)
+HIGHBD_DC_PREDICTOR_128(8, 16, q)
+HIGHBD_DC_PREDICTOR_128(8, 32, q)
+HIGHBD_DC_PREDICTOR_128(16, 4, q)
+HIGHBD_DC_PREDICTOR_128(16, 8, q)
+HIGHBD_DC_PREDICTOR_128(16, 16, q)
+HIGHBD_DC_PREDICTOR_128(16, 32, q)
+HIGHBD_DC_PREDICTOR_128(16, 64, q)
+HIGHBD_DC_PREDICTOR_128(32, 8, q)
+HIGHBD_DC_PREDICTOR_128(32, 16, q)
+HIGHBD_DC_PREDICTOR_128(32, 32, q)
+HIGHBD_DC_PREDICTOR_128(32, 64, q)
+HIGHBD_DC_PREDICTOR_128(64, 16, q)
+HIGHBD_DC_PREDICTOR_128(64, 32, q)
+HIGHBD_DC_PREDICTOR_128(64, 64, q)
+
+#undef HIGHBD_DC_PREDICTOR_128
+
+// -----------------------------------------------------------------------------
+// DC_LEFT
+
+static INLINE uint32x4_t highbd_dc_load_sum_4(const uint16_t *left) {
+ const uint16x4_t a = vld1_u16(left); // up to 12 bits
+ const uint16x4_t b = vpadd_u16(a, a); // up to 13 bits
+ return vcombine_u32(vpaddl_u16(b), vdup_n_u32(0));
+}
+
+static INLINE uint32x4_t highbd_dc_load_sum_8(const uint16_t *left) {
+ return horizontal_add_and_broadcast_long_u16x8(vld1q_u16(left));
+}
+
+static INLINE uint32x4_t highbd_dc_load_sum_16(const uint16_t *left) {
+ return horizontal_add_and_broadcast_long_u16x8(
+ highbd_dc_load_partial_sum_16(left));
+}
+
+static INLINE uint32x4_t highbd_dc_load_sum_32(const uint16_t *left) {
+ return horizontal_add_and_broadcast_long_u16x8(
+ highbd_dc_load_partial_sum_32(left));
+}
+
+static INLINE uint32x4_t highbd_dc_load_sum_64(const uint16_t *left) {
+ return horizontal_add_and_broadcast_long_u16x8(
+ highbd_dc_load_partial_sum_64(left));
+}
+
+#define DC_PREDICTOR_LEFT(w, h, shift, q) \
+ void aom_highbd_dc_left_predictor_##w##x##h##_neon( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)above; \
+ (void)bd; \
+ const uint32x4_t sum = highbd_dc_load_sum_##h(left); \
+ const uint16x4_t dc0 = vrshrn_n_u32(sum, (shift)); \
+ highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u16(dc0, 0)); \
+ }
+
+DC_PREDICTOR_LEFT(4, 4, 2, )
+DC_PREDICTOR_LEFT(4, 8, 3, )
+DC_PREDICTOR_LEFT(4, 16, 4, )
+DC_PREDICTOR_LEFT(8, 4, 2, q)
+DC_PREDICTOR_LEFT(8, 8, 3, q)
+DC_PREDICTOR_LEFT(8, 16, 4, q)
+DC_PREDICTOR_LEFT(8, 32, 5, q)
+DC_PREDICTOR_LEFT(16, 4, 2, q)
+DC_PREDICTOR_LEFT(16, 8, 3, q)
+DC_PREDICTOR_LEFT(16, 16, 4, q)
+DC_PREDICTOR_LEFT(16, 32, 5, q)
+DC_PREDICTOR_LEFT(16, 64, 6, q)
+DC_PREDICTOR_LEFT(32, 8, 3, q)
+DC_PREDICTOR_LEFT(32, 16, 4, q)
+DC_PREDICTOR_LEFT(32, 32, 5, q)
+DC_PREDICTOR_LEFT(32, 64, 6, q)
+DC_PREDICTOR_LEFT(64, 16, 4, q)
+DC_PREDICTOR_LEFT(64, 32, 5, q)
+DC_PREDICTOR_LEFT(64, 64, 6, q)
+
+#undef DC_PREDICTOR_LEFT
+
+// -----------------------------------------------------------------------------
+// DC_TOP
+
+#define DC_PREDICTOR_TOP(w, h, shift, q) \
+ void aom_highbd_dc_top_predictor_##w##x##h##_neon( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)bd; \
+ (void)left; \
+ const uint32x4_t sum = highbd_dc_load_sum_##w(above); \
+ const uint16x4_t dc0 = vrshrn_n_u32(sum, (shift)); \
+ highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u16(dc0, 0)); \
+ }
+
+DC_PREDICTOR_TOP(4, 4, 2, )
+DC_PREDICTOR_TOP(4, 8, 2, )
+DC_PREDICTOR_TOP(4, 16, 2, )
+DC_PREDICTOR_TOP(8, 4, 3, q)
+DC_PREDICTOR_TOP(8, 8, 3, q)
+DC_PREDICTOR_TOP(8, 16, 3, q)
+DC_PREDICTOR_TOP(8, 32, 3, q)
+DC_PREDICTOR_TOP(16, 4, 4, q)
+DC_PREDICTOR_TOP(16, 8, 4, q)
+DC_PREDICTOR_TOP(16, 16, 4, q)
+DC_PREDICTOR_TOP(16, 32, 4, q)
+DC_PREDICTOR_TOP(16, 64, 4, q)
+DC_PREDICTOR_TOP(32, 8, 5, q)
+DC_PREDICTOR_TOP(32, 16, 5, q)
+DC_PREDICTOR_TOP(32, 32, 5, q)
+DC_PREDICTOR_TOP(32, 64, 5, q)
+DC_PREDICTOR_TOP(64, 16, 6, q)
+DC_PREDICTOR_TOP(64, 32, 6, q)
+DC_PREDICTOR_TOP(64, 64, 6, q)
+
+#undef DC_PREDICTOR_TOP
+
+// -----------------------------------------------------------------------------
+// V_PRED
+
+#define HIGHBD_V_NXM(W, H) \
+ void aom_highbd_v_predictor_##W##x##H##_neon( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)left; \
+ (void)bd; \
+ vertical##W##xh_neon(dst, stride, above, H); \
+ }
+
+static INLINE uint16x8x2_t load_uint16x8x2(uint16_t const *ptr) {
+ uint16x8x2_t x;
+  // Clang and GCC combine this pair of loads into an ldp instruction.
+ x.val[0] = vld1q_u16(ptr);
+ x.val[1] = vld1q_u16(ptr + 8);
+ return x;
+}
+
+static INLINE void store_uint16x8x2(uint16_t *ptr, uint16x8x2_t x) {
+ vst1q_u16(ptr, x.val[0]);
+ vst1q_u16(ptr + 8, x.val[1]);
+}
+
+static INLINE void vertical4xh_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *const above, int height) {
+ const uint16x4_t row = vld1_u16(above);
+ int y = height;
+ do {
+ vst1_u16(dst, row);
+ vst1_u16(dst + stride, row);
+ dst += stride << 1;
+ y -= 2;
+ } while (y != 0);
+}
+
+static INLINE void vertical8xh_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *const above, int height) {
+ const uint16x8_t row = vld1q_u16(above);
+ int y = height;
+ do {
+ vst1q_u16(dst, row);
+ vst1q_u16(dst + stride, row);
+ dst += stride << 1;
+ y -= 2;
+ } while (y != 0);
+}
+
+static INLINE void vertical16xh_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *const above, int height) {
+ const uint16x8x2_t row = load_uint16x8x2(above);
+ int y = height;
+ do {
+ store_uint16x8x2(dst, row);
+ store_uint16x8x2(dst + stride, row);
+ dst += stride << 1;
+ y -= 2;
+ } while (y != 0);
+}
+
+static INLINE uint16x8x4_t load_uint16x8x4(uint16_t const *ptr) {
+ uint16x8x4_t x;
+  // Clang and GCC combine this pair of loads into an ldp instruction.
+ x.val[0] = vld1q_u16(ptr);
+ x.val[1] = vld1q_u16(ptr + 8);
+ x.val[2] = vld1q_u16(ptr + 16);
+ x.val[3] = vld1q_u16(ptr + 24);
+ return x;
+}
+
+static INLINE void store_uint16x8x4(uint16_t *ptr, uint16x8x4_t x) {
+ vst1q_u16(ptr, x.val[0]);
+ vst1q_u16(ptr + 8, x.val[1]);
+ vst1q_u16(ptr + 16, x.val[2]);
+ vst1q_u16(ptr + 24, x.val[3]);
+}
+
+static INLINE void vertical32xh_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *const above, int height) {
+ const uint16x8x4_t row = load_uint16x8x4(above);
+ int y = height;
+ do {
+ store_uint16x8x4(dst, row);
+ store_uint16x8x4(dst + stride, row);
+ dst += stride << 1;
+ y -= 2;
+ } while (y != 0);
+}
+
+static INLINE void vertical64xh_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *const above, int height) {
+ uint16_t *dst32 = dst + 32;
+ const uint16x8x4_t row = load_uint16x8x4(above);
+ const uint16x8x4_t row32 = load_uint16x8x4(above + 32);
+ int y = height;
+ do {
+ store_uint16x8x4(dst, row);
+ store_uint16x8x4(dst32, row32);
+ store_uint16x8x4(dst + stride, row);
+ store_uint16x8x4(dst32 + stride, row32);
+ dst += stride << 1;
+ dst32 += stride << 1;
+ y -= 2;
+ } while (y != 0);
+}
+
+HIGHBD_V_NXM(4, 4)
+HIGHBD_V_NXM(4, 8)
+HIGHBD_V_NXM(4, 16)
+
+HIGHBD_V_NXM(8, 4)
+HIGHBD_V_NXM(8, 8)
+HIGHBD_V_NXM(8, 16)
+HIGHBD_V_NXM(8, 32)
+
+HIGHBD_V_NXM(16, 4)
+HIGHBD_V_NXM(16, 8)
+HIGHBD_V_NXM(16, 16)
+HIGHBD_V_NXM(16, 32)
+HIGHBD_V_NXM(16, 64)
+
+HIGHBD_V_NXM(32, 8)
+HIGHBD_V_NXM(32, 16)
+HIGHBD_V_NXM(32, 32)
+HIGHBD_V_NXM(32, 64)
+
+HIGHBD_V_NXM(64, 16)
+HIGHBD_V_NXM(64, 32)
+HIGHBD_V_NXM(64, 64)
+
+// -----------------------------------------------------------------------------
+// H_PRED
+
+static INLINE void highbd_h_store_4x4(uint16_t *dst, ptrdiff_t stride,
+ uint16x4_t left) {
+ vst1_u16(dst + 0 * stride, vdup_lane_u16(left, 0));
+ vst1_u16(dst + 1 * stride, vdup_lane_u16(left, 1));
+ vst1_u16(dst + 2 * stride, vdup_lane_u16(left, 2));
+ vst1_u16(dst + 3 * stride, vdup_lane_u16(left, 3));
+}
+
+static INLINE void highbd_h_store_8x4(uint16_t *dst, ptrdiff_t stride,
+ uint16x4_t left) {
+ vst1q_u16(dst + 0 * stride, vdupq_lane_u16(left, 0));
+ vst1q_u16(dst + 1 * stride, vdupq_lane_u16(left, 1));
+ vst1q_u16(dst + 2 * stride, vdupq_lane_u16(left, 2));
+ vst1q_u16(dst + 3 * stride, vdupq_lane_u16(left, 3));
+}
+
+static INLINE void highbd_h_store_16x1(uint16_t *dst, uint16x8_t left) {
+ vst1q_u16(dst + 0, left);
+ vst1q_u16(dst + 8, left);
+}
+
+static INLINE void highbd_h_store_16x4(uint16_t *dst, ptrdiff_t stride,
+ uint16x4_t left) {
+ highbd_h_store_16x1(dst + 0 * stride, vdupq_lane_u16(left, 0));
+ highbd_h_store_16x1(dst + 1 * stride, vdupq_lane_u16(left, 1));
+ highbd_h_store_16x1(dst + 2 * stride, vdupq_lane_u16(left, 2));
+ highbd_h_store_16x1(dst + 3 * stride, vdupq_lane_u16(left, 3));
+}
+
+static INLINE void highbd_h_store_32x1(uint16_t *dst, uint16x8_t left) {
+ vst1q_u16(dst + 0, left);
+ vst1q_u16(dst + 8, left);
+ vst1q_u16(dst + 16, left);
+ vst1q_u16(dst + 24, left);
+}
+
+static INLINE void highbd_h_store_32x4(uint16_t *dst, ptrdiff_t stride,
+ uint16x4_t left) {
+ highbd_h_store_32x1(dst + 0 * stride, vdupq_lane_u16(left, 0));
+ highbd_h_store_32x1(dst + 1 * stride, vdupq_lane_u16(left, 1));
+ highbd_h_store_32x1(dst + 2 * stride, vdupq_lane_u16(left, 2));
+ highbd_h_store_32x1(dst + 3 * stride, vdupq_lane_u16(left, 3));
+}
+
+static INLINE void highbd_h_store_64x1(uint16_t *dst, uint16x8_t left) {
+ vst1q_u16(dst + 0, left);
+ vst1q_u16(dst + 8, left);
+ vst1q_u16(dst + 16, left);
+ vst1q_u16(dst + 24, left);
+ vst1q_u16(dst + 32, left);
+ vst1q_u16(dst + 40, left);
+ vst1q_u16(dst + 48, left);
+ vst1q_u16(dst + 56, left);
+}
+
+static INLINE void highbd_h_store_64x4(uint16_t *dst, ptrdiff_t stride,
+ uint16x4_t left) {
+ highbd_h_store_64x1(dst + 0 * stride, vdupq_lane_u16(left, 0));
+ highbd_h_store_64x1(dst + 1 * stride, vdupq_lane_u16(left, 1));
+ highbd_h_store_64x1(dst + 2 * stride, vdupq_lane_u16(left, 2));
+ highbd_h_store_64x1(dst + 3 * stride, vdupq_lane_u16(left, 3));
+}
+
+void aom_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)above;
+ (void)bd;
+ highbd_h_store_4x4(dst, stride, vld1_u16(left));
+}
+
+void aom_highbd_h_predictor_4x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)above;
+ (void)bd;
+ uint16x8_t l = vld1q_u16(left);
+ highbd_h_store_4x4(dst + 0 * stride, stride, vget_low_u16(l));
+ highbd_h_store_4x4(dst + 4 * stride, stride, vget_high_u16(l));
+}
+
+void aom_highbd_h_predictor_8x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)above;
+ (void)bd;
+ highbd_h_store_8x4(dst, stride, vld1_u16(left));
+}
+
+void aom_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)above;
+ (void)bd;
+ uint16x8_t l = vld1q_u16(left);
+ highbd_h_store_8x4(dst + 0 * stride, stride, vget_low_u16(l));
+ highbd_h_store_8x4(dst + 4 * stride, stride, vget_high_u16(l));
+}
+
+void aom_highbd_h_predictor_16x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)above;
+ (void)bd;
+ highbd_h_store_16x4(dst, stride, vld1_u16(left));
+}
+
+void aom_highbd_h_predictor_16x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)above;
+ (void)bd;
+ uint16x8_t l = vld1q_u16(left);
+ highbd_h_store_16x4(dst + 0 * stride, stride, vget_low_u16(l));
+ highbd_h_store_16x4(dst + 4 * stride, stride, vget_high_u16(l));
+}
+
+void aom_highbd_h_predictor_32x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)above;
+ (void)bd;
+ uint16x8_t l = vld1q_u16(left);
+ highbd_h_store_32x4(dst + 0 * stride, stride, vget_low_u16(l));
+ highbd_h_store_32x4(dst + 4 * stride, stride, vget_high_u16(l));
+}
+
+// For cases where height >= 16, we use pairs of loads so that the compiler
+// can combine them into LDP (load pair) instructions.
+#define HIGHBD_H_WXH_LARGE(w, h) \
+ void aom_highbd_h_predictor_##w##x##h##_neon( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)above; \
+ (void)bd; \
+ for (int i = 0; i < (h) / 16; ++i) { \
+ uint16x8_t l0 = vld1q_u16(left + 0); \
+ uint16x8_t l1 = vld1q_u16(left + 8); \
+ highbd_h_store_##w##x4(dst + 0 * stride, stride, vget_low_u16(l0)); \
+ highbd_h_store_##w##x4(dst + 4 * stride, stride, vget_high_u16(l0)); \
+ highbd_h_store_##w##x4(dst + 8 * stride, stride, vget_low_u16(l1)); \
+ highbd_h_store_##w##x4(dst + 12 * stride, stride, vget_high_u16(l1)); \
+ left += 16; \
+ dst += 16 * stride; \
+ } \
+ }
+
+HIGHBD_H_WXH_LARGE(4, 16)
+HIGHBD_H_WXH_LARGE(8, 16)
+HIGHBD_H_WXH_LARGE(8, 32)
+HIGHBD_H_WXH_LARGE(16, 16)
+HIGHBD_H_WXH_LARGE(16, 32)
+HIGHBD_H_WXH_LARGE(16, 64)
+HIGHBD_H_WXH_LARGE(32, 16)
+HIGHBD_H_WXH_LARGE(32, 32)
+HIGHBD_H_WXH_LARGE(32, 64)
+HIGHBD_H_WXH_LARGE(64, 16)
+HIGHBD_H_WXH_LARGE(64, 32)
+HIGHBD_H_WXH_LARGE(64, 64)
+
+#undef HIGHBD_H_WXH_LARGE
+
+// -----------------------------------------------------------------------------
+// PAETH
+
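+// The Paeth predictor picks, for each pixel, whichever of left, top and
+// top_left is closest to base = left + top - top_left. The *_dist values
+// below are that comparison rewritten so that base is never computed:
+//   left_dist     = |top - top_left|          == |base - left|
+//   top_dist      = |left - top_left|         == |base - top|
+//   top_left_dist = |top + left - 2*top_left| == |base - top_left|
+// Ties are broken in favour of left, then top.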
+static INLINE void highbd_paeth_4or8_x_h_neon(uint16_t *dest, ptrdiff_t stride,
+ const uint16_t *const top_row,
+ const uint16_t *const left_column,
+ int width, int height) {
+ const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
+ const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
+ uint16x8_t top;
+ if (width == 4) {
+ top = vcombine_u16(vld1_u16(top_row), vdup_n_u16(0));
+ } else { // width == 8
+ top = vld1q_u16(top_row);
+ }
+
+ for (int y = 0; y < height; ++y) {
+ const uint16x8_t left = vdupq_n_u16(left_column[y]);
+
+ const uint16x8_t left_dist = vabdq_u16(top, top_left);
+ const uint16x8_t top_dist = vabdq_u16(left, top_left);
+ const uint16x8_t top_left_dist =
+ vabdq_u16(vaddq_u16(top, left), top_left_x2);
+
+ const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist);
+ const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist);
+ const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist);
+
+ // if (left_dist <= top_dist && left_dist <= top_left_dist)
+ const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
+ // dest[x] = left_column[y];
+ // Fill all the unused spaces with 'top'. They will be overwritten when
+ // the positions for top_left are known.
+ uint16x8_t result = vbslq_u16(left_mask, left, top);
+ // else if (top_dist <= top_left_dist)
+ // dest[x] = top_row[x];
+ // Add these values to the mask. They were already set.
+ const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
+ // else
+ // dest[x] = top_left;
+ result = vbslq_u16(left_or_top_mask, result, top_left);
+
+ if (width == 4) {
+ vst1_u16(dest, vget_low_u16(result));
+ } else { // width == 8
+ vst1q_u16(dest, result);
+ }
+ dest += stride;
+ }
+}
+
+#define HIGHBD_PAETH_NXM(W, H) \
+ void aom_highbd_paeth_predictor_##W##x##H##_neon( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)bd; \
+ highbd_paeth_4or8_x_h_neon(dst, stride, above, left, W, H); \
+ }
+
+HIGHBD_PAETH_NXM(4, 4)
+HIGHBD_PAETH_NXM(4, 8)
+HIGHBD_PAETH_NXM(4, 16)
+HIGHBD_PAETH_NXM(8, 4)
+HIGHBD_PAETH_NXM(8, 8)
+HIGHBD_PAETH_NXM(8, 16)
+HIGHBD_PAETH_NXM(8, 32)
+
+// Select the closest values and collect them.
+static INLINE uint16x8_t select_paeth(const uint16x8_t top,
+ const uint16x8_t left,
+ const uint16x8_t top_left,
+ const uint16x8_t left_le_top,
+ const uint16x8_t left_le_top_left,
+ const uint16x8_t top_le_top_left) {
+ // if (left_dist <= top_dist && left_dist <= top_left_dist)
+ const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
+ // dest[x] = left_column[y];
+ // Fill all the unused spaces with 'top'. They will be overwritten when
+ // the positions for top_left are known.
+ const uint16x8_t result = vbslq_u16(left_mask, left, top);
+ // else if (top_dist <= top_left_dist)
+ // dest[x] = top_row[x];
+ // Add these values to the mask. They were already set.
+ const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
+ // else
+ // dest[x] = top_left;
+ return vbslq_u16(left_or_top_mask, result, top_left);
+}
+
+#define PAETH_PREDICTOR(num) \
+ do { \
+ const uint16x8_t left_dist = vabdq_u16(top[num], top_left); \
+ const uint16x8_t top_left_dist = \
+ vabdq_u16(vaddq_u16(top[num], left), top_left_x2); \
+ const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist); \
+ const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist); \
+ const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist); \
+ const uint16x8_t result = \
+ select_paeth(top[num], left, top_left, left_le_top, left_le_top_left, \
+ top_le_top_left); \
+ vst1q_u16(dest + (num * 8), result); \
+ } while (0)
+
+#define LOAD_TOP_ROW(num) vld1q_u16(top_row + (num * 8))
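+// Both helper macros above expand inside highbd_paeth16_plus_x_h_neon below
+// and rely on `dest`, `top`, `top_row`, `left`, `top_left`, `top_left_x2` and
+// `top_dist` being in scope at the expansion site.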
+
+static INLINE void highbd_paeth16_plus_x_h_neon(
+ uint16_t *dest, ptrdiff_t stride, const uint16_t *const top_row,
+ const uint16_t *const left_column, int width, int height) {
+ const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
+ const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
+ uint16x8_t top[8];
+ top[0] = LOAD_TOP_ROW(0);
+ top[1] = LOAD_TOP_ROW(1);
+ if (width > 16) {
+ top[2] = LOAD_TOP_ROW(2);
+ top[3] = LOAD_TOP_ROW(3);
+ if (width == 64) {
+ top[4] = LOAD_TOP_ROW(4);
+ top[5] = LOAD_TOP_ROW(5);
+ top[6] = LOAD_TOP_ROW(6);
+ top[7] = LOAD_TOP_ROW(7);
+ }
+ }
+
+ for (int y = 0; y < height; ++y) {
+ const uint16x8_t left = vdupq_n_u16(left_column[y]);
+ const uint16x8_t top_dist = vabdq_u16(left, top_left);
+ PAETH_PREDICTOR(0);
+ PAETH_PREDICTOR(1);
+ if (width > 16) {
+ PAETH_PREDICTOR(2);
+ PAETH_PREDICTOR(3);
+ if (width == 64) {
+ PAETH_PREDICTOR(4);
+ PAETH_PREDICTOR(5);
+ PAETH_PREDICTOR(6);
+ PAETH_PREDICTOR(7);
+ }
+ }
+ dest += stride;
+ }
+}
+
+#define HIGHBD_PAETH_NXM_WIDE(W, H) \
+ void aom_highbd_paeth_predictor_##W##x##H##_neon( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)bd; \
+ highbd_paeth16_plus_x_h_neon(dst, stride, above, left, W, H); \
+ }
+
+HIGHBD_PAETH_NXM_WIDE(16, 4)
+HIGHBD_PAETH_NXM_WIDE(16, 8)
+HIGHBD_PAETH_NXM_WIDE(16, 16)
+HIGHBD_PAETH_NXM_WIDE(16, 32)
+HIGHBD_PAETH_NXM_WIDE(16, 64)
+HIGHBD_PAETH_NXM_WIDE(32, 8)
+HIGHBD_PAETH_NXM_WIDE(32, 16)
+HIGHBD_PAETH_NXM_WIDE(32, 32)
+HIGHBD_PAETH_NXM_WIDE(32, 64)
+HIGHBD_PAETH_NXM_WIDE(64, 16)
+HIGHBD_PAETH_NXM_WIDE(64, 32)
+HIGHBD_PAETH_NXM_WIDE(64, 64)
+
+// -----------------------------------------------------------------------------
+// SMOOTH
+
+// Computes 256 - v in every lane by negating the 16-bit lanes byte-wise: the
+// low byte becomes (256 - v) & 0xff and the zero high byte is unchanged, which
+// is exact for values in [1, 255] (the range of the smooth weights).
+static INLINE uint16x4_t negate_s8(const uint16x4_t v) {
+ return vreinterpret_u16_s8(vneg_s8(vreinterpret_s8_u16(v)));
+}
+
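+// Every SMOOTH predictor below accumulates, per output pixel,
+//   weights_y[y] * top[x] + (256 - weights_y[y]) * bottom_left +
+//   weights_x[x] * left[y] + (256 - weights_x[x]) * top_right
+// and narrows with a rounding right shift of SMOOTH_WEIGHT_LOG2_SCALE + 1,
+// i.e. a weighted average in which the four weights sum to 2 * 256.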
+static INLINE void highbd_smooth_4xh_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *const top_row,
+ const uint16_t *const left_column,
+ const int height) {
+ const uint16_t top_right = top_row[3];
+ const uint16_t bottom_left = left_column[height - 1];
+ const uint16_t *const weights_y = smooth_weights_u16 + height - 4;
+
+ const uint16x4_t top_v = vld1_u16(top_row);
+ const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+ const uint16x4_t weights_x_v = vld1_u16(smooth_weights_u16);
+ const uint16x4_t scaled_weights_x = negate_s8(weights_x_v);
+ const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
+
+ for (int y = 0; y < height; ++y) {
+ // Each variable in the running summation is named for the last item to be
+ // accumulated.
+ const uint32x4_t weighted_top =
+ vmlal_n_u16(weighted_tr, top_v, weights_y[y]);
+ const uint32x4_t weighted_left =
+ vmlal_n_u16(weighted_top, weights_x_v, left_column[y]);
+ const uint32x4_t weighted_bl =
+ vmlal_n_u16(weighted_left, bottom_left_v, 256 - weights_y[y]);
+
+ const uint16x4_t pred =
+ vrshrn_n_u32(weighted_bl, SMOOTH_WEIGHT_LOG2_SCALE + 1);
+ vst1_u16(dst, pred);
+ dst += stride;
+ }
+}
+
+// Common code between 8xH and [16|32|64]xH.
+static INLINE void highbd_calculate_pred8(
+ uint16_t *dst, const uint32x4_t weighted_corners_low,
+ const uint32x4_t weighted_corners_high, const uint16x4x2_t top_vals,
+ const uint16x4x2_t weights_x, const uint16_t left_y,
+ const uint16_t weight_y) {
+ // Each variable in the running summation is named for the last item to be
+ // accumulated.
+ const uint32x4_t weighted_top_low =
+ vmlal_n_u16(weighted_corners_low, top_vals.val[0], weight_y);
+ const uint32x4_t weighted_edges_low =
+ vmlal_n_u16(weighted_top_low, weights_x.val[0], left_y);
+
+ const uint16x4_t pred_low =
+ vrshrn_n_u32(weighted_edges_low, SMOOTH_WEIGHT_LOG2_SCALE + 1);
+ vst1_u16(dst, pred_low);
+
+ const uint32x4_t weighted_top_high =
+ vmlal_n_u16(weighted_corners_high, top_vals.val[1], weight_y);
+ const uint32x4_t weighted_edges_high =
+ vmlal_n_u16(weighted_top_high, weights_x.val[1], left_y);
+
+ const uint16x4_t pred_high =
+ vrshrn_n_u32(weighted_edges_high, SMOOTH_WEIGHT_LOG2_SCALE + 1);
+ vst1_u16(dst + 4, pred_high);
+}
+
+static void highbd_smooth_8xh_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *const top_row,
+ const uint16_t *const left_column,
+ const int height) {
+ const uint16_t top_right = top_row[7];
+ const uint16_t bottom_left = left_column[height - 1];
+ const uint16_t *const weights_y = smooth_weights_u16 + height - 4;
+
+ const uint16x4x2_t top_vals = { { vld1_u16(top_row),
+ vld1_u16(top_row + 4) } };
+ const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+ const uint16x4x2_t weights_x = { { vld1_u16(smooth_weights_u16 + 4),
+ vld1_u16(smooth_weights_u16 + 8) } };
+ const uint32x4_t weighted_tr_low =
+ vmull_n_u16(negate_s8(weights_x.val[0]), top_right);
+ const uint32x4_t weighted_tr_high =
+ vmull_n_u16(negate_s8(weights_x.val[1]), top_right);
+
+ for (int y = 0; y < height; ++y) {
+ const uint32x4_t weighted_bl =
+ vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
+ const uint32x4_t weighted_corners_low =
+ vaddq_u32(weighted_bl, weighted_tr_low);
+ const uint32x4_t weighted_corners_high =
+ vaddq_u32(weighted_bl, weighted_tr_high);
+ highbd_calculate_pred8(dst, weighted_corners_low, weighted_corners_high,
+ top_vals, weights_x, left_column[y], weights_y[y]);
+ dst += stride;
+ }
+}
+
+#define HIGHBD_SMOOTH_NXM(W, H) \
+ void aom_highbd_smooth_predictor_##W##x##H##_neon( \
+ uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)bd; \
+ highbd_smooth_##W##xh_neon(dst, y_stride, above, left, H); \
+ }
+
+HIGHBD_SMOOTH_NXM(4, 4)
+HIGHBD_SMOOTH_NXM(4, 8)
+HIGHBD_SMOOTH_NXM(8, 4)
+HIGHBD_SMOOTH_NXM(8, 8)
+HIGHBD_SMOOTH_NXM(4, 16)
+HIGHBD_SMOOTH_NXM(8, 16)
+HIGHBD_SMOOTH_NXM(8, 32)
+
+#undef HIGHBD_SMOOTH_NXM
+
+// For width 16 and above.
+#define HIGHBD_SMOOTH_PREDICTOR(W) \
+ static void highbd_smooth_##W##xh_neon( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row, \
+ const uint16_t *const left_column, const int height) { \
+ const uint16_t top_right = top_row[(W)-1]; \
+ const uint16_t bottom_left = left_column[height - 1]; \
+ const uint16_t *const weights_y = smooth_weights_u16 + height - 4; \
+ \
+ /* Precompute weighted values that don't vary with |y|. */ \
+ uint32x4_t weighted_tr_low[(W) >> 3]; \
+ uint32x4_t weighted_tr_high[(W) >> 3]; \
+ for (int i = 0; i < (W) >> 3; ++i) { \
+ const int x = i << 3; \
+ const uint16x4_t weights_x_low = \
+ vld1_u16(smooth_weights_u16 + (W)-4 + x); \
+ weighted_tr_low[i] = vmull_n_u16(negate_s8(weights_x_low), top_right); \
+ const uint16x4_t weights_x_high = \
+ vld1_u16(smooth_weights_u16 + (W) + x); \
+ weighted_tr_high[i] = vmull_n_u16(negate_s8(weights_x_high), top_right); \
+ } \
+ \
+ const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); \
+ for (int y = 0; y < height; ++y) { \
+ const uint32x4_t weighted_bl = \
+ vmull_n_u16(bottom_left_v, 256 - weights_y[y]); \
+ uint16_t *dst_x = dst; \
+ for (int i = 0; i < (W) >> 3; ++i) { \
+ const int x = i << 3; \
+ const uint16x4x2_t top_vals = { { vld1_u16(top_row + x), \
+ vld1_u16(top_row + x + 4) } }; \
+ const uint32x4_t weighted_corners_low = \
+ vaddq_u32(weighted_bl, weighted_tr_low[i]); \
+ const uint32x4_t weighted_corners_high = \
+ vaddq_u32(weighted_bl, weighted_tr_high[i]); \
+ /* Accumulate weighted edge values and store. */ \
+ const uint16x4x2_t weights_x = { \
+ { vld1_u16(smooth_weights_u16 + (W)-4 + x), \
+ vld1_u16(smooth_weights_u16 + (W) + x) } \
+ }; \
+ highbd_calculate_pred8(dst_x, weighted_corners_low, \
+ weighted_corners_high, top_vals, weights_x, \
+ left_column[y], weights_y[y]); \
+ dst_x += 8; \
+ } \
+ dst += stride; \
+ } \
+ }
+
+HIGHBD_SMOOTH_PREDICTOR(16)
+HIGHBD_SMOOTH_PREDICTOR(32)
+HIGHBD_SMOOTH_PREDICTOR(64)
+
+#undef HIGHBD_SMOOTH_PREDICTOR
+
+#define HIGHBD_SMOOTH_NXM_WIDE(W, H) \
+ void aom_highbd_smooth_predictor_##W##x##H##_neon( \
+ uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)bd; \
+ highbd_smooth_##W##xh_neon(dst, y_stride, above, left, H); \
+ }
+
+HIGHBD_SMOOTH_NXM_WIDE(16, 4)
+HIGHBD_SMOOTH_NXM_WIDE(16, 8)
+HIGHBD_SMOOTH_NXM_WIDE(16, 16)
+HIGHBD_SMOOTH_NXM_WIDE(16, 32)
+HIGHBD_SMOOTH_NXM_WIDE(16, 64)
+HIGHBD_SMOOTH_NXM_WIDE(32, 8)
+HIGHBD_SMOOTH_NXM_WIDE(32, 16)
+HIGHBD_SMOOTH_NXM_WIDE(32, 32)
+HIGHBD_SMOOTH_NXM_WIDE(32, 64)
+HIGHBD_SMOOTH_NXM_WIDE(64, 16)
+HIGHBD_SMOOTH_NXM_WIDE(64, 32)
+HIGHBD_SMOOTH_NXM_WIDE(64, 64)
+
+#undef HIGHBD_SMOOTH_NXM_WIDE
+
+static void highbd_smooth_v_4xh_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *const top_row,
+ const uint16_t *const left_column,
+ const int height) {
+ const uint16_t bottom_left = left_column[height - 1];
+ const uint16_t *const weights_y = smooth_weights_u16 + height - 4;
+
+ const uint16x4_t top_v = vld1_u16(top_row);
+ const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+
+ for (int y = 0; y < height; ++y) {
+ const uint32x4_t weighted_bl =
+ vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
+ const uint32x4_t weighted_top =
+ vmlal_n_u16(weighted_bl, top_v, weights_y[y]);
+ vst1_u16(dst, vrshrn_n_u32(weighted_top, SMOOTH_WEIGHT_LOG2_SCALE));
+
+ dst += stride;
+ }
+}
+
+static void highbd_smooth_v_8xh_neon(uint16_t *dst, const ptrdiff_t stride,
+ const uint16_t *const top_row,
+ const uint16_t *const left_column,
+ const int height) {
+ const uint16_t bottom_left = left_column[height - 1];
+ const uint16_t *const weights_y = smooth_weights_u16 + height - 4;
+
+ const uint16x4_t top_low = vld1_u16(top_row);
+ const uint16x4_t top_high = vld1_u16(top_row + 4);
+ const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
+
+ for (int y = 0; y < height; ++y) {
+ const uint32x4_t weighted_bl =
+ vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
+
+ const uint32x4_t weighted_top_low =
+ vmlal_n_u16(weighted_bl, top_low, weights_y[y]);
+ vst1_u16(dst, vrshrn_n_u32(weighted_top_low, SMOOTH_WEIGHT_LOG2_SCALE));
+
+ const uint32x4_t weighted_top_high =
+ vmlal_n_u16(weighted_bl, top_high, weights_y[y]);
+ vst1_u16(dst + 4,
+ vrshrn_n_u32(weighted_top_high, SMOOTH_WEIGHT_LOG2_SCALE));
+ dst += stride;
+ }
+}
+
+#define HIGHBD_SMOOTH_V_NXM(W, H) \
+ void aom_highbd_smooth_v_predictor_##W##x##H##_neon( \
+ uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)bd; \
+ highbd_smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \
+ }
+
+HIGHBD_SMOOTH_V_NXM(4, 4)
+HIGHBD_SMOOTH_V_NXM(4, 8)
+HIGHBD_SMOOTH_V_NXM(4, 16)
+HIGHBD_SMOOTH_V_NXM(8, 4)
+HIGHBD_SMOOTH_V_NXM(8, 8)
+HIGHBD_SMOOTH_V_NXM(8, 16)
+HIGHBD_SMOOTH_V_NXM(8, 32)
+
+#undef HIGHBD_SMOOTH_V_NXM
+
+// For width 16 and above.
+#define HIGHBD_SMOOTH_V_PREDICTOR(W) \
+ static void highbd_smooth_v_##W##xh_neon( \
+ uint16_t *dst, const ptrdiff_t stride, const uint16_t *const top_row, \
+ const uint16_t *const left_column, const int height) { \
+ const uint16_t bottom_left = left_column[height - 1]; \
+ const uint16_t *const weights_y = smooth_weights_u16 + height - 4; \
+ \
+ uint16x4x2_t top_vals[(W) >> 3]; \
+ for (int i = 0; i < (W) >> 3; ++i) { \
+ const int x = i << 3; \
+ top_vals[i].val[0] = vld1_u16(top_row + x); \
+ top_vals[i].val[1] = vld1_u16(top_row + x + 4); \
+ } \
+ \
+ const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left); \
+ for (int y = 0; y < height; ++y) { \
+ const uint32x4_t weighted_bl = \
+ vmull_n_u16(bottom_left_v, 256 - weights_y[y]); \
+ \
+ uint16_t *dst_x = dst; \
+ for (int i = 0; i < (W) >> 3; ++i) { \
+ const uint32x4_t weighted_top_low = \
+ vmlal_n_u16(weighted_bl, top_vals[i].val[0], weights_y[y]); \
+ vst1_u16(dst_x, \
+ vrshrn_n_u32(weighted_top_low, SMOOTH_WEIGHT_LOG2_SCALE)); \
+ \
+ const uint32x4_t weighted_top_high = \
+ vmlal_n_u16(weighted_bl, top_vals[i].val[1], weights_y[y]); \
+ vst1_u16(dst_x + 4, \
+ vrshrn_n_u32(weighted_top_high, SMOOTH_WEIGHT_LOG2_SCALE)); \
+ dst_x += 8; \
+ } \
+ dst += stride; \
+ } \
+ }
+
+HIGHBD_SMOOTH_V_PREDICTOR(16)
+HIGHBD_SMOOTH_V_PREDICTOR(32)
+HIGHBD_SMOOTH_V_PREDICTOR(64)
+
+#undef HIGHBD_SMOOTH_V_PREDICTOR
+
+#define HIGHBD_SMOOTH_V_NXM_WIDE(W, H) \
+ void aom_highbd_smooth_v_predictor_##W##x##H##_neon( \
+ uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)bd; \
+ highbd_smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \
+ }
+
+HIGHBD_SMOOTH_V_NXM_WIDE(16, 4)
+HIGHBD_SMOOTH_V_NXM_WIDE(16, 8)
+HIGHBD_SMOOTH_V_NXM_WIDE(16, 16)
+HIGHBD_SMOOTH_V_NXM_WIDE(16, 32)
+HIGHBD_SMOOTH_V_NXM_WIDE(16, 64)
+HIGHBD_SMOOTH_V_NXM_WIDE(32, 8)
+HIGHBD_SMOOTH_V_NXM_WIDE(32, 16)
+HIGHBD_SMOOTH_V_NXM_WIDE(32, 32)
+HIGHBD_SMOOTH_V_NXM_WIDE(32, 64)
+HIGHBD_SMOOTH_V_NXM_WIDE(64, 16)
+HIGHBD_SMOOTH_V_NXM_WIDE(64, 32)
+HIGHBD_SMOOTH_V_NXM_WIDE(64, 64)
+
+#undef HIGHBD_SMOOTH_V_NXM_WIDE
+
+static INLINE void highbd_smooth_h_4xh_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *const top_row,
+ const uint16_t *const left_column,
+ const int height) {
+ const uint16_t top_right = top_row[3];
+
+ const uint16x4_t weights_x = vld1_u16(smooth_weights_u16);
+ const uint16x4_t scaled_weights_x = negate_s8(weights_x);
+
+ const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
+ for (int y = 0; y < height; ++y) {
+ const uint32x4_t weighted_left =
+ vmlal_n_u16(weighted_tr, weights_x, left_column[y]);
+ vst1_u16(dst, vrshrn_n_u32(weighted_left, SMOOTH_WEIGHT_LOG2_SCALE));
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_smooth_h_8xh_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *const top_row,
+ const uint16_t *const left_column,
+ const int height) {
+ const uint16_t top_right = top_row[7];
+
+ const uint16x4x2_t weights_x = { { vld1_u16(smooth_weights_u16 + 4),
+ vld1_u16(smooth_weights_u16 + 8) } };
+
+ const uint32x4_t weighted_tr_low =
+ vmull_n_u16(negate_s8(weights_x.val[0]), top_right);
+ const uint32x4_t weighted_tr_high =
+ vmull_n_u16(negate_s8(weights_x.val[1]), top_right);
+
+ for (int y = 0; y < height; ++y) {
+ const uint16_t left_y = left_column[y];
+ const uint32x4_t weighted_left_low =
+ vmlal_n_u16(weighted_tr_low, weights_x.val[0], left_y);
+ vst1_u16(dst, vrshrn_n_u32(weighted_left_low, SMOOTH_WEIGHT_LOG2_SCALE));
+
+ const uint32x4_t weighted_left_high =
+ vmlal_n_u16(weighted_tr_high, weights_x.val[1], left_y);
+ vst1_u16(dst + 4,
+ vrshrn_n_u32(weighted_left_high, SMOOTH_WEIGHT_LOG2_SCALE));
+ dst += stride;
+ }
+}
+
+#define HIGHBD_SMOOTH_H_NXM(W, H) \
+ void aom_highbd_smooth_h_predictor_##W##x##H##_neon( \
+ uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)bd; \
+ highbd_smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \
+ }
+
+HIGHBD_SMOOTH_H_NXM(4, 4)
+HIGHBD_SMOOTH_H_NXM(4, 8)
+HIGHBD_SMOOTH_H_NXM(4, 16)
+HIGHBD_SMOOTH_H_NXM(8, 4)
+HIGHBD_SMOOTH_H_NXM(8, 8)
+HIGHBD_SMOOTH_H_NXM(8, 16)
+HIGHBD_SMOOTH_H_NXM(8, 32)
+
+#undef HIGHBD_SMOOTH_H_NXM
+
+// For width 16 and above.
+#define HIGHBD_SMOOTH_H_PREDICTOR(W) \
+  static void highbd_smooth_h_##W##xh_neon(                                \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row, \
+ const uint16_t *const left_column, const int height) { \
+ const uint16_t top_right = top_row[(W)-1]; \
+ \
+ uint16x4_t weights_x_low[(W) >> 3]; \
+ uint16x4_t weights_x_high[(W) >> 3]; \
+ uint32x4_t weighted_tr_low[(W) >> 3]; \
+ uint32x4_t weighted_tr_high[(W) >> 3]; \
+ for (int i = 0; i < (W) >> 3; ++i) { \
+ const int x = i << 3; \
+ weights_x_low[i] = vld1_u16(smooth_weights_u16 + (W)-4 + x); \
+ weighted_tr_low[i] = \
+ vmull_n_u16(negate_s8(weights_x_low[i]), top_right); \
+ weights_x_high[i] = vld1_u16(smooth_weights_u16 + (W) + x); \
+ weighted_tr_high[i] = \
+ vmull_n_u16(negate_s8(weights_x_high[i]), top_right); \
+ } \
+ \
+ for (int y = 0; y < height; ++y) { \
+ uint16_t *dst_x = dst; \
+ const uint16_t left_y = left_column[y]; \
+ for (int i = 0; i < (W) >> 3; ++i) { \
+ const uint32x4_t weighted_left_low = \
+ vmlal_n_u16(weighted_tr_low[i], weights_x_low[i], left_y); \
+ vst1_u16(dst_x, \
+ vrshrn_n_u32(weighted_left_low, SMOOTH_WEIGHT_LOG2_SCALE)); \
+ \
+ const uint32x4_t weighted_left_high = \
+ vmlal_n_u16(weighted_tr_high[i], weights_x_high[i], left_y); \
+ vst1_u16(dst_x + 4, \
+ vrshrn_n_u32(weighted_left_high, SMOOTH_WEIGHT_LOG2_SCALE)); \
+ dst_x += 8; \
+ } \
+ dst += stride; \
+ } \
+ }
+
+HIGHBD_SMOOTH_H_PREDICTOR(16)
+HIGHBD_SMOOTH_H_PREDICTOR(32)
+HIGHBD_SMOOTH_H_PREDICTOR(64)
+
+#undef HIGHBD_SMOOTH_H_PREDICTOR
+
+#define HIGHBD_SMOOTH_H_NXM_WIDE(W, H) \
+ void aom_highbd_smooth_h_predictor_##W##x##H##_neon( \
+ uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ (void)bd; \
+ highbd_smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \
+ }
+
+HIGHBD_SMOOTH_H_NXM_WIDE(16, 4)
+HIGHBD_SMOOTH_H_NXM_WIDE(16, 8)
+HIGHBD_SMOOTH_H_NXM_WIDE(16, 16)
+HIGHBD_SMOOTH_H_NXM_WIDE(16, 32)
+HIGHBD_SMOOTH_H_NXM_WIDE(16, 64)
+HIGHBD_SMOOTH_H_NXM_WIDE(32, 8)
+HIGHBD_SMOOTH_H_NXM_WIDE(32, 16)
+HIGHBD_SMOOTH_H_NXM_WIDE(32, 32)
+HIGHBD_SMOOTH_H_NXM_WIDE(32, 64)
+HIGHBD_SMOOTH_H_NXM_WIDE(64, 16)
+HIGHBD_SMOOTH_H_NXM_WIDE(64, 32)
+HIGHBD_SMOOTH_H_NXM_WIDE(64, 64)
+
+#undef HIGHBD_SMOOTH_H_NXM_WIDE
+
+// -----------------------------------------------------------------------------
+// Z1
+
+static const int16_t iota1_s16[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8 };
+static const int16_t iota2_s16[] = { 0, 2, 4, 6, 8, 10, 12, 14 };
+
+static AOM_FORCE_INLINE uint16x4_t highbd_dr_z1_apply_shift_x4(uint16x4_t a0,
+ uint16x4_t a1,
+ int shift) {
+  // The C implementation of the z1 predictor uses (32 - shift) and a rounding
+  // right shift by 5. Here both the weight and the shift are doubled so that
+  // callers can pass `shift` without first halving it.
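+  // With t = shift >> 1 (the value the C code uses) the result is bit-exact
+  // for even `shift`:
+  //   (a0 * (64 - shift) + a1 * shift + 32) >> 6
+  //     == (a0 * (32 - t) + a1 * t + 16) >> 5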
+ uint32x4_t res = vmull_n_u16(a1, shift);
+ res = vmlal_n_u16(res, a0, 64 - shift);
+ return vrshrn_n_u32(res, 6);
+}
+
+static AOM_FORCE_INLINE uint16x8_t highbd_dr_z1_apply_shift_x8(uint16x8_t a0,
+ uint16x8_t a1,
+ int shift) {
+ return vcombine_u16(
+ highbd_dr_z1_apply_shift_x4(vget_low_u16(a0), vget_low_u16(a1), shift),
+ highbd_dr_z1_apply_shift_x4(vget_high_u16(a0), vget_high_u16(a1), shift));
+}
+
+static void highbd_dr_prediction_z1_upsample0_neon(uint16_t *dst,
+ ptrdiff_t stride, int bw,
+ int bh,
+ const uint16_t *above,
+ int dx) {
+ assert(bw % 4 == 0);
+ assert(bh % 4 == 0);
+ assert(dx > 0);
+
+ const int max_base_x = (bw + bh) - 1;
+ const int above_max = above[max_base_x];
+
+ const int16x8_t iota1x8 = vld1q_s16(iota1_s16);
+ const int16x4_t iota1x4 = vget_low_s16(iota1x8);
+
+ int x = dx;
+ int r = 0;
+ do {
+ const int base = x >> 6;
+ if (base >= max_base_x) {
+ for (int i = r; i < bh; ++i) {
+ aom_memset16(dst, above_max, bw);
+ dst += stride;
+ }
+ return;
+ }
+
+    // The C implementation of the z1 predictor when not upsampling uses:
+    //   ((x & 0x3f) >> 1)
+    // That right shift is not needed here because the apply_shift helpers
+    // narrow by 6 rather than 5; masking with 0x3e simply drops the bit the
+    // reference would have shifted out.
+ const int shift = x & 0x3e;
+
+ if (bw == 4) {
+ const uint16x4_t a0 = vld1_u16(&above[base]);
+ const uint16x4_t a1 = vld1_u16(&above[base + 1]);
+ const uint16x4_t val = highbd_dr_z1_apply_shift_x4(a0, a1, shift);
+ const uint16x4_t cmp = vcgt_s16(vdup_n_s16(max_base_x - base), iota1x4);
+ const uint16x4_t res = vbsl_u16(cmp, val, vdup_n_u16(above_max));
+ vst1_u16(dst, res);
+ } else {
+ int c = 0;
+ do {
+ const uint16x8_t a0 = vld1q_u16(&above[base + c]);
+ const uint16x8_t a1 = vld1q_u16(&above[base + c + 1]);
+ const uint16x8_t val = highbd_dr_z1_apply_shift_x8(a0, a1, shift);
+ const uint16x8_t cmp =
+ vcgtq_s16(vdupq_n_s16(max_base_x - base - c), iota1x8);
+ const uint16x8_t res = vbslq_u16(cmp, val, vdupq_n_u16(above_max));
+ vst1q_u16(dst + c, res);
+ c += 8;
+ } while (c < bw);
+ }
+
+ dst += stride;
+ x += dx;
+ } while (++r < bh);
+}
+
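+// With upsampling, consecutive output columns are two samples apart in
+// `above`, so vld2 de-interleaves the even/odd samples straight into the
+// a0/a1 interpolation operands.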
+static void highbd_dr_prediction_z1_upsample1_neon(uint16_t *dst,
+ ptrdiff_t stride, int bw,
+ int bh,
+ const uint16_t *above,
+ int dx) {
+ assert(bw % 4 == 0);
+ assert(bh % 4 == 0);
+ assert(dx > 0);
+
+ const int max_base_x = ((bw + bh) - 1) << 1;
+ const int above_max = above[max_base_x];
+
+ const int16x8_t iota2x8 = vld1q_s16(iota2_s16);
+ const int16x4_t iota2x4 = vget_low_s16(iota2x8);
+
+ int x = dx;
+ int r = 0;
+ do {
+ const int base = x >> 5;
+ if (base >= max_base_x) {
+ for (int i = r; i < bh; ++i) {
+ aom_memset16(dst, above_max, bw);
+ dst += stride;
+ }
+ return;
+ }
+
+    // The C implementation of the z1 predictor when upsampling uses:
+    //   (((x << 1) & 0x3f) >> 1)
+    // That right shift is not needed here because the apply_shift helpers
+    // narrow by 6 rather than 5; masking with 0x3e simply drops the bit the
+    // reference would have shifted out.
+ const int shift = (x << 1) & 0x3e;
+
+ if (bw == 4) {
+ const uint16x4x2_t a01 = vld2_u16(&above[base]);
+ const uint16x4_t val =
+ highbd_dr_z1_apply_shift_x4(a01.val[0], a01.val[1], shift);
+ const uint16x4_t cmp = vcgt_s16(vdup_n_s16(max_base_x - base), iota2x4);
+ const uint16x4_t res = vbsl_u16(cmp, val, vdup_n_u16(above_max));
+ vst1_u16(dst, res);
+ } else {
+ int c = 0;
+ do {
+ const uint16x8x2_t a01 = vld2q_u16(&above[base + 2 * c]);
+ const uint16x8_t val =
+ highbd_dr_z1_apply_shift_x8(a01.val[0], a01.val[1], shift);
+ const uint16x8_t cmp =
+ vcgtq_s16(vdupq_n_s16(max_base_x - base - 2 * c), iota2x8);
+ const uint16x8_t res = vbslq_u16(cmp, val, vdupq_n_u16(above_max));
+ vst1q_u16(dst + c, res);
+ c += 8;
+ } while (c < bw);
+ }
+
+ dst += stride;
+ x += dx;
+ } while (++r < bh);
+}
+
+// Directional prediction, zone 1: 0 < angle < 90
+void av1_highbd_dr_prediction_z1_neon(uint16_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint16_t *above,
+ const uint16_t *left, int upsample_above,
+ int dx, int dy, int bd) {
+ (void)left;
+ (void)dy;
+ (void)bd;
+ assert(dy == 1);
+
+ if (upsample_above) {
+ highbd_dr_prediction_z1_upsample1_neon(dst, stride, bw, bh, above, dx);
+ } else {
+ highbd_dr_prediction_z1_upsample0_neon(dst, stride, bw, bh, above, dx);
+ }
+}
+
+// -----------------------------------------------------------------------------
+// Z2
+
+#if AOM_ARCH_AARCH64
+// Incrementally shift more elements from `above` into the result, merging with
+// existing `left` elements.
+// X0, X1, X2, X3
+// Y0, X0, X1, X2
+// Y0, Y1, X0, X1
+// Y0, Y1, Y2, X0
+// Y0, Y1, Y2, Y3
+// clang-format off
+static const uint8_t z2_merge_shuffles_u16x4[5][8] = {
+ { 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 0, 1, 8, 9, 10, 11, 12, 13 },
+ { 0, 1, 2, 3, 8, 9, 10, 11 },
+ { 0, 1, 2, 3, 4, 5, 8, 9 },
+ { 0, 1, 2, 3, 4, 5, 6, 7 },
+};
+// clang-format on
+
+// Incrementally shift more elements from `above` into the result, merging with
+// existing `left` elements.
+// X0, X1, X2, X3, X4, X5, X6, X7
+// Y0, X0, X1, X2, X3, X4, X5, X6
+// Y0, Y1, X0, X1, X2, X3, X4, X5
+// Y0, Y1, Y2, X0, X1, X2, X3, X4
+// Y0, Y1, Y2, Y3, X0, X1, X2, X3
+// Y0, Y1, Y2, Y3, Y4, X0, X1, X2
+// Y0, Y1, Y2, Y3, Y4, Y5, X0, X1
+// Y0, Y1, Y2, Y3, Y4, Y5, Y6, X0
+// Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7
+// clang-format off
+static const uint8_t z2_merge_shuffles_u16x8[9][16] = {
+ { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 },
+ { 0, 1, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 },
+ { 0, 1, 2, 3, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 },
+ { 0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 17, 18, 19, 20, 21 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17 },
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+};
+// clang-format on
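+// Row k of each table above is a tbl byte permutation that keeps the first k
+// 16-bit lanes from the first table operand (the vector computed from `left`)
+// and fills the remaining lanes from the second operand (the vector computed
+// from `above`), whose byte indices start at the table-vector size.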
+
+// clang-format off
+static const uint16_t z2_y_iter_masks_u16x4[5][4] = {
+ { 0U, 0U, 0U, 0U },
+ { 0xffffU, 0U, 0U, 0U },
+ { 0xffffU, 0xffffU, 0U, 0U },
+ { 0xffffU, 0xffffU, 0xffffU, 0U },
+ { 0xffffU, 0xffffU, 0xffffU, 0xffffU },
+};
+// clang-format on
+
+// clang-format off
+static const uint16_t z2_y_iter_masks_u16x8[9][8] = {
+ { 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U },
+ { 0xffffU, 0U, 0U, 0U, 0U, 0U, 0U, 0U },
+ { 0xffffU, 0xffffU, 0U, 0U, 0U, 0U, 0U, 0U },
+ { 0xffffU, 0xffffU, 0xffffU, 0U, 0U, 0U, 0U, 0U },
+ { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U, 0U, 0U, 0U },
+ { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U, 0U, 0U },
+ { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U, 0U },
+ { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U },
+ { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU },
+};
+// clang-format on
+
+static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_tbl_left_x4_from_x8(
+ const uint16x8_t left_data, const int16x4_t indices, int base, int n) {
+ // Need to adjust indices to operate on 0-based indices rather than
+ // `base`-based indices and then adjust from uint16x4 indices to uint8x8
+ // indices so we can use a tbl instruction (which only operates on bytes).
+ uint8x8_t left_indices =
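+  // The three steps below turn each 16-bit index i into the byte pair
+  // { 2*i, 2*i + 1 }: vtrn1 duplicates the low byte of every lane, the add
+  // doubles it, and adding 0x0100 per lane sets the odd byte to 2*i + 1.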
+ vreinterpret_u8_s16(vsub_s16(indices, vdup_n_s16(base)));
+ left_indices = vtrn1_u8(left_indices, left_indices);
+ left_indices = vadd_u8(left_indices, left_indices);
+ left_indices = vadd_u8(left_indices, vreinterpret_u8_u16(vdup_n_u16(0x0100)));
+ const uint16x4_t ret = vreinterpret_u16_u8(
+ vqtbl1_u8(vreinterpretq_u8_u16(left_data), left_indices));
+ return vand_u16(ret, vld1_u16(z2_y_iter_masks_u16x4[n]));
+}
+
+static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_tbl_left_x4_from_x16(
+ const uint16x8x2_t left_data, const int16x4_t indices, int base, int n) {
+ // Need to adjust indices to operate on 0-based indices rather than
+ // `base`-based indices and then adjust from uint16x4 indices to uint8x8
+ // indices so we can use a tbl instruction (which only operates on bytes).
+ uint8x8_t left_indices =
+ vreinterpret_u8_s16(vsub_s16(indices, vdup_n_s16(base)));
+ left_indices = vtrn1_u8(left_indices, left_indices);
+ left_indices = vadd_u8(left_indices, left_indices);
+ left_indices = vadd_u8(left_indices, vreinterpret_u8_u16(vdup_n_u16(0x0100)));
+ uint8x16x2_t data_u8 = { { vreinterpretq_u8_u16(left_data.val[0]),
+ vreinterpretq_u8_u16(left_data.val[1]) } };
+ const uint16x4_t ret = vreinterpret_u16_u8(vqtbl2_u8(data_u8, left_indices));
+ return vand_u16(ret, vld1_u16(z2_y_iter_masks_u16x4[n]));
+}
+
+static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_tbl_left_x8_from_x8(
+ const uint16x8_t left_data, const int16x8_t indices, int base, int n) {
+ // Need to adjust indices to operate on 0-based indices rather than
+  // `base`-based indices and then adjust from uint16x8 indices to uint8x16
+ // indices so we can use a tbl instruction (which only operates on bytes).
+ uint8x16_t left_indices =
+ vreinterpretq_u8_s16(vsubq_s16(indices, vdupq_n_s16(base)));
+ left_indices = vtrn1q_u8(left_indices, left_indices);
+ left_indices = vaddq_u8(left_indices, left_indices);
+ left_indices =
+ vaddq_u8(left_indices, vreinterpretq_u8_u16(vdupq_n_u16(0x0100)));
+ const uint16x8_t ret = vreinterpretq_u16_u8(
+ vqtbl1q_u8(vreinterpretq_u8_u16(left_data), left_indices));
+ return vandq_u16(ret, vld1q_u16(z2_y_iter_masks_u16x8[n]));
+}
+
+static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_tbl_left_x8_from_x16(
+ const uint16x8x2_t left_data, const int16x8_t indices, int base, int n) {
+ // Need to adjust indices to operate on 0-based indices rather than
+  // `base`-based indices and then adjust from uint16x8 indices to uint8x16
+ // indices so we can use a tbl instruction (which only operates on bytes).
+ uint8x16_t left_indices =
+ vreinterpretq_u8_s16(vsubq_s16(indices, vdupq_n_s16(base)));
+ left_indices = vtrn1q_u8(left_indices, left_indices);
+ left_indices = vaddq_u8(left_indices, left_indices);
+ left_indices =
+ vaddq_u8(left_indices, vreinterpretq_u8_u16(vdupq_n_u16(0x0100)));
+ uint8x16x2_t data_u8 = { { vreinterpretq_u8_u16(left_data.val[0]),
+ vreinterpretq_u8_u16(left_data.val[1]) } };
+ const uint16x8_t ret =
+ vreinterpretq_u16_u8(vqtbl2q_u8(data_u8, left_indices));
+ return vandq_u16(ret, vld1q_u16(z2_y_iter_masks_u16x8[n]));
+}
+#endif // AOM_ARCH_AARCH64
+
+static AOM_FORCE_INLINE uint16x4x2_t highbd_dr_prediction_z2_gather_left_x4(
+ const uint16_t *left, const int16x4_t indices, int n) {
+ assert(n > 0);
+ assert(n <= 4);
+ // Load two elements at a time and then uzp them into separate vectors, to
+ // reduce the number of memory accesses.
+ uint32x2_t ret0_u32 = vdup_n_u32(0);
+ uint32x2_t ret1_u32 = vdup_n_u32(0);
+
+  // Use a single vget_lane_u64 to minimize vector-to-general-purpose-register
+  // transfers, then extract each 16-bit index with a shift and mask.
+ const uint64_t indices0123 = vget_lane_u64(vreinterpret_u64_s16(indices), 0);
+ const int idx0 = (int16_t)((indices0123 >> 0) & 0xffffU);
+ const int idx1 = (int16_t)((indices0123 >> 16) & 0xffffU);
+ const int idx2 = (int16_t)((indices0123 >> 32) & 0xffffU);
+ const int idx3 = (int16_t)((indices0123 >> 48) & 0xffffU);
+
+ // At time of writing both Clang and GCC produced better code with these
+ // nested if-statements compared to a switch statement with fallthrough.
+ ret0_u32 = vld1_lane_u32((const uint32_t *)(left + idx0), ret0_u32, 0);
+ if (n > 1) {
+ ret0_u32 = vld1_lane_u32((const uint32_t *)(left + idx1), ret0_u32, 1);
+ if (n > 2) {
+ ret1_u32 = vld1_lane_u32((const uint32_t *)(left + idx2), ret1_u32, 0);
+ if (n > 3) {
+ ret1_u32 = vld1_lane_u32((const uint32_t *)(left + idx3), ret1_u32, 1);
+ }
+ }
+ }
+ return vuzp_u16(vreinterpret_u16_u32(ret0_u32),
+ vreinterpret_u16_u32(ret1_u32));
+}
+
+static AOM_FORCE_INLINE uint16x8x2_t highbd_dr_prediction_z2_gather_left_x8(
+ const uint16_t *left, const int16x8_t indices, int n) {
+ assert(n > 0);
+ assert(n <= 8);
+ // Load two elements at a time and then uzp them into separate vectors, to
+ // reduce the number of memory accesses.
+ uint32x4_t ret0_u32 = vdupq_n_u32(0);
+ uint32x4_t ret1_u32 = vdupq_n_u32(0);
+
+  // Use a pair of vget_lane_u64 to minimize vector-to-general-purpose-register
+  // transfers, then extract each 16-bit index with a shift and mask.
+ const uint64_t indices0123 =
+ vgetq_lane_u64(vreinterpretq_u64_s16(indices), 0);
+ const uint64_t indices4567 =
+ vgetq_lane_u64(vreinterpretq_u64_s16(indices), 1);
+ const int idx0 = (int16_t)((indices0123 >> 0) & 0xffffU);
+ const int idx1 = (int16_t)((indices0123 >> 16) & 0xffffU);
+ const int idx2 = (int16_t)((indices0123 >> 32) & 0xffffU);
+ const int idx3 = (int16_t)((indices0123 >> 48) & 0xffffU);
+ const int idx4 = (int16_t)((indices4567 >> 0) & 0xffffU);
+ const int idx5 = (int16_t)((indices4567 >> 16) & 0xffffU);
+ const int idx6 = (int16_t)((indices4567 >> 32) & 0xffffU);
+ const int idx7 = (int16_t)((indices4567 >> 48) & 0xffffU);
+
+ // At time of writing both Clang and GCC produced better code with these
+ // nested if-statements compared to a switch statement with fallthrough.
+ ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx0), ret0_u32, 0);
+ if (n > 1) {
+ ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx1), ret0_u32, 1);
+ if (n > 2) {
+ ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx2), ret0_u32, 2);
+ if (n > 3) {
+ ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx3), ret0_u32, 3);
+ if (n > 4) {
+ ret1_u32 =
+ vld1q_lane_u32((const uint32_t *)(left + idx4), ret1_u32, 0);
+ if (n > 5) {
+ ret1_u32 =
+ vld1q_lane_u32((const uint32_t *)(left + idx5), ret1_u32, 1);
+ if (n > 6) {
+ ret1_u32 =
+ vld1q_lane_u32((const uint32_t *)(left + idx6), ret1_u32, 2);
+ if (n > 7) {
+ ret1_u32 = vld1q_lane_u32((const uint32_t *)(left + idx7),
+ ret1_u32, 3);
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ return vuzpq_u16(vreinterpretq_u16_u32(ret0_u32),
+ vreinterpretq_u16_u32(ret1_u32));
+}
+
+static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_merge_x4(
+ uint16x4_t out_x, uint16x4_t out_y, int base_shift) {
+ assert(base_shift >= 0);
+ assert(base_shift <= 4);
+ // On AArch64 we can permute the data from the `above` and `left` vectors
+ // into a single vector in a single load (of the permute vector) + tbl.
+#if AOM_ARCH_AARCH64
+ const uint8x8x2_t out_yx = { { vreinterpret_u8_u16(out_y),
+ vreinterpret_u8_u16(out_x) } };
+ return vreinterpret_u16_u8(
+ vtbl2_u8(out_yx, vld1_u8(z2_merge_shuffles_u16x4[base_shift])));
+#else
+ uint16x4_t out = out_y;
+ for (int c2 = base_shift, x_idx = 0; c2 < 4; ++c2, ++x_idx) {
+ out[c2] = out_x[x_idx];
+ }
+ return out;
+#endif
+}
+
+static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_merge_x8(
+ uint16x8_t out_x, uint16x8_t out_y, int base_shift) {
+ assert(base_shift >= 0);
+ assert(base_shift <= 8);
+ // On AArch64 we can permute the data from the `above` and `left` vectors
+ // into a single vector in a single load (of the permute vector) + tbl.
+#if AOM_ARCH_AARCH64
+ const uint8x16x2_t out_yx = { { vreinterpretq_u8_u16(out_y),
+ vreinterpretq_u8_u16(out_x) } };
+ return vreinterpretq_u16_u8(
+ vqtbl2q_u8(out_yx, vld1q_u8(z2_merge_shuffles_u16x8[base_shift])));
+#else
+ uint16x8_t out = out_y;
+ for (int c2 = base_shift, x_idx = 0; c2 < 8; ++c2, ++x_idx) {
+ out[c2] = out_x[x_idx];
+ }
+ return out;
+#endif
+}
+
+static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_apply_shift_x4(
+ uint16x4_t a0, uint16x4_t a1, int16x4_t shift) {
+ uint32x4_t res = vmull_u16(a1, vreinterpret_u16_s16(shift));
+ res =
+ vmlal_u16(res, a0, vsub_u16(vdup_n_u16(32), vreinterpret_u16_s16(shift)));
+ return vrshrn_n_u32(res, 5);
+}
+
+static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_apply_shift_x8(
+ uint16x8_t a0, uint16x8_t a1, int16x8_t shift) {
+ return vcombine_u16(
+ highbd_dr_prediction_z2_apply_shift_x4(vget_low_u16(a0), vget_low_u16(a1),
+ vget_low_s16(shift)),
+ highbd_dr_prediction_z2_apply_shift_x4(
+ vget_high_u16(a0), vget_high_u16(a1), vget_high_s16(shift)));
+}
+
+static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_step_x4(
+ const uint16_t *above, const uint16x4_t above0, const uint16x4_t above1,
+ const uint16_t *left, int dx, int dy, int r, int c) {
+ const int16x4_t iota = vld1_s16(iota1_s16);
+
+ const int x0 = (c << 6) - (r + 1) * dx;
+ const int y0 = (r << 6) - (c + 1) * dy;
+
+ const int16x4_t x0123 = vadd_s16(vdup_n_s16(x0), vshl_n_s16(iota, 6));
+ const int16x4_t y0123 = vsub_s16(vdup_n_s16(y0), vmul_n_s16(iota, dy));
+ const int16x4_t shift_x0123 =
+ vshr_n_s16(vand_s16(x0123, vdup_n_s16(0x3F)), 1);
+ const int16x4_t shift_y0123 =
+ vshr_n_s16(vand_s16(y0123, vdup_n_s16(0x3F)), 1);
+ const int16x4_t base_y0123 = vshr_n_s16(y0123, 6);
+
+ const int base_shift = ((((r + 1) * dx) - 1) >> 6) - c;
+
+ // Based on the value of `base_shift` there are three possible cases to
+ // compute the result:
+ // 1) base_shift <= 0: We can load and operate entirely on data from the
+ // `above` input vector.
+ // 2) base_shift < vl: We can load from `above[-1]` and shift
+ // `vl - base_shift` elements across to the end of the
+ // vector, then compute the remainder from `left`.
+ // 3) base_shift >= vl: We can load and operate entirely on data from the
+ // `left` input vector.
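+  // In other words, `base_shift` (clamped to the vector length) is the number
+  // of lanes at the start of this vector that must be predicted from `left`;
+  // the remaining lanes are predicted from `above`.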
+
+ if (base_shift <= 0) {
+ const int base_x = x0 >> 6;
+ const uint16x4_t a0 = vld1_u16(above + base_x);
+ const uint16x4_t a1 = vld1_u16(above + base_x + 1);
+ return highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123);
+ } else if (base_shift < 4) {
+ const uint16x4x2_t l01 = highbd_dr_prediction_z2_gather_left_x4(
+ left + 1, base_y0123, base_shift);
+ const uint16x4_t out16_y = highbd_dr_prediction_z2_apply_shift_x4(
+ l01.val[0], l01.val[1], shift_y0123);
+
+    // No need to reload from `above` inside the loop; use the pre-loaded
+    // above0/above1 vectors instead.
+ const uint16x4_t out16_x =
+ highbd_dr_prediction_z2_apply_shift_x4(above0, above1, shift_x0123);
+
+ return highbd_dr_prediction_z2_merge_x4(out16_x, out16_y, base_shift);
+ } else {
+ const uint16x4x2_t l01 =
+ highbd_dr_prediction_z2_gather_left_x4(left + 1, base_y0123, 4);
+ return highbd_dr_prediction_z2_apply_shift_x4(l01.val[0], l01.val[1],
+ shift_y0123);
+ }
+}
+
+static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_step_x8(
+ const uint16_t *above, const uint16x8_t above0, const uint16x8_t above1,
+ const uint16_t *left, int dx, int dy, int r, int c) {
+ const int16x8_t iota = vld1q_s16(iota1_s16);
+
+ const int x0 = (c << 6) - (r + 1) * dx;
+ const int y0 = (r << 6) - (c + 1) * dy;
+
+ const int16x8_t x01234567 = vaddq_s16(vdupq_n_s16(x0), vshlq_n_s16(iota, 6));
+ const int16x8_t y01234567 = vsubq_s16(vdupq_n_s16(y0), vmulq_n_s16(iota, dy));
+ const int16x8_t shift_x01234567 =
+ vshrq_n_s16(vandq_s16(x01234567, vdupq_n_s16(0x3F)), 1);
+ const int16x8_t shift_y01234567 =
+ vshrq_n_s16(vandq_s16(y01234567, vdupq_n_s16(0x3F)), 1);
+ const int16x8_t base_y01234567 = vshrq_n_s16(y01234567, 6);
+
+ const int base_shift = ((((r + 1) * dx) - 1) >> 6) - c;
+
+ // Based on the value of `base_shift` there are three possible cases to
+ // compute the result:
+ // 1) base_shift <= 0: We can load and operate entirely on data from the
+ // `above` input vector.
+ // 2) base_shift < vl: We can load from `above[-1]` and shift
+ // `vl - base_shift` elements across to the end of the
+ // vector, then compute the remainder from `left`.
+ // 3) base_shift >= vl: We can load and operate entirely on data from the
+ // `left` input vector.
+
+ if (base_shift <= 0) {
+ const int base_x = x0 >> 6;
+ const uint16x8_t a0 = vld1q_u16(above + base_x);
+ const uint16x8_t a1 = vld1q_u16(above + base_x + 1);
+ return highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567);
+ } else if (base_shift < 8) {
+ const uint16x8x2_t l01 = highbd_dr_prediction_z2_gather_left_x8(
+ left + 1, base_y01234567, base_shift);
+ const uint16x8_t out16_y = highbd_dr_prediction_z2_apply_shift_x8(
+ l01.val[0], l01.val[1], shift_y01234567);
+
+    // No need to reload from `above` inside the loop; use the pre-loaded
+    // above0/above1 vectors instead.
+ const uint16x8_t out16_x =
+ highbd_dr_prediction_z2_apply_shift_x8(above0, above1, shift_x01234567);
+
+ return highbd_dr_prediction_z2_merge_x8(out16_x, out16_y, base_shift);
+ } else {
+ const uint16x8x2_t l01 =
+ highbd_dr_prediction_z2_gather_left_x8(left + 1, base_y01234567, 8);
+ return highbd_dr_prediction_z2_apply_shift_x8(l01.val[0], l01.val[1],
+ shift_y01234567);
+ }
+}
+
+// Left array is accessed from -1 through `bh - 1` inclusive.
+// Above array is accessed from -1 through `bw - 1` inclusive.
+#define HIGHBD_DR_PREDICTOR_Z2_WXH(bw, bh) \
+ static void highbd_dr_prediction_z2_##bw##x##bh##_neon( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+ const uint16_t *left, int upsample_above, int upsample_left, int dx, \
+ int dy, int bd) { \
+ (void)bd; \
+ (void)upsample_above; \
+ (void)upsample_left; \
+ assert(!upsample_above); \
+ assert(!upsample_left); \
+ assert(bw % 4 == 0); \
+ assert(bh % 4 == 0); \
+ assert(dx > 0); \
+ assert(dy > 0); \
+ \
+ uint16_t left_data[bh + 1]; \
+ memcpy(left_data, left - 1, (bh + 1) * sizeof(uint16_t)); \
+ \
+ uint16x8_t a0, a1; \
+ if (bw == 4) { \
+ a0 = vcombine_u16(vld1_u16(above - 1), vdup_n_u16(0)); \
+ a1 = vcombine_u16(vld1_u16(above + 0), vdup_n_u16(0)); \
+ } else { \
+ a0 = vld1q_u16(above - 1); \
+ a1 = vld1q_u16(above + 0); \
+ } \
+ \
+ int r = 0; \
+ do { \
+ if (bw == 4) { \
+ vst1_u16(dst, highbd_dr_prediction_z2_step_x4( \
+ above, vget_low_u16(a0), vget_low_u16(a1), \
+ left_data, dx, dy, r, 0)); \
+ } else { \
+ int c = 0; \
+ do { \
+ vst1q_u16(dst + c, highbd_dr_prediction_z2_step_x8( \
+ above, a0, a1, left_data, dx, dy, r, c)); \
+ c += 8; \
+ } while (c < bw); \
+ } \
+ dst += stride; \
+ } while (++r < bh); \
+ }
+
+HIGHBD_DR_PREDICTOR_Z2_WXH(4, 16)
+HIGHBD_DR_PREDICTOR_Z2_WXH(8, 16)
+HIGHBD_DR_PREDICTOR_Z2_WXH(8, 32)
+HIGHBD_DR_PREDICTOR_Z2_WXH(16, 4)
+HIGHBD_DR_PREDICTOR_Z2_WXH(16, 8)
+HIGHBD_DR_PREDICTOR_Z2_WXH(16, 16)
+HIGHBD_DR_PREDICTOR_Z2_WXH(16, 32)
+HIGHBD_DR_PREDICTOR_Z2_WXH(16, 64)
+HIGHBD_DR_PREDICTOR_Z2_WXH(32, 8)
+HIGHBD_DR_PREDICTOR_Z2_WXH(32, 16)
+HIGHBD_DR_PREDICTOR_Z2_WXH(32, 32)
+HIGHBD_DR_PREDICTOR_Z2_WXH(32, 64)
+HIGHBD_DR_PREDICTOR_Z2_WXH(64, 16)
+HIGHBD_DR_PREDICTOR_Z2_WXH(64, 32)
+HIGHBD_DR_PREDICTOR_Z2_WXH(64, 64)
+
+#undef HIGHBD_DR_PREDICTOR_Z2_WXH
+
+typedef void (*highbd_dr_prediction_z2_ptr)(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left,
+ int upsample_above,
+ int upsample_left, int dx, int dy,
+ int bd);
+
+static void highbd_dr_prediction_z2_4x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left,
+ int upsample_above,
+ int upsample_left, int dx, int dy,
+ int bd) {
+ (void)bd;
+ assert(dx > 0);
+ assert(dy > 0);
+
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+ const int min_base_x = -(1 << (upsample_above + frac_bits_x));
+
+  // If `upsample_left` we need left[-2] through left[6] inclusive; otherwise
+  // we only need left[-1] through left[3].
+
+#if AOM_ARCH_AARCH64
+ uint16x8_t left_data0, left_data1;
+ if (upsample_left) {
+ left_data0 = vld1q_u16(left - 2);
+ left_data1 = vld1q_u16(left - 1);
+ } else {
+ left_data0 = vcombine_u16(vld1_u16(left - 1), vdup_n_u16(0));
+ left_data1 = vcombine_u16(vld1_u16(left + 0), vdup_n_u16(0));
+ }
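+  // In both cases left_data1 holds the same samples as left_data0 advanced by
+  // one element, so indexing both with the same base_y indices yields the
+  // adjacent (l0, l1) sample pairs needed for interpolation.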
+#endif
+
+ const int16x4_t iota0123 = vld1_s16(iota1_s16);
+ const int16x4_t iota1234 = vld1_s16(iota1_s16 + 1);
+
+ for (int r = 0; r < 4; ++r) {
+ const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6;
+ const int x0 = (r + 1) * dx;
+ const int16x4_t x0123 = vsub_s16(vshl_n_s16(iota0123, 6), vdup_n_s16(x0));
+ const int base_x0 = (-x0) >> frac_bits_x;
+ if (base_shift <= 0) {
+ uint16x4_t a0, a1;
+ int16x4_t shift_x0123;
+ if (upsample_above) {
+ const uint16x4x2_t a01 = vld2_u16(above + base_x0);
+ a0 = a01.val[0];
+ a1 = a01.val[1];
+ shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F));
+ } else {
+ a0 = vld1_u16(above + base_x0);
+ a1 = vld1_u16(above + base_x0 + 1);
+ shift_x0123 = vshr_n_s16(vand_s16(x0123, vdup_n_s16(0x3F)), 1);
+ }
+ vst1_u16(dst,
+ highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123));
+ } else if (base_shift < 4) {
+ // Calculate Y component from `left`.
+ const int y_iters = base_shift;
+ const int16x4_t y0123 =
+ vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy));
+ const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y));
+ const int16x4_t shift_y0123 = vshr_n_s16(
+ vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1);
+ uint16x4_t l0, l1;
+#if AOM_ARCH_AARCH64
+ const int left_data_base = upsample_left ? -2 : -1;
+ l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data0, base_y0123,
+ left_data_base, y_iters);
+ l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data1, base_y0123,
+ left_data_base, y_iters);
+#else
+ const uint16x4x2_t l01 =
+ highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, y_iters);
+ l0 = l01.val[0];
+ l1 = l01.val[1];
+#endif
+
+ const uint16x4_t out_y =
+ highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123);
+
+ // Calculate X component from `above`.
+ const int16x4_t shift_x0123 = vshr_n_s16(
+ vand_s16(vmul_n_s16(x0123, 1 << upsample_above), vdup_n_s16(0x3F)),
+ 1);
+ uint16x4_t a0, a1;
+ if (upsample_above) {
+ const uint16x4x2_t a01 = vld2_u16(above + (base_x0 % 2 == 0 ? -2 : -1));
+ a0 = a01.val[0];
+ a1 = a01.val[1];
+ } else {
+ a0 = vld1_u16(above - 1);
+ a1 = vld1_u16(above + 0);
+ }
+ const uint16x4_t out_x =
+ highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123);
+
+ // Combine X and Y vectors.
+ const uint16x4_t out =
+ highbd_dr_prediction_z2_merge_x4(out_x, out_y, base_shift);
+ vst1_u16(dst, out);
+ } else {
+ const int16x4_t y0123 =
+ vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy));
+ const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y));
+ const int16x4_t shift_y0123 = vshr_n_s16(
+ vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1);
+ uint16x4_t l0, l1;
+#if AOM_ARCH_AARCH64
+ const int left_data_base = upsample_left ? -2 : -1;
+ l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data0, base_y0123,
+ left_data_base, 4);
+ l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data1, base_y0123,
+ left_data_base, 4);
+#else
+ const uint16x4x2_t l01 =
+ highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, 4);
+ l0 = l01.val[0];
+ l1 = l01.val[1];
+#endif
+ vst1_u16(dst,
+ highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123));
+ }
+ dst += stride;
+ }
+}
+
+static void highbd_dr_prediction_z2_4x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left,
+ int upsample_above,
+ int upsample_left, int dx, int dy,
+ int bd) {
+ (void)bd;
+ assert(dx > 0);
+ assert(dy > 0);
+
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+ const int min_base_x = -(1 << (upsample_above + frac_bits_x));
+
+ // if `upsample_left` then we need -2 through 14 inclusive from `left`.
+ // else we only need -1 through 6 inclusive.
+
+#if AOM_ARCH_AARCH64
+ uint16x8x2_t left_data0, left_data1;
+ if (upsample_left) {
+ left_data0 = vld1q_u16_x2(left - 2);
+ left_data1 = vld1q_u16_x2(left - 1);
+ } else {
+ left_data0 = (uint16x8x2_t){ { vld1q_u16(left - 1), vdupq_n_u16(0) } };
+ left_data1 = (uint16x8x2_t){ { vld1q_u16(left + 0), vdupq_n_u16(0) } };
+ }
+#endif
+
+ const int16x4_t iota0123 = vld1_s16(iota1_s16);
+ const int16x4_t iota1234 = vld1_s16(iota1_s16 + 1);
+
+ for (int r = 0; r < 8; ++r) {
+ const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6;
+ const int x0 = (r + 1) * dx;
+ const int16x4_t x0123 = vsub_s16(vshl_n_s16(iota0123, 6), vdup_n_s16(x0));
+ const int base_x0 = (-x0) >> frac_bits_x;
+ if (base_shift <= 0) {
+ uint16x4_t a0, a1;
+ int16x4_t shift_x0123;
+ if (upsample_above) {
+ const uint16x4x2_t a01 = vld2_u16(above + base_x0);
+ a0 = a01.val[0];
+ a1 = a01.val[1];
+ shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F));
+ } else {
+ a0 = vld1_u16(above + base_x0);
+ a1 = vld1_u16(above + base_x0 + 1);
+ shift_x0123 = vand_s16(vshr_n_s16(x0123, 1), vdup_n_s16(0x1F));
+ }
+ vst1_u16(dst,
+ highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123));
+ } else if (base_shift < 4) {
+ // Calculate Y component from `left`.
+ const int y_iters = base_shift;
+ const int16x4_t y0123 =
+ vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy));
+ const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y));
+ const int16x4_t shift_y0123 = vshr_n_s16(
+ vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1);
+
+ uint16x4_t l0, l1;
+#if AOM_ARCH_AARCH64
+ const int left_data_base = upsample_left ? -2 : -1;
+ l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(
+ left_data0, base_y0123, left_data_base, y_iters);
+ l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(
+ left_data1, base_y0123, left_data_base, y_iters);
+#else
+ const uint16x4x2_t l01 =
+ highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, y_iters);
+ l0 = l01.val[0];
+ l1 = l01.val[1];
+#endif
+
+ const uint16x4_t out_y =
+ highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123);
+
+ // Calculate X component from `above`.
+ uint16x4_t a0, a1;
+ int16x4_t shift_x0123;
+ if (upsample_above) {
+ const uint16x4x2_t a01 = vld2_u16(above + (base_x0 % 2 == 0 ? -2 : -1));
+ a0 = a01.val[0];
+ a1 = a01.val[1];
+ shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F));
+ } else {
+ a0 = vld1_u16(above - 1);
+ a1 = vld1_u16(above + 0);
+ shift_x0123 = vand_s16(vshr_n_s16(x0123, 1), vdup_n_s16(0x1F));
+ }
+ const uint16x4_t out_x =
+ highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123);
+
+ // Combine X and Y vectors.
+ const uint16x4_t out =
+ highbd_dr_prediction_z2_merge_x4(out_x, out_y, base_shift);
+ vst1_u16(dst, out);
+ } else {
+ const int16x4_t y0123 =
+ vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy));
+ const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y));
+ const int16x4_t shift_y0123 = vshr_n_s16(
+ vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1);
+
+ uint16x4_t l0, l1;
+#if AOM_ARCH_AARCH64
+ const int left_data_base = upsample_left ? -2 : -1;
+ l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(left_data0, base_y0123,
+ left_data_base, 4);
+ l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(left_data1, base_y0123,
+ left_data_base, 4);
+#else
+ const uint16x4x2_t l01 =
+ highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, 4);
+ l0 = l01.val[0];
+ l1 = l01.val[1];
+#endif
+
+ vst1_u16(dst,
+ highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123));
+ }
+ dst += stride;
+ }
+}
+
+static void highbd_dr_prediction_z2_8x4_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left,
+ int upsample_above,
+ int upsample_left, int dx, int dy,
+ int bd) {
+ (void)bd;
+ assert(dx > 0);
+ assert(dy > 0);
+
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+ const int min_base_x = -(1 << (upsample_above + frac_bits_x));
+
+ // if `upsample_left` then we need -2 through 6 inclusive from `left`.
+ // else we only need -1 through 3 inclusive.
+
+#if AOM_ARCH_AARCH64
+ uint16x8_t left_data0, left_data1;
+ if (upsample_left) {
+ left_data0 = vld1q_u16(left - 2);
+ left_data1 = vld1q_u16(left - 1);
+ } else {
+ left_data0 = vcombine_u16(vld1_u16(left - 1), vdup_n_u16(0));
+ left_data1 = vcombine_u16(vld1_u16(left + 0), vdup_n_u16(0));
+ }
+#endif
+
+ const int16x8_t iota01234567 = vld1q_s16(iota1_s16);
+ const int16x8_t iota12345678 = vld1q_s16(iota1_s16 + 1);
+
+ for (int r = 0; r < 4; ++r) {
+ const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6;
+ const int x0 = (r + 1) * dx;
+ const int16x8_t x01234567 =
+ vsubq_s16(vshlq_n_s16(iota01234567, 6), vdupq_n_s16(x0));
+ const int base_x0 = (-x0) >> frac_bits_x;
+ if (base_shift <= 0) {
+ uint16x8_t a0, a1;
+ int16x8_t shift_x01234567;
+ if (upsample_above) {
+ const uint16x8x2_t a01 = vld2q_u16(above + base_x0);
+ a0 = a01.val[0];
+ a1 = a01.val[1];
+ shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F));
+ } else {
+ a0 = vld1q_u16(above + base_x0);
+ a1 = vld1q_u16(above + base_x0 + 1);
+ shift_x01234567 =
+ vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F));
+ }
+ vst1q_u16(
+ dst, highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567));
+ } else if (base_shift < 8) {
+ // Calculate Y component from `left`.
+ const int y_iters = base_shift;
+ const int16x8_t y01234567 =
+ vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy));
+ const int16x8_t base_y01234567 =
+ vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y));
+ const int16x8_t shift_y01234567 =
+ vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left),
+ vdupq_n_s16(0x3F)),
+ 1);
+
+ uint16x8_t l0, l1;
+#if AOM_ARCH_AARCH64
+ const int left_data_base = upsample_left ? -2 : -1;
+ l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x8(
+ left_data0, base_y01234567, left_data_base, y_iters);
+ l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x8(
+ left_data1, base_y01234567, left_data_base, y_iters);
+#else
+ const uint16x8x2_t l01 =
+ highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, y_iters);
+ l0 = l01.val[0];
+ l1 = l01.val[1];
+#endif
+
+ const uint16x8_t out_y =
+ highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567);
+
+ // Calculate X component from `above`.
+ uint16x8_t a0, a1;
+ int16x8_t shift_x01234567;
+ if (upsample_above) {
+ const uint16x8x2_t a01 =
+ vld2q_u16(above + (base_x0 % 2 == 0 ? -2 : -1));
+ a0 = a01.val[0];
+ a1 = a01.val[1];
+ shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F));
+ } else {
+ a0 = vld1q_u16(above - 1);
+ a1 = vld1q_u16(above + 0);
+ shift_x01234567 =
+ vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F));
+ }
+ const uint16x8_t out_x =
+ highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567);
+
+ // Combine X and Y vectors.
+ const uint16x8_t out =
+ highbd_dr_prediction_z2_merge_x8(out_x, out_y, base_shift);
+ vst1q_u16(dst, out);
+ } else {
+ const int16x8_t y01234567 =
+ vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy));
+ const int16x8_t base_y01234567 =
+ vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y));
+ const int16x8_t shift_y01234567 =
+ vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left),
+ vdupq_n_s16(0x3F)),
+ 1);
+
+ uint16x8_t l0, l1;
+#if AOM_ARCH_AARCH64
+ const int left_data_base = upsample_left ? -2 : -1;
+ l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x8(
+ left_data0, base_y01234567, left_data_base, 8);
+ l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x8(
+ left_data1, base_y01234567, left_data_base, 8);
+#else
+ const uint16x8x2_t l01 =
+ highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, 8);
+ l0 = l01.val[0];
+ l1 = l01.val[1];
+#endif
+
+ vst1q_u16(
+ dst, highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567));
+ }
+ dst += stride;
+ }
+}
+
+static void highbd_dr_prediction_z2_8x8_neon(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left,
+ int upsample_above,
+ int upsample_left, int dx, int dy,
+ int bd) {
+ (void)bd;
+ assert(dx > 0);
+ assert(dy > 0);
+
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+ const int min_base_x = -(1 << (upsample_above + frac_bits_x));
+
+ // if `upsample_left` then we need -2 through 14 inclusive from `left`.
+ // else we only need -1 through 6 inclusive.
+
+#if AOM_ARCH_AARCH64
+ uint16x8x2_t left_data0, left_data1;
+ if (upsample_left) {
+ left_data0 = vld1q_u16_x2(left - 2);
+ left_data1 = vld1q_u16_x2(left - 1);
+ } else {
+ left_data0 = (uint16x8x2_t){ { vld1q_u16(left - 1), vdupq_n_u16(0) } };
+ left_data1 = (uint16x8x2_t){ { vld1q_u16(left + 0), vdupq_n_u16(0) } };
+ }
+#endif
+
+ const int16x8_t iota01234567 = vld1q_s16(iota1_s16);
+ const int16x8_t iota12345678 = vld1q_s16(iota1_s16 + 1);
+
+ for (int r = 0; r < 8; ++r) {
+ const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6;
+ const int x0 = (r + 1) * dx;
+ const int16x8_t x01234567 =
+ vsubq_s16(vshlq_n_s16(iota01234567, 6), vdupq_n_s16(x0));
+ const int base_x0 = (-x0) >> frac_bits_x;
+ if (base_shift <= 0) {
+ uint16x8_t a0, a1;
+ int16x8_t shift_x01234567;
+ if (upsample_above) {
+ const uint16x8x2_t a01 = vld2q_u16(above + base_x0);
+ a0 = a01.val[0];
+ a1 = a01.val[1];
+ shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F));
+ } else {
+ a0 = vld1q_u16(above + base_x0);
+ a1 = vld1q_u16(above + base_x0 + 1);
+ shift_x01234567 =
+ vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F));
+ }
+ vst1q_u16(
+ dst, highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567));
+ } else if (base_shift < 8) {
+ // Calculate Y component from `left`.
+ const int y_iters = base_shift;
+ const int16x8_t y01234567 =
+ vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy));
+ const int16x8_t base_y01234567 =
+ vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y));
+ const int16x8_t shift_y01234567 =
+ vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left),
+ vdupq_n_s16(0x3F)),
+ 1);
+
+ uint16x8_t l0, l1;
+#if AOM_ARCH_AARCH64
+ const int left_data_base = upsample_left ? -2 : -1;
+ l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
+ left_data0, base_y01234567, left_data_base, y_iters);
+ l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
+ left_data1, base_y01234567, left_data_base, y_iters);
+#else
+ const uint16x8x2_t l01 =
+ highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, y_iters);
+ l0 = l01.val[0];
+ l1 = l01.val[1];
+#endif
+
+ const uint16x8_t out_y =
+ highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567);
+
+ // Calculate X component from `above`.
+ uint16x8_t a0, a1;
+ int16x8_t shift_x01234567;
+ if (upsample_above) {
+ const uint16x8x2_t a01 =
+ vld2q_u16(above + (base_x0 % 2 == 0 ? -2 : -1));
+ a0 = a01.val[0];
+ a1 = a01.val[1];
+ shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F));
+ } else {
+ a0 = vld1q_u16(above - 1);
+ a1 = vld1q_u16(above + 0);
+ shift_x01234567 =
+ vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F));
+ }
+ const uint16x8_t out_x =
+ highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567);
+
+ // Combine X and Y vectors.
+ const uint16x8_t out =
+ highbd_dr_prediction_z2_merge_x8(out_x, out_y, base_shift);
+ vst1q_u16(dst, out);
+ } else {
+ const int16x8_t y01234567 =
+ vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy));
+ const int16x8_t base_y01234567 =
+ vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y));
+ const int16x8_t shift_y01234567 =
+ vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left),
+ vdupq_n_s16(0x3F)),
+ 1);
+
+ uint16x8_t l0, l1;
+#if AOM_ARCH_AARCH64
+ const int left_data_base = upsample_left ? -2 : -1;
+ l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
+ left_data0, base_y01234567, left_data_base, 8);
+ l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
+ left_data1, base_y01234567, left_data_base, 8);
+#else
+ const uint16x8x2_t l01 =
+ highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, 8);
+ l0 = l01.val[0];
+ l1 = l01.val[1];
+#endif
+
+ vst1q_u16(
+ dst, highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567));
+ }
+ dst += stride;
+ }
+}
+
+static highbd_dr_prediction_z2_ptr dr_predictor_z2_arr_neon[7][7] = {
+ { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
+ { NULL, NULL, &highbd_dr_prediction_z2_4x4_neon,
+ &highbd_dr_prediction_z2_4x8_neon, &highbd_dr_prediction_z2_4x16_neon, NULL,
+ NULL },
+ { NULL, NULL, &highbd_dr_prediction_z2_8x4_neon,
+ &highbd_dr_prediction_z2_8x8_neon, &highbd_dr_prediction_z2_8x16_neon,
+ &highbd_dr_prediction_z2_8x32_neon, NULL },
+ { NULL, NULL, &highbd_dr_prediction_z2_16x4_neon,
+ &highbd_dr_prediction_z2_16x8_neon, &highbd_dr_prediction_z2_16x16_neon,
+ &highbd_dr_prediction_z2_16x32_neon, &highbd_dr_prediction_z2_16x64_neon },
+ { NULL, NULL, NULL, &highbd_dr_prediction_z2_32x8_neon,
+ &highbd_dr_prediction_z2_32x16_neon, &highbd_dr_prediction_z2_32x32_neon,
+ &highbd_dr_prediction_z2_32x64_neon },
+ { NULL, NULL, NULL, NULL, &highbd_dr_prediction_z2_64x16_neon,
+ &highbd_dr_prediction_z2_64x32_neon, &highbd_dr_prediction_z2_64x64_neon },
+};
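+// The table above is indexed by log2 of the block dimensions, e.g. a 16x8
+// block uses dr_predictor_z2_arr_neon[get_msb(16)][get_msb(8)], i.e. entry
+// [4][3].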
+
+// Directional prediction, zone 2: 90 < angle < 180
+void av1_highbd_dr_prediction_z2_neon(uint16_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint16_t *above,
+ const uint16_t *left, int upsample_above,
+ int upsample_left, int dx, int dy,
+ int bd) {
+ highbd_dr_prediction_z2_ptr f =
+ dr_predictor_z2_arr_neon[get_msb(bw)][get_msb(bh)];
+ assert(f != NULL);
+ f(dst, stride, above, left, upsample_above, upsample_left, dx, dy, bd);
+}

+
+// -----------------------------------------------------------------------------
+// Z3
+
+// Both the lane to use and the shift amount must be immediates.
+#define HIGHBD_DR_PREDICTOR_Z3_STEP_X4(out, iota, base, in0, in1, s0, s1, \
+ lane, shift) \
+ do { \
+ uint32x4_t val = vmull_lane_u16((in0), (s0), (lane)); \
+ val = vmlal_lane_u16(val, (in1), (s1), (lane)); \
+ const uint16x4_t cmp = vadd_u16((iota), vdup_n_u16(base)); \
+ const uint16x4_t res = vrshrn_n_u32(val, (shift)); \
+ *(out) = vbsl_u16(vclt_u16(cmp, vdup_n_u16(max_base_y)), res, \
+ vdup_n_u16(left_max)); \
+ } while (0)
+
+#define HIGHBD_DR_PREDICTOR_Z3_STEP_X8(out, iota, base, in0, in1, s0, s1, \
+ lane, shift) \
+ do { \
+ uint32x4_t val_lo = vmull_lane_u16(vget_low_u16(in0), (s0), (lane)); \
+ val_lo = vmlal_lane_u16(val_lo, vget_low_u16(in1), (s1), (lane)); \
+ uint32x4_t val_hi = vmull_lane_u16(vget_high_u16(in0), (s0), (lane)); \
+ val_hi = vmlal_lane_u16(val_hi, vget_high_u16(in1), (s1), (lane)); \
+ const uint16x8_t cmp = vaddq_u16((iota), vdupq_n_u16(base)); \
+ const uint16x8_t res = vcombine_u16(vrshrn_n_u32(val_lo, (shift)), \
+ vrshrn_n_u32(val_hi, (shift))); \
+ *(out) = vbslq_u16(vcltq_u16(cmp, vdupq_n_u16(max_base_y)), res, \
+ vdupq_n_u16(left_max)); \
+ } while (0)
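+// Scalar equivalent of one output lane of the macros above, where the
+// interpolation weights come from lane |lane| of |s0| and |s1|, and
+// |max_base_y| and |left_max| are taken from the enclosing function:
+//   val = in0 * s0[lane] + in1 * s1[lane];
+//   res = (val + (1 << (shift - 1))) >> shift;  // vrshrn_n_u32
+//   out = (iota + base < max_base_y) ? res : left_max;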
+
+static void highbd_dr_prediction_z3_upsample0_neon(uint16_t *dst,
+ ptrdiff_t stride, int bw,
+ int bh, const uint16_t *left,
+ int dy) {
+ assert(bw % 4 == 0);
+ assert(bh % 4 == 0);
+ assert(dy > 0);
+
+ // Factor out left + 1 to give the compiler a better chance of recognising
+ // that the offsets used for the loads from left and left + 1 are otherwise
+ // identical.
+ const uint16_t *left1 = left + 1;
+
+ const int max_base_y = (bw + bh - 1);
+ const int left_max = left[max_base_y];
+ const int frac_bits = 6;
+
+ const uint16x8_t iota1x8 = vreinterpretq_u16_s16(vld1q_s16(iota1_s16));
+ const uint16x4_t iota1x4 = vget_low_u16(iota1x8);
+
+ // The C implementation of the z3 predictor when not upsampling uses:
+ // ((y & 0x3f) >> 1)
+  // The right shift is unnecessary here since the final narrowing shift is by
+  // one extra bit (6 instead of 5), so adjust the mask to 0x3e to ensure the
+  // extra low bit is not considered.
+ const uint16x4_t shift_mask = vdup_n_u16(0x3e);
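+  // For example, if (y + i * dy) & 0x3e == 38, the weights here are
+  // (64 - 38, 38) with a final shift of 6, matching the C code's weights
+  // (32 - 19, 19) with a shift of 5.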
+
+ if (bh == 4) {
+ int y = dy;
+ int c = 0;
+ do {
+ // Fully unroll the 4x4 block to allow us to use immediate lane-indexed
+ // multiply instructions.
+ const uint16x4_t shifts1 =
+ vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
+ const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(64), shifts1);
+ const int base0 = (y + 0 * dy) >> frac_bits;
+ const int base1 = (y + 1 * dy) >> frac_bits;
+ const int base2 = (y + 2 * dy) >> frac_bits;
+ const int base3 = (y + 3 * dy) >> frac_bits;
+ uint16x4_t out[4];
+ if (base0 >= max_base_y) {
+ out[0] = vdup_n_u16(left_max);
+ } else {
+ const uint16x4_t l00 = vld1_u16(left + base0);
+ const uint16x4_t l01 = vld1_u16(left1 + base0);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[0], iota1x4, base0, l00, l01,
+ shifts0, shifts1, 0, 6);
+ }
+ if (base1 >= max_base_y) {
+ out[1] = vdup_n_u16(left_max);
+ } else {
+ const uint16x4_t l10 = vld1_u16(left + base1);
+ const uint16x4_t l11 = vld1_u16(left1 + base1);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[1], iota1x4, base1, l10, l11,
+ shifts0, shifts1, 1, 6);
+ }
+ if (base2 >= max_base_y) {
+ out[2] = vdup_n_u16(left_max);
+ } else {
+ const uint16x4_t l20 = vld1_u16(left + base2);
+ const uint16x4_t l21 = vld1_u16(left1 + base2);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[2], iota1x4, base2, l20, l21,
+ shifts0, shifts1, 2, 6);
+ }
+ if (base3 >= max_base_y) {
+ out[3] = vdup_n_u16(left_max);
+ } else {
+ const uint16x4_t l30 = vld1_u16(left + base3);
+ const uint16x4_t l31 = vld1_u16(left1 + base3);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[3], iota1x4, base3, l30, l31,
+ shifts0, shifts1, 3, 6);
+ }
+ transpose_array_inplace_u16_4x4(out);
+ for (int r2 = 0; r2 < 4; ++r2) {
+ vst1_u16(dst + r2 * stride + c, out[r2]);
+ }
+ y += 4 * dy;
+ c += 4;
+ } while (c < bw);
+ } else {
+ int y = dy;
+ int c = 0;
+ do {
+ int r = 0;
+ do {
+ // Fully unroll the 4x4 block to allow us to use immediate lane-indexed
+ // multiply instructions.
+ const uint16x4_t shifts1 =
+ vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
+ const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(64), shifts1);
+ const int base0 = ((y + 0 * dy) >> frac_bits) + r;
+ const int base1 = ((y + 1 * dy) >> frac_bits) + r;
+ const int base2 = ((y + 2 * dy) >> frac_bits) + r;
+ const int base3 = ((y + 3 * dy) >> frac_bits) + r;
+ uint16x8_t out[4];
+ if (base0 >= max_base_y) {
+ out[0] = vdupq_n_u16(left_max);
+ } else {
+ const uint16x8_t l00 = vld1q_u16(left + base0);
+ const uint16x8_t l01 = vld1q_u16(left1 + base0);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota1x8, base0, l00, l01,
+ shifts0, shifts1, 0, 6);
+ }
+ if (base1 >= max_base_y) {
+ out[1] = vdupq_n_u16(left_max);
+ } else {
+ const uint16x8_t l10 = vld1q_u16(left + base1);
+ const uint16x8_t l11 = vld1q_u16(left1 + base1);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota1x8, base1, l10, l11,
+ shifts0, shifts1, 1, 6);
+ }
+ if (base2 >= max_base_y) {
+ out[2] = vdupq_n_u16(left_max);
+ } else {
+ const uint16x8_t l20 = vld1q_u16(left + base2);
+ const uint16x8_t l21 = vld1q_u16(left1 + base2);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota1x8, base2, l20, l21,
+ shifts0, shifts1, 2, 6);
+ }
+ if (base3 >= max_base_y) {
+ out[3] = vdupq_n_u16(left_max);
+ } else {
+ const uint16x8_t l30 = vld1q_u16(left + base3);
+ const uint16x8_t l31 = vld1q_u16(left1 + base3);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota1x8, base3, l30, l31,
+ shifts0, shifts1, 3, 6);
+ }
+ transpose_array_inplace_u16_4x8(out);
+ for (int r2 = 0; r2 < 4; ++r2) {
+ vst1_u16(dst + (r + r2) * stride + c, vget_low_u16(out[r2]));
+ }
+ for (int r2 = 0; r2 < 4; ++r2) {
+ vst1_u16(dst + (r + r2 + 4) * stride + c, vget_high_u16(out[r2]));
+ }
+ r += 8;
+ } while (r < bh);
+ y += 4 * dy;
+ c += 4;
+ } while (c < bw);
+ }
+}
+
+static void highbd_dr_prediction_z3_upsample1_neon(uint16_t *dst,
+ ptrdiff_t stride, int bw,
+ int bh, const uint16_t *left,
+ int dy) {
+ assert(bw % 4 == 0);
+ assert(bh % 4 == 0);
+ assert(dy > 0);
+
+ const int max_base_y = (bw + bh - 1) << 1;
+ const int left_max = left[max_base_y];
+ const int frac_bits = 5;
+
+ const uint16x4_t iota1x4 = vreinterpret_u16_s16(vld1_s16(iota1_s16));
+ const uint16x8_t iota2x8 = vreinterpretq_u16_s16(vld1q_s16(iota2_s16));
+ const uint16x4_t iota2x4 = vget_low_u16(iota2x8);
+
+ // The C implementation of the z3 predictor when upsampling uses:
+ // (((x << 1) & 0x3f) >> 1)
+ // The two shifts are unnecessary here since the lowest bit is guaranteed to
+ // be zero when the mask is applied, so adjust the mask to 0x1f to avoid
+ // needing the shifts at all.
+ const uint16x4_t shift_mask = vdup_n_u16(0x1F);
+
+ if (bh == 4) {
+ int y = dy;
+ int c = 0;
+ do {
+ // Fully unroll the 4x4 block to allow us to use immediate lane-indexed
+ // multiply instructions.
+ const uint16x4_t shifts1 =
+ vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
+ const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(32), shifts1);
+ const int base0 = (y + 0 * dy) >> frac_bits;
+ const int base1 = (y + 1 * dy) >> frac_bits;
+ const int base2 = (y + 2 * dy) >> frac_bits;
+ const int base3 = (y + 3 * dy) >> frac_bits;
+ const uint16x4x2_t l0 = vld2_u16(left + base0);
+ const uint16x4x2_t l1 = vld2_u16(left + base1);
+ const uint16x4x2_t l2 = vld2_u16(left + base2);
+ const uint16x4x2_t l3 = vld2_u16(left + base3);
+ uint16x4_t out[4];
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[0], iota2x4, base0, l0.val[0],
+ l0.val[1], shifts0, shifts1, 0, 5);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[1], iota2x4, base1, l1.val[0],
+ l1.val[1], shifts0, shifts1, 1, 5);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[2], iota2x4, base2, l2.val[0],
+ l2.val[1], shifts0, shifts1, 2, 5);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[3], iota2x4, base3, l3.val[0],
+ l3.val[1], shifts0, shifts1, 3, 5);
+ transpose_array_inplace_u16_4x4(out);
+ for (int r2 = 0; r2 < 4; ++r2) {
+ vst1_u16(dst + r2 * stride + c, out[r2]);
+ }
+ y += 4 * dy;
+ c += 4;
+ } while (c < bw);
+ } else {
+ assert(bh % 8 == 0);
+
+ int y = dy;
+ int c = 0;
+ do {
+ int r = 0;
+ do {
+ // Fully unroll the 4x8 block to allow us to use immediate lane-indexed
+ // multiply instructions.
+ const uint16x4_t shifts1 =
+ vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
+ const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(32), shifts1);
+ const int base0 = ((y + 0 * dy) >> frac_bits) + (r * 2);
+ const int base1 = ((y + 1 * dy) >> frac_bits) + (r * 2);
+ const int base2 = ((y + 2 * dy) >> frac_bits) + (r * 2);
+ const int base3 = ((y + 3 * dy) >> frac_bits) + (r * 2);
+ const uint16x8x2_t l0 = vld2q_u16(left + base0);
+ const uint16x8x2_t l1 = vld2q_u16(left + base1);
+ const uint16x8x2_t l2 = vld2q_u16(left + base2);
+ const uint16x8x2_t l3 = vld2q_u16(left + base3);
+ uint16x8_t out[4];
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota2x8, base0, l0.val[0],
+ l0.val[1], shifts0, shifts1, 0, 5);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota2x8, base1, l1.val[0],
+ l1.val[1], shifts0, shifts1, 1, 5);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota2x8, base2, l2.val[0],
+ l2.val[1], shifts0, shifts1, 2, 5);
+ HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota2x8, base3, l3.val[0],
+ l3.val[1], shifts0, shifts1, 3, 5);
+ transpose_array_inplace_u16_4x8(out);
+ for (int r2 = 0; r2 < 4; ++r2) {
+ vst1_u16(dst + (r + r2) * stride + c, vget_low_u16(out[r2]));
+ }
+ for (int r2 = 0; r2 < 4; ++r2) {
+ vst1_u16(dst + (r + r2 + 4) * stride + c, vget_high_u16(out[r2]));
+ }
+ r += 8;
+ } while (r < bh);
+ y += 4 * dy;
+ c += 4;
+ } while (c < bw);
+ }
+}
+
+// Directional prediction, zone 3: 180 < angle < 270
+void av1_highbd_dr_prediction_z3_neon(uint16_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint16_t *above,
+ const uint16_t *left, int upsample_left,
+ int dx, int dy, int bd) {
+ (void)above;
+ (void)dx;
+ (void)bd;
+ assert(bw % 4 == 0);
+ assert(bh % 4 == 0);
+ assert(dx == 1);
+ assert(dy > 0);
+
+ if (upsample_left) {
+ highbd_dr_prediction_z3_upsample1_neon(dst, stride, bw, bh, left, dy);
+ } else {
+ highbd_dr_prediction_z3_upsample0_neon(dst, stride, bw, bh, left, dy);
+ }
+}
+
+#undef HIGHBD_DR_PREDICTOR_Z3_STEP_X4
+#undef HIGHBD_DR_PREDICTOR_Z3_STEP_X8
diff --git a/third_party/aom/aom_dsp/arm/highbd_loopfilter_neon.c b/third_party/aom/aom_dsp/arm/highbd_loopfilter_neon.c
new file mode 100644
index 0000000000..77727b7665
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_loopfilter_neon.c
@@ -0,0 +1,1265 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/transpose_neon.h"
+
+static INLINE int16x4_t clip3_s16(const int16x4_t val, const int16x4_t low,
+ const int16x4_t high) {
+ return vmin_s16(vmax_s16(val, low), high);
+}
+
+static INLINE uint16x8_t convert_to_unsigned_pixel_u16(int16x8_t val,
+ int bitdepth) {
+ const int16x8_t low = vdupq_n_s16(0);
+ const uint16x8_t high = vdupq_n_u16((1 << bitdepth) - 1);
+
+ return vminq_u16(vreinterpretq_u16_s16(vmaxq_s16(val, low)), high);
+}
+
+// (abs(p1 - p0) > thresh) || (abs(q1 - q0) > thresh)
+static INLINE uint16x4_t hev(const uint16x8_t abd_p0p1_q0q1,
+ const uint16_t thresh) {
+ const uint16x8_t a = vcgtq_u16(abd_p0p1_q0q1, vdupq_n_u16(thresh));
+ return vorr_u16(vget_low_u16(a), vget_high_u16(a));
+}
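+// Note: pNqN vectors throughout this file pack the four p-side values in the
+// low half and the four q-side values in the high half, so combining the two
+// halves of a packed comparison (as above) yields a single 4-lane mask that
+// covers both sides.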
+
+// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= outer_thresh
+static INLINE uint16x4_t outer_threshold(const uint16x4_t p1,
+ const uint16x4_t p0,
+ const uint16x4_t q0,
+ const uint16x4_t q1,
+ const uint16_t outer_thresh) {
+ const uint16x4_t abd_p0q0 = vabd_u16(p0, q0);
+ const uint16x4_t abd_p1q1 = vabd_u16(p1, q1);
+ const uint16x4_t p0q0_double = vshl_n_u16(abd_p0q0, 1);
+ const uint16x4_t p1q1_half = vshr_n_u16(abd_p1q1, 1);
+ const uint16x4_t sum = vadd_u16(p0q0_double, p1q1_half);
+ return vcle_u16(sum, vdup_n_u16(outer_thresh));
+}
+
+// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+// outer_threshold()
+static INLINE uint16x4_t needs_filter4(const uint16x8_t abd_p0p1_q0q1,
+ const uint16_t inner_thresh,
+ const uint16x4_t outer_mask) {
+ const uint16x8_t a = vcleq_u16(abd_p0p1_q0q1, vdupq_n_u16(inner_thresh));
+ const uint16x4_t inner_mask = vand_u16(vget_low_u16(a), vget_high_u16(a));
+ return vand_u16(inner_mask, outer_mask);
+}
+
+// abs(p2 - p1) <= inner_thresh && abs(p1 - p0) <= inner_thresh &&
+// abs(q1 - q0) <= inner_thresh && abs(q2 - q1) <= inner_thresh &&
+// outer_threshold()
+static INLINE uint16x4_t needs_filter6(const uint16x8_t abd_p0p1_q0q1,
+ const uint16x8_t abd_p1p2_q1q2,
+ const uint16_t inner_thresh,
+ const uint16x4_t outer_mask) {
+ const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2);
+ const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(inner_thresh));
+ const uint16x4_t inner_mask = vand_u16(vget_low_u16(b), vget_high_u16(b));
+ return vand_u16(inner_mask, outer_mask);
+}
+
+// abs(p3 - p2) <= inner_thresh && abs(p2 - p1) <= inner_thresh &&
+// abs(p1 - p0) <= inner_thresh && abs(q1 - q0) <= inner_thresh &&
+// abs(q2 - q1) <= inner_thresh && abs(q3 - q2) <= inner_thresh
+// outer_threshold()
+static INLINE uint16x4_t needs_filter8(const uint16x8_t abd_p0p1_q0q1,
+ const uint16x8_t abd_p1p2_q1q2,
+ const uint16x8_t abd_p2p3_q2q3,
+ const uint16_t inner_thresh,
+ const uint16x4_t outer_mask) {
+ const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p1p2_q1q2);
+ const uint16x8_t b = vmaxq_u16(a, abd_p2p3_q2q3);
+ const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(inner_thresh));
+ const uint16x4_t inner_mask = vand_u16(vget_low_u16(c), vget_high_u16(c));
+ return vand_u16(inner_mask, outer_mask);
+}
+
+// -----------------------------------------------------------------------------
+// filterN_masks functions.
+
+static INLINE void filter4_masks(const uint16x8_t p0q0, const uint16x8_t p1q1,
+ const uint16_t hev_thresh,
+ const uint16x4_t outer_mask,
+ const uint16_t inner_thresh,
+ uint16x4_t *const hev_mask,
+ uint16x4_t *const needs_filter4_mask) {
+ const uint16x8_t p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+ // This includes cases where needs_filter4() is not true and so filter2() will
+ // not be applied.
+ const uint16x4_t hev_tmp_mask = hev(p0p1_q0q1, hev_thresh);
+
+ *needs_filter4_mask = needs_filter4(p0p1_q0q1, inner_thresh, outer_mask);
+
+ // filter2() will only be applied if both needs_filter4() and hev() are true.
+ *hev_mask = vand_u16(hev_tmp_mask, *needs_filter4_mask);
+}
+
+// abs(p1 - p0) <= flat_thresh && abs(q1 - q0) <= flat_thresh &&
+// abs(p2 - p0) <= flat_thresh && abs(q2 - q0) <= flat_thresh
+// |flat_thresh| == 4 for 10 bit decode.
+static INLINE uint16x4_t is_flat3(const uint16x8_t abd_p0p1_q0q1,
+ const uint16x8_t abd_p0p2_q0q2,
+ const int bitdepth) {
+ const int flat_thresh = 1 << (bitdepth - 8);
+ const uint16x8_t a = vmaxq_u16(abd_p0p1_q0q1, abd_p0p2_q0q2);
+ const uint16x8_t b = vcleq_u16(a, vdupq_n_u16(flat_thresh));
+ return vand_u16(vget_low_u16(b), vget_high_u16(b));
+}
+
+static INLINE void filter6_masks(
+ const uint16x8_t p2q2, const uint16x8_t p1q1, const uint16x8_t p0q0,
+ const uint16_t hev_thresh, const uint16x4_t outer_mask,
+ const uint16_t inner_thresh, const int bitdepth,
+ uint16x4_t *const needs_filter6_mask, uint16x4_t *const is_flat3_mask,
+ uint16x4_t *const hev_mask) {
+ const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+ *hev_mask = hev(abd_p0p1_q0q1, hev_thresh);
+ *is_flat3_mask = is_flat3(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2), bitdepth);
+ *needs_filter6_mask = needs_filter6(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2),
+ inner_thresh, outer_mask);
+}
+
+// is_flat4 uses N=1, IsFlatOuter4 uses N=4.
+// abs(p[N] - p0) <= flat_thresh && abs(q[N] - q0) <= flat_thresh &&
+// abs(p[N+1] - p0) <= flat_thresh && abs(q[N+1] - q0) <= flat_thresh &&
+// abs(p[N+2] - p0) <= flat_thresh && abs(q[N+2] - q0) <= flat_thresh
+// |flat_thresh| == 4 for 10 bit decode.
+static INLINE uint16x4_t is_flat4(const uint16x8_t abd_pnp0_qnq0,
+ const uint16x8_t abd_pn1p0_qn1q0,
+ const uint16x8_t abd_pn2p0_qn2q0,
+ const int bitdepth) {
+ const int flat_thresh = 1 << (bitdepth - 8);
+ const uint16x8_t a = vmaxq_u16(abd_pnp0_qnq0, abd_pn1p0_qn1q0);
+ const uint16x8_t b = vmaxq_u16(a, abd_pn2p0_qn2q0);
+ const uint16x8_t c = vcleq_u16(b, vdupq_n_u16(flat_thresh));
+ return vand_u16(vget_low_u16(c), vget_high_u16(c));
+}
+
+static INLINE void filter8_masks(
+ const uint16x8_t p3q3, const uint16x8_t p2q2, const uint16x8_t p1q1,
+ const uint16x8_t p0q0, const uint16_t hev_thresh,
+ const uint16x4_t outer_mask, const uint16_t inner_thresh,
+ const int bitdepth, uint16x4_t *const needs_filter8_mask,
+ uint16x4_t *const is_flat4_mask, uint16x4_t *const hev_mask) {
+ const uint16x8_t abd_p0p1_q0q1 = vabdq_u16(p0q0, p1q1);
+ *hev_mask = hev(abd_p0p1_q0q1, hev_thresh);
+ const uint16x4_t v_is_flat4 = is_flat4(abd_p0p1_q0q1, vabdq_u16(p0q0, p2q2),
+ vabdq_u16(p0q0, p3q3), bitdepth);
+ *needs_filter8_mask =
+ needs_filter8(abd_p0p1_q0q1, vabdq_u16(p1q1, p2q2), vabdq_u16(p2q2, p3q3),
+ inner_thresh, outer_mask);
+ // |is_flat4_mask| is used to decide where to use the result of filter8.
+ // In rare cases, |is_flat4| can be true where |needs_filter8_mask| is false,
+ // overriding the question of whether to use filter8. Because filter4 doesn't
+ // apply to p2q2, |is_flat4_mask| chooses directly between filter8 and the
+ // source value. To be correct, the mask must account for this override.
+ *is_flat4_mask = vand_u16(v_is_flat4, *needs_filter8_mask);
+}
+
+// -----------------------------------------------------------------------------
+// filterN functions.
+
+// Calculate filter4() or filter2() based on |hev_mask|.
+static INLINE void filter4(const uint16x8_t p0q0, const uint16x8_t p0q1,
+ const uint16x8_t p1q1, const uint16x4_t hev_mask,
+ int bitdepth, uint16x8_t *const p1q1_result,
+ uint16x8_t *const p0q0_result) {
+ const uint16x8_t q0p1 = vextq_u16(p0q0, p1q1, 4);
+ // a = 3 * (q0 - p0) + Clip3(p1 - q1, min_signed_val, max_signed_val);
+ // q0mp0 means "q0 minus p0".
+ const int16x8_t q0mp0_p1mq1 = vreinterpretq_s16_u16(vsubq_u16(q0p1, p0q1));
+ const int16x4_t q0mp0_3 = vmul_n_s16(vget_low_s16(q0mp0_p1mq1), 3);
+
+ // If this is for filter2() then include |p1mq1|. Otherwise zero it.
+ const int16x4_t min_signed_pixel = vdup_n_s16(-(1 << (bitdepth - 1)));
+ const int16x4_t max_signed_pixel = vdup_n_s16((1 << (bitdepth - 1)) - 1);
+ const int16x4_t p1mq1 = vget_high_s16(q0mp0_p1mq1);
+ const int16x4_t p1mq1_saturated =
+ clip3_s16(p1mq1, min_signed_pixel, max_signed_pixel);
+ const int16x4_t hev_option =
+ vand_s16(vreinterpret_s16_u16(hev_mask), p1mq1_saturated);
+
+ const int16x4_t a = vadd_s16(q0mp0_3, hev_option);
+
+  // Need to figure out what is going on here; there are some unnecessary
+  // tricks to accommodate 8x8 as the smallest 8bpp vector.
+
+  // We cannot shift with rounding because the clamp comes *before* the
+  // shifting:
+  //   a1 = Clip3(a + 4, min_signed_val, max_signed_val) >> 3;
+  //   a2 = Clip3(a + 3, min_signed_val, max_signed_val) >> 3;
+ const int16x4_t plus_four =
+ clip3_s16(vadd_s16(a, vdup_n_s16(4)), min_signed_pixel, max_signed_pixel);
+ const int16x4_t plus_three =
+ clip3_s16(vadd_s16(a, vdup_n_s16(3)), min_signed_pixel, max_signed_pixel);
+ const int16x4_t a1 = vshr_n_s16(plus_four, 3);
+ const int16x4_t a2 = vshr_n_s16(plus_three, 3);
+
+ // a3 = (a1 + 1) >> 1;
+ const int16x4_t a3 = vrshr_n_s16(a1, 1);
+
+ const int16x8_t a3_ma3 = vcombine_s16(a3, vneg_s16(a3));
+ const int16x8_t p1q1_a3 = vaddq_s16(vreinterpretq_s16_u16(p1q1), a3_ma3);
+
+ // Need to shift the second term or we end up with a2_ma2.
+ const int16x8_t a2_ma1 = vcombine_s16(a2, vneg_s16(a1));
+ const int16x8_t p0q0_a = vaddq_s16(vreinterpretq_s16_u16(p0q0), a2_ma1);
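+  // Per half, the two additions above compute:
+  //   p1' = p1 + a3, q1' = q1 - a3, p0' = p0 + a2, q0' = q0 - a1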
+ *p1q1_result = convert_to_unsigned_pixel_u16(p1q1_a3, bitdepth);
+ *p0q0_result = convert_to_unsigned_pixel_u16(p0q0_a, bitdepth);
+}
+
+void aom_highbd_lpf_horizontal_4_neon(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ uint16_t *const dst_p1 = (uint16_t *)(s - 2 * pitch);
+ uint16_t *const dst_p0 = (uint16_t *)(s - pitch);
+ uint16_t *const dst_q0 = (uint16_t *)(s);
+ uint16_t *const dst_q1 = (uint16_t *)(s + pitch);
+
+ const uint16x4_t src[4] = { vld1_u16(dst_p1), vld1_u16(dst_p0),
+ vld1_u16(dst_q0), vld1_u16(dst_q1) };
+
+ // Adjust thresholds to bitdepth.
+ const int outer_thresh = *blimit << (bd - 8);
+ const int inner_thresh = *limit << (bd - 8);
+ const int hev_thresh = *thresh << (bd - 8);
+ const uint16x4_t outer_mask =
+ outer_threshold(src[0], src[1], src[2], src[3], outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter4_mask;
+ const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]);
+ const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]);
+ filter4_masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
+ &needs_filter4_mask);
+
+#if AOM_ARCH_AARCH64
+ if (vaddv_u16(needs_filter4_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // AOM_ARCH_AARCH64
+
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter4_mask_8 =
+ vcombine_u16(needs_filter4_mask, needs_filter4_mask);
+
+ uint16x8_t f_p1q1;
+ uint16x8_t f_p0q0;
+ const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]);
+ filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f_p1q1, &f_p0q0);
+
+ // Already integrated the hev mask when calculating the filtered values.
+ const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0);
+
+ // p1/q1 are unmodified if only hev() is true. This works because it was and'd
+ // with |needs_filter4_mask| previously.
+ const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8);
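+  // Because |hev_mask| is a subset of |needs_filter4_mask|, the XOR above is
+  // equivalent to needs_filter4 && !hev.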
+ const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1);
+
+ vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+ vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+ vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+ vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+}
+
+void aom_highbd_lpf_horizontal_4_dual_neon(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_horizontal_4_neon(s, pitch, blimit0, limit0, thresh0, bd);
+ aom_highbd_lpf_horizontal_4_neon(s + 4, pitch, blimit1, limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_vertical_4_neon(uint16_t *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ // Offset by 2 uint16_t values to load from first p1 position.
+ uint16_t *dst = s - 2;
+ uint16_t *dst_p1 = dst;
+ uint16_t *dst_p0 = dst + pitch;
+ uint16_t *dst_q0 = dst + pitch * 2;
+ uint16_t *dst_q1 = dst + pitch * 3;
+
+ uint16x4_t src[4] = { vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
+ vld1_u16(dst_q1) };
+ transpose_array_inplace_u16_4x4(src);
+
+ // Adjust thresholds to bitdepth.
+ const int outer_thresh = *blimit << (bd - 8);
+ const int inner_thresh = *limit << (bd - 8);
+ const int hev_thresh = *thresh << (bd - 8);
+ const uint16x4_t outer_mask =
+ outer_threshold(src[0], src[1], src[2], src[3], outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter4_mask;
+ const uint16x8_t p0q0 = vcombine_u16(src[1], src[2]);
+ const uint16x8_t p1q1 = vcombine_u16(src[0], src[3]);
+ filter4_masks(p0q0, p1q1, hev_thresh, outer_mask, inner_thresh, &hev_mask,
+ &needs_filter4_mask);
+
+#if AOM_ARCH_AARCH64
+ if (vaddv_u16(needs_filter4_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // AOM_ARCH_AARCH64
+
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter4_mask_8 =
+ vcombine_u16(needs_filter4_mask, needs_filter4_mask);
+
+ uint16x8_t f_p1q1;
+ uint16x8_t f_p0q0;
+ const uint16x8_t p0q1 = vcombine_u16(src[1], src[3]);
+ filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f_p1q1, &f_p0q0);
+
+ // Already integrated the hev mask when calculating the filtered values.
+ const uint16x8_t p0q0_output = vbslq_u16(needs_filter4_mask_8, f_p0q0, p0q0);
+
+ // p1/q1 are unmodified if only hev() is true. This works because it was and'd
+ // with |needs_filter4_mask| previously.
+ const uint16x8_t p1q1_mask = veorq_u16(hev_mask_8, needs_filter4_mask_8);
+ const uint16x8_t p1q1_output = vbslq_u16(p1q1_mask, f_p1q1, p1q1);
+
+ uint16x4_t output[4] = {
+ vget_low_u16(p1q1_output),
+ vget_low_u16(p0q0_output),
+ vget_high_u16(p0q0_output),
+ vget_high_u16(p1q1_output),
+ };
+ transpose_array_inplace_u16_4x4(output);
+
+ vst1_u16(dst_p1, output[0]);
+ vst1_u16(dst_p0, output[1]);
+ vst1_u16(dst_q0, output[2]);
+ vst1_u16(dst_q1, output[3]);
+}
+
+void aom_highbd_lpf_vertical_4_dual_neon(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_vertical_4_neon(s, pitch, blimit0, limit0, thresh0, bd);
+ aom_highbd_lpf_vertical_4_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
+ bd);
+}
+
+static INLINE void filter6(const uint16x8_t p2q2, const uint16x8_t p1q1,
+ const uint16x8_t p0q0, uint16x8_t *const p1q1_output,
+ uint16x8_t *const p0q0_output) {
+ // Sum p1 and q1 output from opposite directions.
+ // The formula is regrouped to allow 3 doubling operations to be combined.
+ //
+ // p1 = (3 * p2) + (2 * p1) + (2 * p0) + q0
+ // ^^^^^^^^
+ // q1 = p0 + (2 * q0) + (2 * q1) + (3 * q2)
+ // ^^^^^^^^
+ // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+ // ^^^^^^^^^^^
+ uint16x8_t sum = vaddq_u16(p2q2, p1q1);
+
+ // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+ // ^^^^^^
+ sum = vaddq_u16(sum, p0q0);
+
+ // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+ // ^^^^^
+ sum = vshlq_n_u16(sum, 1);
+
+ // p1q1 = p2q2 + 2 * (p2q2 + p1q1 + p0q0) + q0p0
+ // ^^^^^^ ^^^^^^
+ // Should dual issue with the left shift.
+ const uint16x8_t q0p0 = vextq_u16(p0q0, p0q0, 4);
+ const uint16x8_t outer_sum = vaddq_u16(p2q2, q0p0);
+ sum = vaddq_u16(sum, outer_sum);
+
+ *p1q1_output = vrshrq_n_u16(sum, 3);
+
+ // Convert to p0 and q0 output:
+ // p0 = p1 - (2 * p2) + q0 + q1
+ // q0 = q1 - (2 * q2) + p0 + p1
+ // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
+ // ^^^^^^^^
+ const uint16x8_t p2q2_double = vshlq_n_u16(p2q2, 1);
+ // p0q0 = p1q1 - (2 * p2q2) + q0p0 + q1p1
+ // ^^^^^^^^
+ sum = vsubq_u16(sum, p2q2_double);
+ const uint16x8_t q1p1 = vextq_u16(p1q1, p1q1, 4);
+ sum = vaddq_u16(sum, vaddq_u16(q0p0, q1p1));
+
+ *p0q0_output = vrshrq_n_u16(sum, 3);
+}
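+// Fully expanded, filter6() above computes:
+//   p1' = (3 * p2 + 2 * p1 + 2 * p0 + q0 + 4) >> 3
+//   p0' = (p2 + 2 * p1 + 2 * p0 + 2 * q0 + q1 + 4) >> 3
+// and symmetrically for q1' and q0'.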
+
+void aom_highbd_lpf_horizontal_6_neon(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ uint16_t *const dst_p2 = s - 3 * pitch;
+ uint16_t *const dst_p1 = s - 2 * pitch;
+ uint16_t *const dst_p0 = s - pitch;
+ uint16_t *const dst_q0 = s;
+ uint16_t *const dst_q1 = s + pitch;
+ uint16_t *const dst_q2 = s + 2 * pitch;
+
+ const uint16x4_t src[6] = { vld1_u16(dst_p2), vld1_u16(dst_p1),
+ vld1_u16(dst_p0), vld1_u16(dst_q0),
+ vld1_u16(dst_q1), vld1_u16(dst_q2) };
+
+ // Adjust thresholds to bitdepth.
+ const int outer_thresh = *blimit << (bd - 8);
+ const int inner_thresh = *limit << (bd - 8);
+ const int hev_thresh = *thresh << (bd - 8);
+ const uint16x4_t outer_mask =
+ outer_threshold(src[1], src[2], src[3], src[4], outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat3_mask;
+ const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]);
+ const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]);
+ const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]);
+ filter6_masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd,
+ &needs_filter_mask, &is_flat3_mask, &hev_mask);
+
+#if AOM_ARCH_AARCH64
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // AOM_ARCH_AARCH64
+
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ // ZIP1 p0q0, p1q1 may perform better here.
+ const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]);
+ filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat3_mask| controls whether the needed filter is filter4 or
+ // filter6. Therefore if it is false when |needs_filter_mask| is true, filter6
+ // output is not used.
+ uint16x8_t f6_p1q1, f6_p0q0;
+ const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
+ if (vget_lane_u64(need_filter6, 0) == 0) {
+ // filter6() does not apply, but filter4() applies to one or more values.
+ p0q0_output = p0q0;
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+ p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+
+ vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+ vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+ vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+ vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+}
+
+void aom_highbd_lpf_horizontal_6_dual_neon(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_horizontal_6_neon(s, pitch, blimit0, limit0, thresh0, bd);
+ aom_highbd_lpf_horizontal_6_neon(s + 4, pitch, blimit1, limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_vertical_6_neon(uint16_t *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ // Left side of the filter window.
+ uint16_t *const dst = s - 3;
+ uint16_t *const dst_0 = dst;
+ uint16_t *const dst_1 = dst + pitch;
+ uint16_t *const dst_2 = dst + 2 * pitch;
+ uint16_t *const dst_3 = dst + 3 * pitch;
+
+ // Overread by 2 values. These overreads become the high halves of src_raw[2]
+ // and src_raw[3] after transpose.
+ uint16x8_t src_raw[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1),
+ vld1q_u16(dst_2), vld1q_u16(dst_3) };
+ transpose_array_inplace_u16_4x8(src_raw);
+ // p2, p1, p0, q0, q1, q2
+ const uint16x4_t src[6] = {
+ vget_low_u16(src_raw[0]), vget_low_u16(src_raw[1]),
+ vget_low_u16(src_raw[2]), vget_low_u16(src_raw[3]),
+ vget_high_u16(src_raw[0]), vget_high_u16(src_raw[1]),
+ };
+
+ // Adjust thresholds to bitdepth.
+ const int outer_thresh = *blimit << (bd - 8);
+ const int inner_thresh = *limit << (bd - 8);
+ const int hev_thresh = *thresh << (bd - 8);
+ const uint16x4_t outer_mask =
+ outer_threshold(src[1], src[2], src[3], src[4], outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat3_mask;
+ const uint16x8_t p0q0 = vcombine_u16(src[2], src[3]);
+ const uint16x8_t p1q1 = vcombine_u16(src[1], src[4]);
+ const uint16x8_t p2q2 = vcombine_u16(src[0], src[5]);
+ filter6_masks(p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh, bd,
+ &needs_filter_mask, &is_flat3_mask, &hev_mask);
+
+#if AOM_ARCH_AARCH64
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // AOM_ARCH_AARCH64
+
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t is_flat3_mask_8 = vcombine_u16(is_flat3_mask, is_flat3_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ // ZIP1 p0q0, p1q1 may perform better here.
+ const uint16x8_t p0q1 = vcombine_u16(src[2], src[4]);
+ filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat3_mask| controls whether the needed filter is filter4 or
+ // filter6. Therefore if it is false when |needs_filter_mask| is true, filter6
+ // output is not used.
+ uint16x8_t f6_p1q1, f6_p0q0;
+ const uint64x1_t need_filter6 = vreinterpret_u64_u16(is_flat3_mask);
+ if (vget_lane_u64(need_filter6, 0) == 0) {
+ // filter6() does not apply, but filter4() applies to one or more values.
+ p0q0_output = p0q0;
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ filter6(p2q2, p1q1, p0q0, &f6_p1q1, &f6_p0q0);
+ p1q1_output = vbslq_u16(is_flat3_mask_8, f6_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(is_flat3_mask_8, f6_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+
+ uint16x4_t output[4] = {
+ vget_low_u16(p1q1_output),
+ vget_low_u16(p0q0_output),
+ vget_high_u16(p0q0_output),
+ vget_high_u16(p1q1_output),
+ };
+ transpose_array_inplace_u16_4x4(output);
+
+ // dst_n starts at p2, so adjust to p1.
+ vst1_u16(dst_0 + 1, output[0]);
+ vst1_u16(dst_1 + 1, output[1]);
+ vst1_u16(dst_2 + 1, output[2]);
+ vst1_u16(dst_3 + 1, output[3]);
+}
+
+void aom_highbd_lpf_vertical_6_dual_neon(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_vertical_6_neon(s, pitch, blimit0, limit0, thresh0, bd);
+ aom_highbd_lpf_vertical_6_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
+ bd);
+}
+
+static INLINE void filter8(const uint16x8_t p3q3, const uint16x8_t p2q2,
+ const uint16x8_t p1q1, const uint16x8_t p0q0,
+ uint16x8_t *const p2q2_output,
+ uint16x8_t *const p1q1_output,
+ uint16x8_t *const p0q0_output) {
+ // Sum p2 and q2 output from opposite directions.
+ // The formula is regrouped to allow 2 doubling operations to be combined.
+ // p2 = (3 * p3) + (2 * p2) + p1 + p0 + q0
+ // ^^^^^^^^
+ // q2 = p0 + q0 + q1 + (2 * q2) + (3 * q3)
+ // ^^^^^^^^
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^^^^^^
+ const uint16x8_t p23q23 = vaddq_u16(p3q3, p2q2);
+
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^
+ uint16x8_t sum = vshlq_n_u16(p23q23, 1);
+
+ // Add two other terms to make dual issue with shift more likely.
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^^^^^^
+ const uint16x8_t p01q01 = vaddq_u16(p0q0, p1q1);
+
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^^^^^^^^
+ sum = vaddq_u16(sum, p01q01);
+
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^
+ sum = vaddq_u16(sum, p3q3);
+
+ // p2q2 = p3q3 + 2 * (p3q3 + p2q2) + p1q1 + p0q0 + q0p0
+ // ^^^^^^
+ const uint16x8_t q0p0 = vextq_u16(p0q0, p0q0, 4);
+ sum = vaddq_u16(sum, q0p0);
+
+ *p2q2_output = vrshrq_n_u16(sum, 3);
+
+ // Convert to p1 and q1 output:
+ // p1 = p2 - p3 - p2 + p1 + q1
+  //   q1 = q2 - q3 - q2 + q1 + p1
+ sum = vsubq_u16(sum, p23q23);
+ const uint16x8_t q1p1 = vextq_u16(p1q1, p1q1, 4);
+ sum = vaddq_u16(sum, vaddq_u16(p1q1, q1p1));
+
+ *p1q1_output = vrshrq_n_u16(sum, 3);
+
+ // Convert to p0 and q0 output:
+ // p0 = p1 - p3 - p1 + p0 + q2
+ // q0 = q1 - q3 - q1 + q0 + p2
+ sum = vsubq_u16(sum, vaddq_u16(p3q3, p1q1));
+ const uint16x8_t q2p2 = vextq_u16(p2q2, p2q2, 4);
+ sum = vaddq_u16(sum, vaddq_u16(p0q0, q2p2));
+
+ *p0q0_output = vrshrq_n_u16(sum, 3);
+}
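+// Fully expanded, filter8() above computes:
+//   p2' = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3
+//   p1' = (2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3
+//   p0' = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3
+// and symmetrically for q2', q1' and q0'.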
+
+void aom_highbd_lpf_horizontal_8_neon(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ uint16_t *const dst_p3 = s - 4 * pitch;
+ uint16_t *const dst_p2 = s - 3 * pitch;
+ uint16_t *const dst_p1 = s - 2 * pitch;
+ uint16_t *const dst_p0 = s - pitch;
+ uint16_t *const dst_q0 = s;
+ uint16_t *const dst_q1 = s + pitch;
+ uint16_t *const dst_q2 = s + 2 * pitch;
+ uint16_t *const dst_q3 = s + 3 * pitch;
+
+ const uint16x4_t src[8] = { vld1_u16(dst_p3), vld1_u16(dst_p2),
+ vld1_u16(dst_p1), vld1_u16(dst_p0),
+ vld1_u16(dst_q0), vld1_u16(dst_q1),
+ vld1_u16(dst_q2), vld1_u16(dst_q3) };
+
+ // Adjust thresholds to bitdepth.
+ const int outer_thresh = *blimit << (bd - 8);
+ const int inner_thresh = *limit << (bd - 8);
+ const int hev_thresh = *thresh << (bd - 8);
+ const uint16x4_t outer_mask =
+ outer_threshold(src[2], src[3], src[4], src[5], outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat4_mask;
+ const uint16x8_t p0q0 = vcombine_u16(src[3], src[4]);
+ const uint16x8_t p1q1 = vcombine_u16(src[2], src[5]);
+ const uint16x8_t p2q2 = vcombine_u16(src[1], src[6]);
+ const uint16x8_t p3q3 = vcombine_u16(src[0], src[7]);
+ filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+ bd, &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if AOM_ARCH_AARCH64
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // AOM_ARCH_AARCH64
+
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ // ZIP1 p0q0, p1q1 may perform better here.
+ const uint16x8_t p0q1 = vcombine_u16(src[3], src[5]);
+ filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output, p2q2_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or
+ // filter8. Therefore if it is false when |needs_filter_mask| is true, filter8
+ // output is not used.
+ uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+ const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+ if (vget_lane_u64(need_filter8, 0) == 0) {
+ // filter8() does not apply, but filter4() applies to one or more values.
+ p2q2_output = p2q2;
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ const uint16x8_t is_flat4_mask_8 =
+ vcombine_u16(is_flat4_mask, is_flat4_mask);
+ filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+ p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2);
+ p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+
+ vst1_u16(dst_p2, vget_low_u16(p2q2_output));
+ vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+ vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+ vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+ vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+ vst1_u16(dst_q2, vget_high_u16(p2q2_output));
+}
+
+void aom_highbd_lpf_horizontal_8_dual_neon(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_horizontal_8_neon(s, pitch, blimit0, limit0, thresh0, bd);
+ aom_highbd_lpf_horizontal_8_neon(s + 4, pitch, blimit1, limit1, thresh1, bd);
+}
+
+static INLINE uint16x8_t reverse_low_half(const uint16x8_t a) {
+ return vcombine_u16(vrev64_u16(vget_low_u16(a)), vget_high_u16(a));
+}
+
+void aom_highbd_lpf_vertical_8_neon(uint16_t *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ uint16_t *const dst = s - 4;
+ uint16_t *const dst_0 = dst;
+ uint16_t *const dst_1 = dst + pitch;
+ uint16_t *const dst_2 = dst + 2 * pitch;
+ uint16_t *const dst_3 = dst + 3 * pitch;
+
+  // src[n] contains p3, p2, p1, p0, q0, q1, q2, q3 for row n.
+ // To get desired pairs after transpose, one half should be reversed.
+ uint16x8_t src[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
+ vld1q_u16(dst_3) };
+
+ // src[0] = p0q0
+ // src[1] = p1q1
+ // src[2] = p2q2
+ // src[3] = p3q3
+ loop_filter_transpose_u16_4x8q(src);
+
+ // Adjust thresholds to bitdepth.
+ const int outer_thresh = *blimit << (bd - 8);
+ const int inner_thresh = *limit << (bd - 8);
+ const int hev_thresh = *thresh << (bd - 8);
+ const uint16x4_t outer_mask = outer_threshold(
+ vget_low_u16(src[1]), vget_low_u16(src[0]), vget_high_u16(src[0]),
+ vget_high_u16(src[1]), outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat4_mask;
+ const uint16x8_t p0q0 = src[0];
+ const uint16x8_t p1q1 = src[1];
+ const uint16x8_t p2q2 = src[2];
+ const uint16x8_t p3q3 = src[3];
+ filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+ bd, &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if AOM_ARCH_AARCH64
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // AOM_ARCH_AARCH64
+
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
+ filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output, p2q2_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or
+ // filter8. Therefore if it is false when |needs_filter_mask| is true, filter8
+ // output is not used.
+ const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+ if (vget_lane_u64(need_filter8, 0) == 0) {
+ // filter8() does not apply, but filter4() applies to one or more values.
+ p2q2_output = p2q2;
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ const uint16x8_t is_flat4_mask_8 =
+ vcombine_u16(is_flat4_mask, is_flat4_mask);
+ uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+ filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+ p2q2_output = vbslq_u16(is_flat4_mask_8, f8_p2q2, p2q2);
+ p1q1_output = vbslq_u16(is_flat4_mask_8, f8_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(is_flat4_mask_8, f8_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+
+ uint16x8_t output[4] = { p0q0_output, p1q1_output, p2q2_output, p3q3 };
+ // After transpose, |output| will contain rows of the form:
+ // p0 p1 p2 p3 q0 q1 q2 q3
+ transpose_array_inplace_u16_4x8(output);
+
+ // Reverse p values to produce original order:
+ // p3 p2 p1 p0 q0 q1 q2 q3
+ vst1q_u16(dst_0, reverse_low_half(output[0]));
+ vst1q_u16(dst_1, reverse_low_half(output[1]));
+ vst1q_u16(dst_2, reverse_low_half(output[2]));
+ vst1q_u16(dst_3, reverse_low_half(output[3]));
+}
+
+void aom_highbd_lpf_vertical_8_dual_neon(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_vertical_8_neon(s, pitch, blimit0, limit0, thresh0, bd);
+ aom_highbd_lpf_vertical_8_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
+ bd);
+}
+
+static INLINE void filter14(
+ const uint16x8_t p6q6, const uint16x8_t p5q5, const uint16x8_t p4q4,
+ const uint16x8_t p3q3, const uint16x8_t p2q2, const uint16x8_t p1q1,
+ const uint16x8_t p0q0, uint16x8_t *const p5q5_output,
+ uint16x8_t *const p4q4_output, uint16x8_t *const p3q3_output,
+ uint16x8_t *const p2q2_output, uint16x8_t *const p1q1_output,
+ uint16x8_t *const p0q0_output) {
+ // Sum p5 and q5 output from opposite directions.
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^^
+ const uint16x8_t p6q6_x7 = vsubq_u16(vshlq_n_u16(p6q6, 3), p6q6);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^^^^^^^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^^^^^^^^^^^^^
+ uint16x8_t sum = vshlq_n_u16(vaddq_u16(p5q5, p4q4), 1);
+ sum = vaddq_u16(sum, p6q6_x7);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^
+ sum = vaddq_u16(vaddq_u16(p3q3, p2q2), sum);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^^^^^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^^^^^^
+ sum = vaddq_u16(vaddq_u16(p1q1, p0q0), sum);
+
+ // p5 = (7 * p6) + (2 * p5) + (2 * p4) + p3 + p2 + p1 + p0 + q0
+ // ^^
+ // q5 = p0 + q0 + q1 + q2 + q3 + (2 * q4) + (2 * q5) + (7 * q6)
+ // ^^
+ const uint16x8_t q0p0 = vextq_u16(p0q0, p0q0, 4);
+ sum = vaddq_u16(sum, q0p0);
+
+ *p5q5_output = vrshrq_n_u16(sum, 4);
+
+ // Convert to p4 and q4 output:
+ // p4 = p5 - (2 * p6) + p3 + q1
+ // q4 = q5 - (2 * q6) + q3 + p1
+ sum = vsubq_u16(sum, vshlq_n_u16(p6q6, 1));
+ const uint16x8_t q1p1 = vextq_u16(p1q1, p1q1, 4);
+ sum = vaddq_u16(vaddq_u16(p3q3, q1p1), sum);
+
+ *p4q4_output = vrshrq_n_u16(sum, 4);
+
+ // Convert to p3 and q3 output:
+ // p3 = p4 - p6 - p5 + p2 + q2
+ // q3 = q4 - q6 - q5 + q2 + p2
+ sum = vsubq_u16(sum, vaddq_u16(p6q6, p5q5));
+ const uint16x8_t q2p2 = vextq_u16(p2q2, p2q2, 4);
+ sum = vaddq_u16(vaddq_u16(p2q2, q2p2), sum);
+
+ *p3q3_output = vrshrq_n_u16(sum, 4);
+
+ // Convert to p2 and q2 output:
+ // p2 = p3 - p6 - p4 + p1 + q3
+ // q2 = q3 - q6 - q4 + q1 + p3
+ sum = vsubq_u16(sum, vaddq_u16(p6q6, p4q4));
+ const uint16x8_t q3p3 = vextq_u16(p3q3, p3q3, 4);
+ sum = vaddq_u16(vaddq_u16(p1q1, q3p3), sum);
+
+ *p2q2_output = vrshrq_n_u16(sum, 4);
+
+ // Convert to p1 and q1 output:
+ // p1 = p2 - p6 - p3 + p0 + q4
+ // q1 = q2 - q6 - q3 + q0 + p4
+ sum = vsubq_u16(sum, vaddq_u16(p6q6, p3q3));
+ const uint16x8_t q4p4 = vextq_u16(p4q4, p4q4, 4);
+ sum = vaddq_u16(vaddq_u16(p0q0, q4p4), sum);
+
+ *p1q1_output = vrshrq_n_u16(sum, 4);
+
+ // Convert to p0 and q0 output:
+ // p0 = p1 - p6 - p2 + q0 + q5
+ // q0 = q1 - q6 - q2 + p0 + p5
+ sum = vsubq_u16(sum, vaddq_u16(p6q6, p2q2));
+ const uint16x8_t q5p5 = vextq_u16(p5q5, p5q5, 4);
+ sum = vaddq_u16(vaddq_u16(q0p0, q5p5), sum);
+
+ *p0q0_output = vrshrq_n_u16(sum, 4);
+}
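+// Scalar sketch of the running-sum pattern above (illustrative only):
+//   sum = 7 * p6 + 2 * p5 + 2 * p4 + p3 + p2 + p1 + p0 + q0;  // total weight 16
+//   out_p5 = (sum + 8) >> 4;
+//   sum += p3 + q1 - 2 * p6;  // slide the window for p4
+//   out_p4 = (sum + 8) >> 4;
+// Each later output reuses |sum|, subtracting the taps that leave the window
+// and adding the ones that enter it; vextq_u16 supplies the q-side taps for
+// the p half (and vice versa).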
+
+void aom_highbd_lpf_horizontal_14_neon(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ uint16_t *const dst_p6 = s - 7 * pitch;
+ uint16_t *const dst_p5 = s - 6 * pitch;
+ uint16_t *const dst_p4 = s - 5 * pitch;
+ uint16_t *const dst_p3 = s - 4 * pitch;
+ uint16_t *const dst_p2 = s - 3 * pitch;
+ uint16_t *const dst_p1 = s - 2 * pitch;
+ uint16_t *const dst_p0 = s - pitch;
+ uint16_t *const dst_q0 = s;
+ uint16_t *const dst_q1 = s + pitch;
+ uint16_t *const dst_q2 = s + 2 * pitch;
+ uint16_t *const dst_q3 = s + 3 * pitch;
+ uint16_t *const dst_q4 = s + 4 * pitch;
+ uint16_t *const dst_q5 = s + 5 * pitch;
+ uint16_t *const dst_q6 = s + 6 * pitch;
+
+ const uint16x4_t src[14] = {
+ vld1_u16(dst_p6), vld1_u16(dst_p5), vld1_u16(dst_p4), vld1_u16(dst_p3),
+ vld1_u16(dst_p2), vld1_u16(dst_p1), vld1_u16(dst_p0), vld1_u16(dst_q0),
+ vld1_u16(dst_q1), vld1_u16(dst_q2), vld1_u16(dst_q3), vld1_u16(dst_q4),
+ vld1_u16(dst_q5), vld1_u16(dst_q6)
+ };
+
+ // Adjust thresholds to bitdepth.
+ const int outer_thresh = *blimit << (bd - 8);
+ const int inner_thresh = *limit << (bd - 8);
+ const int hev_thresh = *thresh << (bd - 8);
+ const uint16x4_t outer_mask =
+ outer_threshold(src[5], src[6], src[7], src[8], outer_thresh);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat4_mask;
+ const uint16x8_t p0q0 = vcombine_u16(src[6], src[7]);
+ const uint16x8_t p1q1 = vcombine_u16(src[5], src[8]);
+ const uint16x8_t p2q2 = vcombine_u16(src[4], src[9]);
+ const uint16x8_t p3q3 = vcombine_u16(src[3], src[10]);
+ filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+ bd, &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if AOM_ARCH_AARCH64
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // AOM_ARCH_AARCH64
+ const uint16x8_t p4q4 = vcombine_u16(src[2], src[11]);
+ const uint16x8_t p5q5 = vcombine_u16(src[1], src[12]);
+ const uint16x8_t p6q6 = vcombine_u16(src[0], src[13]);
+ // Mask to choose between the outputs of filter8 and filter14.
+ // As with the derivation of |is_flat4_mask|, the question of whether to use
+ // filter14 is only raised where |is_flat4_mask| is true.
+ const uint16x4_t is_flat4_outer_mask = vand_u16(
+ is_flat4_mask, is_flat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
+ vabdq_u16(p0q0, p6q6), bd));
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ // ZIP1 p0q0, p1q1 may perform better here.
+ const uint16x8_t p0q1 = vcombine_u16(src[6], src[8]);
+ filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
+ p5q5_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or
+ // filter8. Therefore if it is false when |needs_filter_mask| is true, filter8
+ // output is not used.
+ uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+ const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+ if (vget_lane_u64(need_filter8, 0) == 0) {
+ // filter8() and filter14() do not apply, but filter4() applies to one or
+ // more values.
+ p5q5_output = p5q5;
+ p4q4_output = p4q4;
+ p3q3_output = p3q3;
+ p2q2_output = p2q2;
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ const uint16x8_t use_filter8_mask =
+ vcombine_u16(is_flat4_mask, is_flat4_mask);
+ filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+ const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask);
+ if (vget_lane_u64(need_filter14, 0) == 0) {
+ // filter14() does not apply, but filter8() and filter4() apply to one or
+ // more values.
+ p5q5_output = p5q5;
+ p4q4_output = p4q4;
+ p3q3_output = p3q3;
+ p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2);
+ p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ } else {
+ // All filters may contribute values to final outputs.
+ const uint16x8_t use_filter14_mask =
+ vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
+ uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
+ filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+ &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+ p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
+ p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4);
+ p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3);
+ p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2);
+ p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2);
+ p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2);
+ p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1);
+ p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0);
+ p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+ }
+
+ vst1_u16(dst_p5, vget_low_u16(p5q5_output));
+ vst1_u16(dst_p4, vget_low_u16(p4q4_output));
+ vst1_u16(dst_p3, vget_low_u16(p3q3_output));
+ vst1_u16(dst_p2, vget_low_u16(p2q2_output));
+ vst1_u16(dst_p1, vget_low_u16(p1q1_output));
+ vst1_u16(dst_p0, vget_low_u16(p0q0_output));
+ vst1_u16(dst_q0, vget_high_u16(p0q0_output));
+ vst1_u16(dst_q1, vget_high_u16(p1q1_output));
+ vst1_u16(dst_q2, vget_high_u16(p2q2_output));
+ vst1_u16(dst_q3, vget_high_u16(p3q3_output));
+ vst1_u16(dst_q4, vget_high_u16(p4q4_output));
+ vst1_u16(dst_q5, vget_high_u16(p5q5_output));
+}
+
+void aom_highbd_lpf_horizontal_14_dual_neon(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_horizontal_14_neon(s, pitch, blimit0, limit0, thresh0, bd);
+ aom_highbd_lpf_horizontal_14_neon(s + 4, pitch, blimit1, limit1, thresh1, bd);
+}
+
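+// Given ab = {a, b} and cd = {c, d} (64-bit halves), returns val[0] = {a, c}
+// and val[1] = {d, b}. With ab = p7q7 and cd = p3q3 this yields the {p7, p3}
+// and {q3, q7} pairings needed by the transposes below.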
+static INLINE uint16x8x2_t permute_acdb64(const uint16x8_t ab,
+ const uint16x8_t cd) {
+ uint16x8x2_t acdb;
+#if AOM_ARCH_AARCH64
+ // a[b] <- [c]d
+ acdb.val[0] = vreinterpretq_u16_u64(
+ vtrn1q_u64(vreinterpretq_u64_u16(ab), vreinterpretq_u64_u16(cd)));
+ // [a]b <- c[d]
+ acdb.val[1] = vreinterpretq_u16_u64(
+ vtrn2q_u64(vreinterpretq_u64_u16(cd), vreinterpretq_u64_u16(ab)));
+#else
+ // a[b] <- [c]d
+ acdb.val[0] = vreinterpretq_u16_u64(
+ vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 0),
+ vreinterpretq_u64_u16(ab), 1));
+ // [a]b <- c[d]
+ acdb.val[1] = vreinterpretq_u16_u64(
+ vsetq_lane_u64(vgetq_lane_u64(vreinterpretq_u64_u16(cd), 1),
+ vreinterpretq_u64_u16(ab), 0));
+#endif // AOM_ARCH_AARCH64
+ return acdb;
+}
+
+void aom_highbd_lpf_vertical_14_neon(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ uint16_t *const dst = s - 8;
+ uint16_t *const dst_0 = dst;
+ uint16_t *const dst_1 = dst + pitch;
+ uint16_t *const dst_2 = dst + 2 * pitch;
+ uint16_t *const dst_3 = dst + 3 * pitch;
+
+ // Low halves: p7 p6 p5 p4
+ // High halves: p3 p2 p1 p0
+ uint16x8_t src_p[4] = { vld1q_u16(dst_0), vld1q_u16(dst_1), vld1q_u16(dst_2),
+ vld1q_u16(dst_3) };
+ // p7 will be the low half of src_p[0]. Not used until the end.
+ transpose_array_inplace_u16_4x8(src_p);
+
+ // Low halves: q0 q1 q2 q3
+ // High halves: q4 q5 q6 q7
+ uint16x8_t src_q[4] = { vld1q_u16(dst_0 + 8), vld1q_u16(dst_1 + 8),
+ vld1q_u16(dst_2 + 8), vld1q_u16(dst_3 + 8) };
+ // q7 will be the high half of src_q[3]. Not used until the end.
+ transpose_array_inplace_u16_4x8(src_q);
+
+ // Adjust thresholds to bitdepth.
+ const int outer_thresh = *blimit << (bd - 8);
+ const int inner_thresh = *limit << (bd - 8);
+ const int hev_thresh = *thresh << (bd - 8);
+ const uint16x4_t outer_mask = outer_threshold(
+ vget_high_u16(src_p[2]), vget_high_u16(src_p[3]), vget_low_u16(src_q[0]),
+ vget_low_u16(src_q[1]), outer_thresh);
+ const uint16x8_t p0q0 = vextq_u16(src_p[3], src_q[0], 4);
+ const uint16x8_t p1q1 = vextq_u16(src_p[2], src_q[1], 4);
+ const uint16x8_t p2q2 = vextq_u16(src_p[1], src_q[2], 4);
+ const uint16x8_t p3q3 = vextq_u16(src_p[0], src_q[3], 4);
+ uint16x4_t hev_mask;
+ uint16x4_t needs_filter_mask;
+ uint16x4_t is_flat4_mask;
+ filter8_masks(p3q3, p2q2, p1q1, p0q0, hev_thresh, outer_mask, inner_thresh,
+ bd, &needs_filter_mask, &is_flat4_mask, &hev_mask);
+
+#if AOM_ARCH_AARCH64
+ if (vaddv_u16(needs_filter_mask) == 0) {
+ // None of the values will be filtered.
+ return;
+ }
+#endif // AOM_ARCH_AARCH64
+ const uint16x8_t p4q4 =
+ vcombine_u16(vget_low_u16(src_p[3]), vget_high_u16(src_q[0]));
+ const uint16x8_t p5q5 =
+ vcombine_u16(vget_low_u16(src_p[2]), vget_high_u16(src_q[1]));
+ const uint16x8_t p6q6 =
+ vcombine_u16(vget_low_u16(src_p[1]), vget_high_u16(src_q[2]));
+ const uint16x8_t p7q7 =
+ vcombine_u16(vget_low_u16(src_p[0]), vget_high_u16(src_q[3]));
+ // Mask to choose between the outputs of filter8 and filter14.
+ // As with the derivation of |is_flat4_mask|, the question of whether to use
+ // filter14 is only raised where |is_flat4_mask| is true.
+ const uint16x4_t is_flat4_outer_mask = vand_u16(
+ is_flat4_mask, is_flat4(vabdq_u16(p0q0, p4q4), vabdq_u16(p0q0, p5q5),
+ vabdq_u16(p0q0, p6q6), bd));
+ // Copy the masks to the high bits for packed comparisons later.
+ const uint16x8_t hev_mask_8 = vcombine_u16(hev_mask, hev_mask);
+ const uint16x8_t needs_filter_mask_8 =
+ vcombine_u16(needs_filter_mask, needs_filter_mask);
+
+ uint16x8_t f4_p1q1;
+ uint16x8_t f4_p0q0;
+ const uint16x8_t p0q1 = vcombine_u16(vget_low_u16(p0q0), vget_high_u16(p1q1));
+ filter4(p0q0, p0q1, p1q1, hev_mask, bd, &f4_p1q1, &f4_p0q0);
+ f4_p1q1 = vbslq_u16(hev_mask_8, p1q1, f4_p1q1);
+
+ uint16x8_t p0q0_output, p1q1_output, p2q2_output, p3q3_output, p4q4_output,
+ p5q5_output;
+ // Because we did not return after testing |needs_filter_mask| we know it is
+ // nonzero. |is_flat4_mask| controls whether the needed filter is filter4 or
+ // filter8. Therefore if it is false when |needs_filter_mask| is true, filter8
+ // output is not used.
+ uint16x8_t f8_p2q2, f8_p1q1, f8_p0q0;
+ const uint64x1_t need_filter8 = vreinterpret_u64_u16(is_flat4_mask);
+ if (vget_lane_u64(need_filter8, 0) == 0) {
+ // filter8() and filter14() do not apply, but filter4() applies to one or
+ // more values.
+ p5q5_output = p5q5;
+ p4q4_output = p4q4;
+ p3q3_output = p3q3;
+ p2q2_output = p2q2;
+ p1q1_output = vbslq_u16(needs_filter_mask_8, f4_p1q1, p1q1);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, f4_p0q0, p0q0);
+ } else {
+ const uint16x8_t use_filter8_mask =
+ vcombine_u16(is_flat4_mask, is_flat4_mask);
+ filter8(p3q3, p2q2, p1q1, p0q0, &f8_p2q2, &f8_p1q1, &f8_p0q0);
+ const uint64x1_t need_filter14 = vreinterpret_u64_u16(is_flat4_outer_mask);
+ if (vget_lane_u64(need_filter14, 0) == 0) {
+ // filter14() does not apply, but filter8() and filter4() apply to one or
+ // more values.
+ p5q5_output = p5q5;
+ p4q4_output = p4q4;
+ p3q3_output = p3q3;
+ p2q2_output = vbslq_u16(use_filter8_mask, f8_p2q2, p2q2);
+ p1q1_output = vbslq_u16(use_filter8_mask, f8_p1q1, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(use_filter8_mask, f8_p0q0, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ } else {
+ // All filters may contribute values to final outputs.
+ const uint16x8_t use_filter14_mask =
+ vcombine_u16(is_flat4_outer_mask, is_flat4_outer_mask);
+ uint16x8_t f14_p5q5, f14_p4q4, f14_p3q3, f14_p2q2, f14_p1q1, f14_p0q0;
+ filter14(p6q6, p5q5, p4q4, p3q3, p2q2, p1q1, p0q0, &f14_p5q5, &f14_p4q4,
+ &f14_p3q3, &f14_p2q2, &f14_p1q1, &f14_p0q0);
+ p5q5_output = vbslq_u16(use_filter14_mask, f14_p5q5, p5q5);
+ p4q4_output = vbslq_u16(use_filter14_mask, f14_p4q4, p4q4);
+ p3q3_output = vbslq_u16(use_filter14_mask, f14_p3q3, p3q3);
+ p2q2_output = vbslq_u16(use_filter14_mask, f14_p2q2, f8_p2q2);
+ p2q2_output = vbslq_u16(use_filter8_mask, p2q2_output, p2q2);
+ p2q2_output = vbslq_u16(needs_filter_mask_8, p2q2_output, p2q2);
+ p1q1_output = vbslq_u16(use_filter14_mask, f14_p1q1, f8_p1q1);
+ p1q1_output = vbslq_u16(use_filter8_mask, p1q1_output, f4_p1q1);
+ p1q1_output = vbslq_u16(needs_filter_mask_8, p1q1_output, p1q1);
+ p0q0_output = vbslq_u16(use_filter14_mask, f14_p0q0, f8_p0q0);
+ p0q0_output = vbslq_u16(use_filter8_mask, p0q0_output, f4_p0q0);
+ p0q0_output = vbslq_u16(needs_filter_mask_8, p0q0_output, p0q0);
+ }
+ }
+ // To get the correctly ordered rows from the transpose, we need:
+ // p7p3 p6p2 p5p1 p4p0
+ // q0q4 q1q5 q2q6 q3q7
+ const uint16x8x2_t p7p3_q3q7 = permute_acdb64(p7q7, p3q3_output);
+ const uint16x8x2_t p6p2_q2q6 = permute_acdb64(p6q6, p2q2_output);
+ const uint16x8x2_t p5p1_q1q5 = permute_acdb64(p5q5_output, p1q1_output);
+ const uint16x8x2_t p4p0_q0q4 = permute_acdb64(p4q4_output, p0q0_output);
+ uint16x8_t output_p[4] = { p7p3_q3q7.val[0], p6p2_q2q6.val[0],
+ p5p1_q1q5.val[0], p4p0_q0q4.val[0] };
+ transpose_array_inplace_u16_4x8(output_p);
+ uint16x8_t output_q[4] = { p4p0_q0q4.val[1], p5p1_q1q5.val[1],
+ p6p2_q2q6.val[1], p7p3_q3q7.val[1] };
+ transpose_array_inplace_u16_4x8(output_q);
+
+  // The permutation above already interleaves the p and q halves so that the
+  // transpose produces each row in its original memory order:
+  // p7 p6 p5 p4 p3 p2 p1 p0 q0 q1 q2 q3 q4 q5 q6 q7
+ vst1q_u16(dst_0, output_p[0]);
+ vst1q_u16(dst_0 + 8, output_q[0]);
+ vst1q_u16(dst_1, output_p[1]);
+ vst1q_u16(dst_1 + 8, output_q[1]);
+ vst1q_u16(dst_2, output_p[2]);
+ vst1q_u16(dst_2 + 8, output_q[2]);
+ vst1q_u16(dst_3, output_p[3]);
+ vst1q_u16(dst_3 + 8, output_q[3]);
+}
+
+void aom_highbd_lpf_vertical_14_dual_neon(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_vertical_14_neon(s, pitch, blimit0, limit0, thresh0, bd);
+ aom_highbd_lpf_vertical_14_neon(s + 4 * pitch, pitch, blimit1, limit1,
+ thresh1, bd);
+}
diff --git a/third_party/aom/aom_dsp/arm/highbd_masked_sad_neon.c b/third_party/aom/aom_dsp/arm/highbd_masked_sad_neon.c
new file mode 100644
index 0000000000..9262d818e9
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_masked_sad_neon.c
@@ -0,0 +1,354 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/blend.h"
+
+static INLINE uint16x8_t masked_sad_8x1_neon(uint16x8_t sad,
+ const uint16_t *src,
+ const uint16_t *a,
+ const uint16_t *b,
+ const uint8_t *m) {
+ const uint16x8_t s0 = vld1q_u16(src);
+ const uint16x8_t a0 = vld1q_u16(a);
+ const uint16x8_t b0 = vld1q_u16(b);
+ const uint16x8_t m0 = vmovl_u8(vld1_u8(m));
+
+ uint16x8_t blend_u16 = alpha_blend_a64_u16x8(m0, a0, b0);
+
+ return vaddq_u16(sad, vabdq_u16(blend_u16, s0));
+}
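+// Scalar equivalent of the per-element operation above, assuming the usual
+// A64 blend semantics (AOM_BLEND_A64_MAX_ALPHA == 64):
+//   blend = (m * a + (64 - m) * b + 32) >> 6;
+//   sad  += abs(blend - src);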
+
+static INLINE uint16x8_t masked_sad_16x1_neon(uint16x8_t sad,
+ const uint16_t *src,
+ const uint16_t *a,
+ const uint16_t *b,
+ const uint8_t *m) {
+ sad = masked_sad_8x1_neon(sad, src, a, b, m);
+ return masked_sad_8x1_neon(sad, &src[8], &a[8], &b[8], &m[8]);
+}
+
+static INLINE uint16x8_t masked_sad_32x1_neon(uint16x8_t sad,
+ const uint16_t *src,
+ const uint16_t *a,
+ const uint16_t *b,
+ const uint8_t *m) {
+ sad = masked_sad_16x1_neon(sad, src, a, b, m);
+ return masked_sad_16x1_neon(sad, &src[16], &a[16], &b[16], &m[16]);
+}
+
+static INLINE unsigned int masked_sad_128xh_large_neon(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+ int height) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ uint32x4_t sad_u32[] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ do {
+ uint16x8_t sad[] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ for (int h = 0; h < 4; ++h) {
+ sad[0] = masked_sad_32x1_neon(sad[0], src, a, b, m);
+ sad[1] = masked_sad_32x1_neon(sad[1], &src[32], &a[32], &b[32], &m[32]);
+ sad[2] = masked_sad_32x1_neon(sad[2], &src[64], &a[64], &b[64], &m[64]);
+ sad[3] = masked_sad_32x1_neon(sad[3], &src[96], &a[96], &b[96], &m[96]);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+
+ sad_u32[0] = vpadalq_u16(sad_u32[0], sad[0]);
+ sad_u32[1] = vpadalq_u16(sad_u32[1], sad[1]);
+ sad_u32[2] = vpadalq_u16(sad_u32[2], sad[2]);
+ sad_u32[3] = vpadalq_u16(sad_u32[3], sad[3]);
+ height -= 4;
+ } while (height != 0);
+
+ sad_u32[0] = vaddq_u32(sad_u32[0], sad_u32[1]);
+ sad_u32[2] = vaddq_u32(sad_u32[2], sad_u32[3]);
+ sad_u32[0] = vaddq_u32(sad_u32[0], sad_u32[2]);
+
+ return horizontal_add_u32x4(sad_u32[0]);
+}
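+// Widening with vpadalq_u16 every 4 rows keeps the 16-bit accumulators safe
+// for 12-bit input: each 16-bit lane sums at most 4 columns * 4 rows = 16
+// absolute differences of up to 4095, and 16 * 4095 = 65520 <= 65535. The
+// same 16-per-lane budget holds for the narrower helpers below.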
+
+static INLINE unsigned int masked_sad_64xh_large_neon(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+ int height) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ uint32x4_t sad_u32[] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ do {
+ uint16x8_t sad[] = { vdupq_n_u16(0), vdupq_n_u16(0) };
+ for (int h = 0; h < 4; ++h) {
+ sad[0] = masked_sad_32x1_neon(sad[0], src, a, b, m);
+ sad[1] = masked_sad_32x1_neon(sad[1], &src[32], &a[32], &b[32], &m[32]);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+
+ sad_u32[0] = vpadalq_u16(sad_u32[0], sad[0]);
+ sad_u32[1] = vpadalq_u16(sad_u32[1], sad[1]);
+ height -= 4;
+ } while (height != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sad_u32[0], sad_u32[1]));
+}
+
+static INLINE unsigned int masked_sad_32xh_large_neon(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+ int height) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ uint32x4_t sad_u32 = vdupq_n_u32(0);
+
+ do {
+ uint16x8_t sad = vdupq_n_u16(0);
+ for (int h = 0; h < 4; ++h) {
+ sad = masked_sad_32x1_neon(sad, src, a, b, m);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+
+ sad_u32 = vpadalq_u16(sad_u32, sad);
+ height -= 4;
+ } while (height != 0);
+
+ return horizontal_add_u32x4(sad_u32);
+}
+
+static INLINE unsigned int masked_sad_16xh_large_neon(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+ int height) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ uint32x4_t sad_u32 = vdupq_n_u32(0);
+
+ do {
+ uint16x8_t sad_u16 = vdupq_n_u16(0);
+
+ for (int h = 0; h < 8; ++h) {
+ sad_u16 = masked_sad_16x1_neon(sad_u16, src, a, b, m);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+
+ sad_u32 = vpadalq_u16(sad_u32, sad_u16);
+ height -= 8;
+ } while (height != 0);
+
+ return horizontal_add_u32x4(sad_u32);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static INLINE unsigned int masked_sad_8xh_large_neon(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+ int height) {
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ uint32x4_t sad_u32 = vdupq_n_u32(0);
+
+ do {
+ uint16x8_t sad_u16 = vdupq_n_u16(0);
+
+ for (int h = 0; h < 16; ++h) {
+ sad_u16 = masked_sad_8x1_neon(sad_u16, src, a, b, m);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+
+ sad_u32 = vpadalq_u16(sad_u32, sad_u16);
+ height -= 16;
+ } while (height != 0);
+
+ return horizontal_add_u32x4(sad_u32);
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static INLINE unsigned int masked_sad_16xh_small_neon(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+ int height) {
+ // For 12-bit data, we can only accumulate up to 128 elements in the
+ // uint16x8_t type sad accumulator, so we can only process up to 8 rows
+ // before we have to accumulate into 32-bit elements.
+ assert(height <= 8);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ uint16x8_t sad = vdupq_n_u16(0);
+
+ do {
+ sad = masked_sad_16x1_neon(sad, src, a, b, m);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ } while (--height != 0);
+
+ return horizontal_add_u16x8(sad);
+}
+
+static INLINE unsigned int masked_sad_8xh_small_neon(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+ int height) {
+ // For 12-bit data, we can only accumulate up to 128 elements in the
+ // uint16x8_t type sad accumulator, so we can only process up to 16 rows
+ // before we have to accumulate into 32-bit elements.
+ assert(height <= 16);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ uint16x8_t sad = vdupq_n_u16(0);
+
+ do {
+ sad = masked_sad_8x1_neon(sad, src, a, b, m);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ } while (--height != 0);
+
+ return horizontal_add_u16x8(sad);
+}
+
+static INLINE unsigned int masked_sad_4xh_small_neon(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+ int height) {
+ // For 12-bit data, we can only accumulate up to 64 elements in the
+ // uint16x4_t type sad accumulator, so we can only process up to 16 rows
+ // before we have to accumulate into 32-bit elements.
+ assert(height <= 16);
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+
+ uint16x4_t sad = vdup_n_u16(0);
+ do {
+ uint16x4_t m0 = vget_low_u16(vmovl_u8(load_unaligned_u8_4x1(m)));
+ uint16x4_t a0 = load_unaligned_u16_4x1(a);
+ uint16x4_t b0 = load_unaligned_u16_4x1(b);
+ uint16x4_t s0 = load_unaligned_u16_4x1(src);
+
+ uint16x4_t blend_u16 = alpha_blend_a64_u16x4(m0, a0, b0);
+
+ sad = vadd_u16(sad, vabd_u16(blend_u16, s0));
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ } while (--height != 0);
+
+ return horizontal_add_u16x4(sad);
+}
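+// The three "small" helpers above share the same per-lane budget: at most 16
+// absolute differences of up to 4095 per 16-bit lane, and 16 * 4095 = 65520
+// fits in a uint16_t without overflow.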
+
+#define HIGHBD_MASKED_SAD_WXH_SMALL_NEON(w, h) \
+ unsigned int aom_highbd_masked_sad##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
+ int invert_mask) { \
+ if (!invert_mask) \
+ return masked_sad_##w##xh_small_neon(src, src_stride, ref, ref_stride, \
+ second_pred, w, msk, msk_stride, \
+ h); \
+ else \
+ return masked_sad_##w##xh_small_neon(src, src_stride, second_pred, w, \
+ ref, ref_stride, msk, msk_stride, \
+ h); \
+ }
+
+#define HIGHBD_MASKED_SAD_WXH_LARGE_NEON(w, h) \
+ unsigned int aom_highbd_masked_sad##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
+ int invert_mask) { \
+ if (!invert_mask) \
+ return masked_sad_##w##xh_large_neon(src, src_stride, ref, ref_stride, \
+ second_pred, w, msk, msk_stride, \
+ h); \
+ else \
+ return masked_sad_##w##xh_large_neon(src, src_stride, second_pred, w, \
+ ref, ref_stride, msk, msk_stride, \
+ h); \
+ }
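+// |invert_mask| swaps the roles of the two predictors, so the mask then
+// weights |second_pred| rather than |ref|. A compound predictor block is
+// stored contiguously, hence its stride is simply the block width |w|.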
+
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(4, 4)
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(4, 8)
+
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(8, 4)
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(8, 8)
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(8, 16)
+
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(16, 8)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(16, 16)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(16, 32)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 16)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 32)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 64)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 32)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 64)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 128)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(128, 64)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(4, 16)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(8, 32)
+
+HIGHBD_MASKED_SAD_WXH_SMALL_NEON(16, 4)
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(16, 64)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(32, 8)
+
+HIGHBD_MASKED_SAD_WXH_LARGE_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/third_party/aom/aom_dsp/arm/highbd_obmc_sad_neon.c b/third_party/aom/aom_dsp/arm/highbd_obmc_sad_neon.c
new file mode 100644
index 0000000000..28699e6f41
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_obmc_sad_neon.c
@@ -0,0 +1,211 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE void highbd_obmc_sad_8x1_s16_neon(uint16x8_t ref,
+ const int32_t *mask,
+ const int32_t *wsrc,
+ uint32x4_t *sum) {
+ int16x8_t ref_s16 = vreinterpretq_s16_u16(ref);
+
+ int32x4_t wsrc_lo = vld1q_s32(wsrc);
+ int32x4_t wsrc_hi = vld1q_s32(wsrc + 4);
+
+ int32x4_t mask_lo = vld1q_s32(mask);
+ int32x4_t mask_hi = vld1q_s32(mask + 4);
+
+ int16x8_t mask_s16 = vcombine_s16(vmovn_s32(mask_lo), vmovn_s32(mask_hi));
+
+ int32x4_t pre_lo = vmull_s16(vget_low_s16(ref_s16), vget_low_s16(mask_s16));
+ int32x4_t pre_hi = vmull_s16(vget_high_s16(ref_s16), vget_high_s16(mask_s16));
+
+ uint32x4_t abs_lo = vreinterpretq_u32_s32(vabdq_s32(wsrc_lo, pre_lo));
+ uint32x4_t abs_hi = vreinterpretq_u32_s32(vabdq_s32(wsrc_hi, pre_hi));
+
+ *sum = vrsraq_n_u32(*sum, abs_lo, 12);
+ *sum = vrsraq_n_u32(*sum, abs_hi, 12);
+}
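+// Scalar equivalent of the per-element operation above:
+//   sad += (abs(wsrc[i] - ref[i] * mask[i]) + (1 << 11)) >> 12;
+// vrsraq_n_u32 performs the rounding shift by 12 and the accumulation in a
+// single step.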
+
+static INLINE unsigned int highbd_obmc_sad_4xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ int height) {
+ const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ int h = height / 2;
+ do {
+ uint16x8_t r = load_unaligned_u16_4x2(ref_ptr, ref_stride);
+
+ highbd_obmc_sad_8x1_s16_neon(r, mask, wsrc, &sum);
+
+ ref_ptr += 2 * ref_stride;
+ wsrc += 8;
+ mask += 8;
+ } while (--h != 0);
+
+ return horizontal_add_u32x4(sum);
+}
+
+static INLINE unsigned int highbd_obmc_sad_8xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ int height) {
+ const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ do {
+ uint16x8_t r = vld1q_u16(ref_ptr);
+
+ highbd_obmc_sad_8x1_s16_neon(r, mask, wsrc, &sum);
+
+ ref_ptr += ref_stride;
+ wsrc += 8;
+ mask += 8;
+ } while (--height != 0);
+
+ return horizontal_add_u32x4(sum);
+}
+
+static INLINE unsigned int highbd_obmc_sad_large_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ int width, int height) {
+ const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ do {
+ int i = 0;
+ do {
+ uint16x8_t r0 = vld1q_u16(ref_ptr + i);
+ highbd_obmc_sad_8x1_s16_neon(r0, mask, wsrc, &sum[0]);
+
+ uint16x8_t r1 = vld1q_u16(ref_ptr + i + 8);
+ highbd_obmc_sad_8x1_s16_neon(r1, mask + 8, wsrc + 8, &sum[1]);
+
+ wsrc += 16;
+ mask += 16;
+ i += 16;
+ } while (i < width);
+
+ ref_ptr += ref_stride;
+ } while (--height != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+static INLINE unsigned int highbd_obmc_sad_16xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ int h) {
+ return highbd_obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 16, h);
+}
+
+static INLINE unsigned int highbd_obmc_sad_32xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ int height) {
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+
+ do {
+ uint16x8_t r0 = vld1q_u16(ref_ptr);
+ uint16x8_t r1 = vld1q_u16(ref_ptr + 8);
+ uint16x8_t r2 = vld1q_u16(ref_ptr + 16);
+ uint16x8_t r3 = vld1q_u16(ref_ptr + 24);
+
+ highbd_obmc_sad_8x1_s16_neon(r0, mask, wsrc, &sum[0]);
+ highbd_obmc_sad_8x1_s16_neon(r1, mask + 8, wsrc + 8, &sum[1]);
+ highbd_obmc_sad_8x1_s16_neon(r2, mask + 16, wsrc + 16, &sum[2]);
+ highbd_obmc_sad_8x1_s16_neon(r3, mask + 24, wsrc + 24, &sum[3]);
+
+ wsrc += 32;
+ mask += 32;
+ ref_ptr += ref_stride;
+ } while (--height != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ sum[2] = vaddq_u32(sum[2], sum[3]);
+
+ return horizontal_add_u32x4(vaddq_u32(sum[0], sum[2]));
+}
+
+static INLINE unsigned int highbd_obmc_sad_64xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ int h) {
+ return highbd_obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 64, h);
+}
+
+static INLINE unsigned int highbd_obmc_sad_128xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ int h) {
+ return highbd_obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 128, h);
+}
+
+#define HIGHBD_OBMC_SAD_WXH_NEON(w, h) \
+ unsigned int aom_highbd_obmc_sad##w##x##h##_neon( \
+ const uint8_t *ref, int ref_stride, const int32_t *wsrc, \
+ const int32_t *mask) { \
+ return highbd_obmc_sad_##w##xh_neon(ref, ref_stride, wsrc, mask, h); \
+ }
+
+HIGHBD_OBMC_SAD_WXH_NEON(4, 4)
+HIGHBD_OBMC_SAD_WXH_NEON(4, 8)
+
+HIGHBD_OBMC_SAD_WXH_NEON(8, 4)
+HIGHBD_OBMC_SAD_WXH_NEON(8, 8)
+HIGHBD_OBMC_SAD_WXH_NEON(8, 16)
+
+HIGHBD_OBMC_SAD_WXH_NEON(16, 8)
+HIGHBD_OBMC_SAD_WXH_NEON(16, 16)
+HIGHBD_OBMC_SAD_WXH_NEON(16, 32)
+
+HIGHBD_OBMC_SAD_WXH_NEON(32, 16)
+HIGHBD_OBMC_SAD_WXH_NEON(32, 32)
+HIGHBD_OBMC_SAD_WXH_NEON(32, 64)
+
+HIGHBD_OBMC_SAD_WXH_NEON(64, 32)
+HIGHBD_OBMC_SAD_WXH_NEON(64, 64)
+HIGHBD_OBMC_SAD_WXH_NEON(64, 128)
+
+HIGHBD_OBMC_SAD_WXH_NEON(128, 64)
+HIGHBD_OBMC_SAD_WXH_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HIGHBD_OBMC_SAD_WXH_NEON(4, 16)
+
+HIGHBD_OBMC_SAD_WXH_NEON(8, 32)
+
+HIGHBD_OBMC_SAD_WXH_NEON(16, 4)
+HIGHBD_OBMC_SAD_WXH_NEON(16, 64)
+
+HIGHBD_OBMC_SAD_WXH_NEON(32, 8)
+
+HIGHBD_OBMC_SAD_WXH_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/third_party/aom/aom_dsp/arm/highbd_obmc_variance_neon.c b/third_party/aom/aom_dsp/arm/highbd_obmc_variance_neon.c
new file mode 100644
index 0000000000..d59224619b
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_obmc_variance_neon.c
@@ -0,0 +1,369 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE void highbd_obmc_variance_8x1_s16_neon(uint16x8_t pre,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ uint32x4_t *sse,
+ int32x4_t *sum) {
+ int16x8_t pre_s16 = vreinterpretq_s16_u16(pre);
+ int32x4_t wsrc_lo = vld1q_s32(&wsrc[0]);
+ int32x4_t wsrc_hi = vld1q_s32(&wsrc[4]);
+
+ int32x4_t mask_lo = vld1q_s32(&mask[0]);
+ int32x4_t mask_hi = vld1q_s32(&mask[4]);
+
+ int16x8_t mask_s16 = vcombine_s16(vmovn_s32(mask_lo), vmovn_s32(mask_hi));
+
+ int32x4_t diff_lo = vmull_s16(vget_low_s16(pre_s16), vget_low_s16(mask_s16));
+ int32x4_t diff_hi =
+ vmull_s16(vget_high_s16(pre_s16), vget_high_s16(mask_s16));
+
+ diff_lo = vsubq_s32(wsrc_lo, diff_lo);
+ diff_hi = vsubq_s32(wsrc_hi, diff_hi);
+
+  // ROUND_POWER_OF_TWO_SIGNED(value, 12) rounds to nearest with ties away
+  // from zero, whereas vrshrq_n_s32 rounds to nearest with ties rounded up.
+  // The two only differ for values lying exactly on a rounding breakpoint, so
+  // adding -1 to every negative number moves those values across the
+  // breakpoint and into the correct rounding region.
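+  // For example, ROUND_POWER_OF_TWO_SIGNED(-2048, 12) is -1, but a plain
+  // rounding shift gives 0; adding the sign bit first (-2048 - 1 = -2049)
+  // makes the rounding shift return -1 as required.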
+ diff_lo = vsraq_n_s32(diff_lo, diff_lo, 31);
+ diff_hi = vsraq_n_s32(diff_hi, diff_hi, 31);
+ int32x4_t round_lo = vrshrq_n_s32(diff_lo, 12);
+ int32x4_t round_hi = vrshrq_n_s32(diff_hi, 12);
+
+ *sum = vaddq_s32(*sum, round_lo);
+ *sum = vaddq_s32(*sum, round_hi);
+ *sse = vmlaq_u32(*sse, vreinterpretq_u32_s32(round_lo),
+ vreinterpretq_u32_s32(round_lo));
+ *sse = vmlaq_u32(*sse, vreinterpretq_u32_s32(round_hi),
+ vreinterpretq_u32_s32(round_hi));
+}
+
+// For 12-bit data, we can only accumulate up to 256 squared values in each
+// unsigned 32-bit lane (4095 * 4095 * 256 = 4292870400 < 2^32) before we have
+// to widen into 64-bit elements. Therefore blocks of size 32x64, 64x32, 64x64,
+// 64x128, 128x64, 128x128 are processed in a different helper function.
+static INLINE void highbd_obmc_variance_xlarge_neon(
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, int width, int h, int h_limit, uint64_t *sse,
+ int64_t *sum) {
+ uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre);
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+ uint64x2_t sse_u64 = vdupq_n_u64(0);
+
+  // 'h_limit' is the number of rows of |width| elements we can process before
+  // a 32-bit accumulator lane overflows. After hitting this limit we
+  // accumulate into 64-bit elements.
+ int h_tmp = h > h_limit ? h_limit : h;
+
+ do {
+ uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+ int j = 0;
+
+ do {
+ int i = 0;
+
+ do {
+ uint16x8_t pre0 = vld1q_u16(pre_ptr + i);
+ highbd_obmc_variance_8x1_s16_neon(pre0, wsrc, mask, &sse_u32[0],
+ &sum_s32);
+
+ uint16x8_t pre1 = vld1q_u16(pre_ptr + i + 8);
+ highbd_obmc_variance_8x1_s16_neon(pre1, wsrc + 8, mask + 8, &sse_u32[1],
+ &sum_s32);
+
+ i += 16;
+ wsrc += 16;
+ mask += 16;
+ } while (i < width);
+
+ pre_ptr += pre_stride;
+ j++;
+ } while (j < h_tmp);
+
+ sse_u64 = vpadalq_u32(sse_u64, sse_u32[0]);
+ sse_u64 = vpadalq_u32(sse_u64, sse_u32[1]);
+ h -= h_tmp;
+ } while (h != 0);
+
+ *sse = horizontal_add_u64x2(sse_u64);
+ *sum = horizontal_long_add_s32x4(sum_s32);
+}
+
+static INLINE void highbd_obmc_variance_xlarge_neon_128xh(
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, int h, uint64_t *sse, int64_t *sum) {
+ highbd_obmc_variance_xlarge_neon(pre, pre_stride, wsrc, mask, 128, h, 16, sse,
+ sum);
+}
+
+static INLINE void highbd_obmc_variance_xlarge_neon_64xh(
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, int h, uint64_t *sse, int64_t *sum) {
+ highbd_obmc_variance_xlarge_neon(pre, pre_stride, wsrc, mask, 64, h, 32, sse,
+ sum);
+}
+
+static INLINE void highbd_obmc_variance_xlarge_neon_32xh(
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, int h, uint64_t *sse, int64_t *sum) {
+ highbd_obmc_variance_xlarge_neon(pre, pre_stride, wsrc, mask, 32, h, 64, sse,
+ sum);
+}
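+// In each of the wrappers above width * h_limit == 2048, so every 32-bit lane
+// of the intermediate accumulators gathers at most 2048 / 8 = 256 squared
+// differences of up to 4095 * 4095, matching the 256-per-lane budget described
+// above.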
+
+static INLINE void highbd_obmc_variance_large_neon(
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, int width, int h, uint64_t *sse, int64_t *sum) {
+ uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+
+ do {
+ int i = 0;
+ do {
+ uint16x8_t pre0 = vld1q_u16(pre_ptr + i);
+ highbd_obmc_variance_8x1_s16_neon(pre0, wsrc, mask, &sse_u32, &sum_s32);
+
+ uint16x8_t pre1 = vld1q_u16(pre_ptr + i + 8);
+ highbd_obmc_variance_8x1_s16_neon(pre1, wsrc + 8, mask + 8, &sse_u32,
+ &sum_s32);
+
+ i += 16;
+ wsrc += 16;
+ mask += 16;
+ } while (i < width);
+
+ pre_ptr += pre_stride;
+ } while (--h != 0);
+
+ *sse = horizontal_long_add_u32x4(sse_u32);
+ *sum = horizontal_long_add_s32x4(sum_s32);
+}
+
+static INLINE void highbd_obmc_variance_neon_128xh(
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, int h, uint64_t *sse, int64_t *sum) {
+ highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 128, h, sse,
+ sum);
+}
+
+static INLINE void highbd_obmc_variance_neon_64xh(const uint8_t *pre,
+ int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ uint64_t *sse, int64_t *sum) {
+ highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 64, h, sse, sum);
+}
+
+static INLINE void highbd_obmc_variance_neon_32xh(const uint8_t *pre,
+ int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ uint64_t *sse, int64_t *sum) {
+ highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 32, h, sse, sum);
+}
+
+static INLINE void highbd_obmc_variance_neon_16xh(const uint8_t *pre,
+ int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ uint64_t *sse, int64_t *sum) {
+ highbd_obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 16, h, sse, sum);
+}
+
+static INLINE void highbd_obmc_variance_neon_8xh(const uint8_t *pre8,
+ int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ uint64_t *sse, int64_t *sum) {
+ uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+
+ do {
+ uint16x8_t pre_u16 = vld1q_u16(pre);
+
+ highbd_obmc_variance_8x1_s16_neon(pre_u16, wsrc, mask, &sse_u32, &sum_s32);
+
+ pre += pre_stride;
+ wsrc += 8;
+ mask += 8;
+ } while (--h != 0);
+
+ *sse = horizontal_long_add_u32x4(sse_u32);
+ *sum = horizontal_long_add_s32x4(sum_s32);
+}
+
+static INLINE void highbd_obmc_variance_neon_4xh(const uint8_t *pre8,
+ int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ uint64_t *sse, int64_t *sum) {
+ assert(h % 2 == 0);
+ uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+
+ do {
+ uint16x8_t pre_u16 = load_unaligned_u16_4x2(pre, pre_stride);
+
+ highbd_obmc_variance_8x1_s16_neon(pre_u16, wsrc, mask, &sse_u32, &sum_s32);
+
+ pre += 2 * pre_stride;
+ wsrc += 8;
+ mask += 8;
+ h -= 2;
+ } while (h != 0);
+
+ *sse = horizontal_long_add_u32x4(sse_u32);
+ *sum = horizontal_long_add_s32x4(sum_s32);
+}
+
+static INLINE void highbd_8_obmc_variance_cast(int64_t sum64, uint64_t sse64,
+ int *sum, unsigned int *sse) {
+ *sum = (int)sum64;
+ *sse = (unsigned int)sse64;
+}
+
+static INLINE void highbd_10_obmc_variance_cast(int64_t sum64, uint64_t sse64,
+ int *sum, unsigned int *sse) {
+ *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
+}
+
+static INLINE void highbd_12_obmc_variance_cast(int64_t sum64, uint64_t sse64,
+ int *sum, unsigned int *sse) {
+ *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
+}
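+// The casts above normalise the accumulated values back to 8-bit precision:
+// |sum| is rounded by (bd - 8) bits and |sse| by 2 * (bd - 8) bits, so the
+// same variance formula can be shared across bit depths.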
+
+#define HIGHBD_OBMC_VARIANCE_WXH_NEON(w, h, bitdepth) \
+ unsigned int aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ int64_t sum64; \
+ uint64_t sse64; \
+ highbd_obmc_variance_neon_##w##xh(pre, pre_stride, wsrc, mask, h, &sse64, \
+ &sum64); \
+ highbd_##bitdepth##_obmc_variance_cast(sum64, sse64, &sum, sse); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) / (w * h)); \
+ }
+
+#define HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(w, h, bitdepth) \
+ unsigned int aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ int64_t sum64; \
+ uint64_t sse64; \
+ highbd_obmc_variance_xlarge_neon_##w##xh(pre, pre_stride, wsrc, mask, h, \
+ &sse64, &sum64); \
+ highbd_##bitdepth##_obmc_variance_cast(sum64, sse64, &sum, sse); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) / (w * h)); \
+ }
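+// Each wrapper applies the usual variance identity to the normalised values:
+// variance = sse - sum^2 / (w * h).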
+
+// 8-bit
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 4, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 8, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 16, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 4, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 8, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 16, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 32, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 4, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 8, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 16, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 32, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 64, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 8, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 16, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 32, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 64, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 16, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 32, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 64, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 128, 8)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 64, 8)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 128, 8)
+
+// 10-bit
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 4, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 8, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 16, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 4, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 8, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 16, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 32, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 4, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 8, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 16, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 32, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 64, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 8, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 16, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 32, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 64, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 16, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 32, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 64, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 128, 10)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 64, 10)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(128, 128, 10)
+
+// 12-bit
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 4, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 8, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(4, 16, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 4, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 8, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 16, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(8, 32, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 4, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 8, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 16, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 32, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(16, 64, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 8, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 16, 12)
+HIGHBD_OBMC_VARIANCE_WXH_NEON(32, 32, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(32, 64, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_NEON(64, 16, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(64, 32, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(64, 64, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(64, 128, 12)
+
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(128, 64, 12)
+HIGHBD_OBMC_VARIANCE_WXH_XLARGE_NEON(128, 128, 12)
diff --git a/third_party/aom/aom_dsp/arm/highbd_quantize_neon.c b/third_party/aom/aom_dsp/arm/highbd_quantize_neon.c
new file mode 100644
index 0000000000..6149c9f13e
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_quantize_neon.c
@@ -0,0 +1,431 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/quantize.h"
+
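+// Horizontal add of the four 32-bit lanes. The pre-scan pass below applies it
+// to comparison masks (lanes of 0 or 0xffffffff), so a nonzero result means at
+// least one lane passed the zbin check.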
+static INLINE uint32_t sum_abs_coeff(const uint32x4_t a) {
+#if AOM_ARCH_AARCH64
+ return vaddvq_u32(a);
+#else
+ const uint64x2_t b = vpaddlq_u32(a);
+ const uint64x1_t c = vadd_u64(vget_low_u64(b), vget_high_u64(b));
+ return (uint32_t)vget_lane_u64(c, 0);
+#endif
+}
+
+static INLINE uint16x4_t
+quantize_4(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, int32x4_t v_quant_s32,
+ int32x4_t v_dequant_s32, int32x4_t v_round_s32, int32x4_t v_zbin_s32,
+ int32x4_t v_quant_shift_s32, int log_scale) {
+ const int32x4_t v_coeff = vld1q_s32(coeff_ptr);
+ const int32x4_t v_coeff_sign =
+ vreinterpretq_s32_u32(vcltq_s32(v_coeff, vdupq_n_s32(0)));
+ const int32x4_t v_abs_coeff = vabsq_s32(v_coeff);
+ // if (abs_coeff < zbins[rc != 0]),
+ const uint32x4_t v_zbin_mask = vcgeq_s32(v_abs_coeff, v_zbin_s32);
+ const int32x4_t v_log_scale = vdupq_n_s32(log_scale);
+ // const int64_t tmp = (int64_t)abs_coeff + log_scaled_round;
+ const int32x4_t v_tmp = vaddq_s32(v_abs_coeff, v_round_s32);
+ // const int32_t tmpw32 = tmp * wt;
+ const int32x4_t v_tmpw32 = vmulq_s32(v_tmp, vdupq_n_s32((1 << AOM_QM_BITS)));
+ // const int32_t tmp2 = (int32_t)((tmpw32 * quant64) >> 16);
+ const int32x4_t v_tmp2 = vqdmulhq_s32(v_tmpw32, v_quant_s32);
+ // const int32_t tmp3 =
+ // ((((tmp2 + tmpw32)<< log_scale) * (int64_t)(quant_shift << 15)) >> 32);
+ const int32x4_t v_tmp3 = vqdmulhq_s32(
+ vshlq_s32(vaddq_s32(v_tmp2, v_tmpw32), v_log_scale), v_quant_shift_s32);
+ // const int abs_qcoeff = vmask ? (int)tmp3 >> AOM_QM_BITS : 0;
+ const int32x4_t v_abs_qcoeff = vandq_s32(vreinterpretq_s32_u32(v_zbin_mask),
+ vshrq_n_s32(v_tmp3, AOM_QM_BITS));
+ // const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant_iwt) >> log_scale;
+ // vshlq_s32 will shift right if shift value is negative.
+ const int32x4_t v_abs_dqcoeff =
+ vshlq_s32(vmulq_s32(v_abs_qcoeff, v_dequant_s32), vnegq_s32(v_log_scale));
+ // qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ const int32x4_t v_qcoeff =
+ vsubq_s32(veorq_s32(v_abs_qcoeff, v_coeff_sign), v_coeff_sign);
+ // dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ const int32x4_t v_dqcoeff =
+ vsubq_s32(veorq_s32(v_abs_dqcoeff, v_coeff_sign), v_coeff_sign);
+
+ vst1q_s32(qcoeff_ptr, v_qcoeff);
+ vst1q_s32(dqcoeff_ptr, v_dqcoeff);
+
+ // Used to find eob.
+ const uint32x4_t nz_qcoeff_mask = vcgtq_s32(v_abs_qcoeff, vdupq_n_s32(0));
+ return vmovn_u32(nz_qcoeff_mask);
+}
+
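+// The scan index of each nonzero coefficient is incremented by one so that the
+// running maximum directly yields the usual eob convention: one past the last
+// nonzero coefficient, or 0 when all coefficients are zero.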
+static INLINE int16x8_t get_max_lane_eob(const int16_t *iscan,
+ int16x8_t v_eobmax,
+ uint16x8_t v_mask) {
+ const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+ const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1));
+ const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan_plus1, vdupq_n_s16(0));
+ return vmaxq_s16(v_eobmax, v_nz_iscan);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static INLINE void get_min_max_lane_eob(const int16_t *iscan,
+ int16x8_t *v_eobmin,
+ int16x8_t *v_eobmax, uint16x8_t v_mask,
+ intptr_t n_coeffs) {
+ const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
+ const int16x8_t v_nz_iscan_max = vbslq_s16(v_mask, v_iscan, vdupq_n_s16(-1));
+#if SKIP_EOB_FACTOR_ADJUST
+ const int16x8_t v_nz_iscan_min =
+ vbslq_s16(v_mask, v_iscan, vdupq_n_s16((int16_t)n_coeffs));
+ *v_eobmin = vminq_s16(*v_eobmin, v_nz_iscan_min);
+#else
+ (void)v_eobmin;
+#endif
+ *v_eobmax = vmaxq_s16(*v_eobmax, v_nz_iscan_max);
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
+#if AOM_ARCH_AARCH64
+ return (uint16_t)vmaxvq_s16(v_eobmax);
+#else
+ const int16x4_t v_eobmax_3210 =
+ vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax));
+ const int64x1_t v_eobmax_xx32 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+ const int16x4_t v_eobmax_tmp =
+ vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+ const int64x1_t v_eobmax_xxx3 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+ const int16x4_t v_eobmax_final =
+ vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+ return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+#endif
+}
+
+#if SKIP_EOB_FACTOR_ADJUST && !CONFIG_REALTIME_ONLY
+static INLINE uint16_t get_min_eob(int16x8_t v_eobmin) {
+#if AOM_ARCH_AARCH64
+ return (uint16_t)vminvq_s16(v_eobmin);
+#else
+ const int16x4_t v_eobmin_3210 =
+ vmin_s16(vget_low_s16(v_eobmin), vget_high_s16(v_eobmin));
+ const int64x1_t v_eobmin_xx32 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmin_3210), 32);
+ const int16x4_t v_eobmin_tmp =
+ vmin_s16(v_eobmin_3210, vreinterpret_s16_s64(v_eobmin_xx32));
+ const int64x1_t v_eobmin_xxx3 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmin_tmp), 16);
+ const int16x4_t v_eobmin_final =
+ vmin_s16(v_eobmin_tmp, vreinterpret_s16_s64(v_eobmin_xxx3));
+ return (uint16_t)vget_lane_s16(v_eobmin_final, 0);
+#endif
+}
+#endif // SKIP_EOB_FACTOR_ADJUST && !CONFIG_REALTIME_ONLY
+
+static void highbd_quantize_b_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const int log_scale) {
+ (void)scan;
+ const int16x4_t v_quant = vld1_s16(quant_ptr);
+ const int16x4_t v_dequant = vld1_s16(dequant_ptr);
+ const int16x4_t v_zero = vdup_n_s16(0);
+ const uint16x4_t v_round_select = vcgt_s16(vdup_n_s16(log_scale), v_zero);
+ const int16x4_t v_round_no_scale = vld1_s16(round_ptr);
+ const int16x4_t v_round_log_scale =
+ vqrdmulh_n_s16(v_round_no_scale, (int16_t)(1 << (15 - log_scale)));
+ const int16x4_t v_round =
+ vbsl_s16(v_round_select, v_round_log_scale, v_round_no_scale);
+ const int16x4_t v_quant_shift = vld1_s16(quant_shift_ptr);
+ const int16x4_t v_zbin_no_scale = vld1_s16(zbin_ptr);
+ const int16x4_t v_zbin_log_scale =
+ vqrdmulh_n_s16(v_zbin_no_scale, (int16_t)(1 << (15 - log_scale)));
+ const int16x4_t v_zbin =
+ vbsl_s16(v_round_select, v_zbin_log_scale, v_zbin_no_scale);
+ int32x4_t v_round_s32 = vmovl_s16(v_round);
+ int32x4_t v_quant_s32 = vshlq_n_s32(vmovl_s16(v_quant), 15);
+ int32x4_t v_dequant_s32 = vmovl_s16(v_dequant);
+ int32x4_t v_quant_shift_s32 = vshlq_n_s32(vmovl_s16(v_quant_shift), 15);
+ int32x4_t v_zbin_s32 = vmovl_s16(v_zbin);
+ uint16x4_t v_mask_lo, v_mask_hi;
+ int16x8_t v_eobmax = vdupq_n_s16(-1);
+
+ intptr_t non_zero_count = n_coeffs;
+
+ assert(n_coeffs > 8);
+ // Pre-scan pass
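+  // Walk backwards from the highest-frequency coefficients and drop trailing
+  // groups of 8 that all fall inside the zbin; they would quantize to zero.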
+ const int32x4_t v_zbin_s32x = vdupq_lane_s32(vget_low_s32(v_zbin_s32), 1);
+ intptr_t i = n_coeffs;
+ do {
+ const int32x4_t v_coeff_a = vld1q_s32(coeff_ptr + i - 4);
+ const int32x4_t v_coeff_b = vld1q_s32(coeff_ptr + i - 8);
+ const int32x4_t v_abs_coeff_a = vabsq_s32(v_coeff_a);
+ const int32x4_t v_abs_coeff_b = vabsq_s32(v_coeff_b);
+ const uint32x4_t v_mask_a = vcgeq_s32(v_abs_coeff_a, v_zbin_s32x);
+ const uint32x4_t v_mask_b = vcgeq_s32(v_abs_coeff_b, v_zbin_s32x);
+    // Discard the group if all eight coefficients are inside the base ZBIN.
+ if (sum_abs_coeff(v_mask_a) + sum_abs_coeff(v_mask_b) == 0) {
+ non_zero_count -= 8;
+ } else {
+ break;
+ }
+ i -= 8;
+ } while (i > 0);
+
+ const intptr_t remaining_zcoeffs = n_coeffs - non_zero_count;
+ memset(qcoeff_ptr + non_zero_count, 0,
+ remaining_zcoeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr + non_zero_count, 0,
+ remaining_zcoeffs * sizeof(*dqcoeff_ptr));
+
+ // DC and first 3 AC
+ v_mask_lo =
+ quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32, v_dequant_s32,
+ v_round_s32, v_zbin_s32, v_quant_shift_s32, log_scale);
+
+  // Overwrite the DC constants (lane 0) with the AC constants (lane 1), which
+  // apply to all remaining coefficients.
+ v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1);
+ v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1);
+ v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1);
+ v_quant_shift_s32 = vdupq_lane_s32(vget_low_s32(v_quant_shift_s32), 1);
+ v_zbin_s32 = vdupq_lane_s32(vget_low_s32(v_zbin_s32), 1);
+
+ // 4 more AC
+ v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+ v_quant_s32, v_dequant_s32, v_round_s32, v_zbin_s32,
+ v_quant_shift_s32, log_scale);
+
+ v_eobmax =
+ get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
+
+ intptr_t count = non_zero_count - 8;
+ for (; count > 0; count -= 8) {
+ coeff_ptr += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ iscan += 8;
+ v_mask_lo = quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32,
+ v_dequant_s32, v_round_s32, v_zbin_s32,
+ v_quant_shift_s32, log_scale);
+ v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+ v_quant_s32, v_dequant_s32, v_round_s32, v_zbin_s32,
+ v_quant_shift_s32, log_scale);
+ // Find the max lane eob for 8 coeffs.
+ v_eobmax =
+ get_max_lane_eob(iscan, v_eobmax, vcombine_u16(v_mask_lo, v_mask_hi));
+ }
+
+ *eob_ptr = get_max_eob(v_eobmax);
+}
+
+void aom_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ highbd_quantize_b_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, 0);
+}
+
+void aom_highbd_quantize_b_32x32_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ highbd_quantize_b_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, 1);
+}
+
+void aom_highbd_quantize_b_64x64_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ highbd_quantize_b_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, 2);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static void highbd_quantize_b_adaptive_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const int log_scale) {
+ (void)scan;
+ const int16x4_t v_quant = vld1_s16(quant_ptr);
+ const int16x4_t v_dequant = vld1_s16(dequant_ptr);
+ const int16x4_t v_zero = vdup_n_s16(0);
+ const uint16x4_t v_round_select = vcgt_s16(vdup_n_s16(log_scale), v_zero);
+ const int16x4_t v_round_no_scale = vld1_s16(round_ptr);
+ const int16x4_t v_round_log_scale =
+ vqrdmulh_n_s16(v_round_no_scale, (int16_t)(1 << (15 - log_scale)));
+ const int16x4_t v_round =
+ vbsl_s16(v_round_select, v_round_log_scale, v_round_no_scale);
+ const int16x4_t v_quant_shift = vld1_s16(quant_shift_ptr);
+ const int16x4_t v_zbin_no_scale = vld1_s16(zbin_ptr);
+ const int16x4_t v_zbin_log_scale =
+ vqrdmulh_n_s16(v_zbin_no_scale, (int16_t)(1 << (15 - log_scale)));
+ const int16x4_t v_zbin =
+ vbsl_s16(v_round_select, v_zbin_log_scale, v_zbin_no_scale);
+ int32x4_t v_round_s32 = vmovl_s16(v_round);
+ int32x4_t v_quant_s32 = vshlq_n_s32(vmovl_s16(v_quant), 15);
+ int32x4_t v_dequant_s32 = vmovl_s16(v_dequant);
+ int32x4_t v_quant_shift_s32 = vshlq_n_s32(vmovl_s16(v_quant_shift), 15);
+ int32x4_t v_zbin_s32 = vmovl_s16(v_zbin);
+ uint16x4_t v_mask_lo, v_mask_hi;
+ int16x8_t v_eobmax = vdupq_n_s16(-1);
+ int16x8_t v_eobmin = vdupq_n_s16((int16_t)n_coeffs);
+
+ assert(n_coeffs > 8);
+ // Pre-scan pass
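+  // As above, but the zbin used for skipping is widened by a dequant-scaled
+  // margin (EOB_FACTOR) before trailing groups are discarded.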
+ const int32x4_t v_zbin_s32x = vdupq_lane_s32(vget_low_s32(v_zbin_s32), 1);
+ const int prescan_add_1 =
+ ROUND_POWER_OF_TWO(dequant_ptr[1] * EOB_FACTOR, 7 + AOM_QM_BITS);
+ const int32x4_t v_zbin_prescan =
+ vaddq_s32(v_zbin_s32x, vdupq_n_s32(prescan_add_1));
+ intptr_t non_zero_count = n_coeffs;
+ intptr_t i = n_coeffs;
+ do {
+ const int32x4_t v_coeff_a = vld1q_s32(coeff_ptr + i - 4);
+ const int32x4_t v_coeff_b = vld1q_s32(coeff_ptr + i - 8);
+ const int32x4_t v_abs_coeff_a = vabsq_s32(v_coeff_a);
+ const int32x4_t v_abs_coeff_b = vabsq_s32(v_coeff_b);
+ const uint32x4_t v_mask_a = vcgeq_s32(v_abs_coeff_a, v_zbin_prescan);
+ const uint32x4_t v_mask_b = vcgeq_s32(v_abs_coeff_b, v_zbin_prescan);
+    // Discard the group if all eight coefficients are inside the widened ZBIN.
+ if (sum_abs_coeff(v_mask_a) + sum_abs_coeff(v_mask_b) == 0) {
+ non_zero_count -= 8;
+ } else {
+ break;
+ }
+ i -= 8;
+ } while (i > 0);
+
+ const intptr_t remaining_zcoeffs = n_coeffs - non_zero_count;
+ memset(qcoeff_ptr + non_zero_count, 0,
+ remaining_zcoeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr + non_zero_count, 0,
+ remaining_zcoeffs * sizeof(*dqcoeff_ptr));
+
+ // DC and first 3 AC
+ v_mask_lo =
+ quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32, v_dequant_s32,
+ v_round_s32, v_zbin_s32, v_quant_shift_s32, log_scale);
+
+  // Overwrite the DC constants (lane 0) with the AC constants (lane 1), which
+  // apply to all remaining coefficients.
+ v_round_s32 = vdupq_lane_s32(vget_low_s32(v_round_s32), 1);
+ v_quant_s32 = vdupq_lane_s32(vget_low_s32(v_quant_s32), 1);
+ v_dequant_s32 = vdupq_lane_s32(vget_low_s32(v_dequant_s32), 1);
+ v_quant_shift_s32 = vdupq_lane_s32(vget_low_s32(v_quant_shift_s32), 1);
+ v_zbin_s32 = vdupq_lane_s32(vget_low_s32(v_zbin_s32), 1);
+
+ // 4 more AC
+ v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+ v_quant_s32, v_dequant_s32, v_round_s32, v_zbin_s32,
+ v_quant_shift_s32, log_scale);
+
+ get_min_max_lane_eob(iscan, &v_eobmin, &v_eobmax,
+ vcombine_u16(v_mask_lo, v_mask_hi), n_coeffs);
+
+ intptr_t count = non_zero_count - 8;
+ for (; count > 0; count -= 8) {
+ coeff_ptr += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ iscan += 8;
+ v_mask_lo = quantize_4(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, v_quant_s32,
+ v_dequant_s32, v_round_s32, v_zbin_s32,
+ v_quant_shift_s32, log_scale);
+ v_mask_hi = quantize_4(coeff_ptr + 4, qcoeff_ptr + 4, dqcoeff_ptr + 4,
+ v_quant_s32, v_dequant_s32, v_round_s32, v_zbin_s32,
+ v_quant_shift_s32, log_scale);
+
+ get_min_max_lane_eob(iscan, &v_eobmin, &v_eobmax,
+ vcombine_u16(v_mask_lo, v_mask_hi), n_coeffs);
+ }
+
+ int eob = get_max_eob(v_eobmax);
+
+#if SKIP_EOB_FACTOR_ADJUST
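+  // If the only candidate left is a lone +/-1 that still falls inside the
+  // widened zbin, zero it and signal an empty block.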
+ const int first = get_min_eob(v_eobmin);
+ if (eob >= 0 && first == eob) {
+ const int rc = scan[eob];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+ const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ const int coeff = coeff_ptr[rc] * wt;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) &&
+ coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ eob = -1;
+ }
+ }
+ }
+#endif // SKIP_EOB_FACTOR_ADJUST
+ *eob_ptr = eob + 1;
+}
+
+void aom_highbd_quantize_b_adaptive_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ highbd_quantize_b_adaptive_neon(
+ coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr,
+ qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 0);
+}
+
+void aom_highbd_quantize_b_32x32_adaptive_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ highbd_quantize_b_adaptive_neon(
+ coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr,
+ qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 1);
+}
+
+void aom_highbd_quantize_b_64x64_adaptive_neon(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ highbd_quantize_b_adaptive_neon(
+ coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr, quant_shift_ptr,
+ qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 2);
+}
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/third_party/aom/aom_dsp/arm/highbd_sad_neon.c b/third_party/aom/aom_dsp/arm/highbd_sad_neon.c
new file mode 100644
index 0000000000..d51f639de6
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_sad_neon.c
@@ -0,0 +1,509 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE uint32_t highbd_sad4xh_small_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint16x4_t s = vld1_u16(src16_ptr);
+ uint16x4_t r = vld1_u16(ref16_ptr);
+ sum = vabal_u16(sum, s, r);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(sum);
+}
+
+static INLINE uint32_t highbd_sad8xh_small_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
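+  // A 16-bit accumulator is sufficient here: this path handles at most 16 rows
+  // of 12-bit absolute differences (16 * 4095 < 65536).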
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h;
+ do {
+ uint16x8_t s = vld1q_u16(src16_ptr);
+ uint16x8_t r = vld1q_u16(ref16_ptr);
+ sum = vabaq_u16(sum, s, r);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u16x8(sum);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static INLINE uint32_t highbd_sad8xh_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ uint32x4_t sum_u32 = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint16x8_t s = vld1q_u16(src16_ptr);
+ uint16x8_t r = vld1q_u16(ref16_ptr);
+ uint16x8_t sum_u16 = vabdq_u16(s, r);
+ sum_u32 = vpadalq_u16(sum_u32, sum_u16);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(sum_u32);
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static INLINE uint32_t highbd_sad16xh_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ uint16x8_t s0 = vld1q_u16(src16_ptr);
+ uint16x8_t r0 = vld1q_u16(ref16_ptr);
+ uint16x8_t diff0 = vabdq_u16(s0, r0);
+ sum[0] = vpadalq_u16(sum[0], diff0);
+
+ uint16x8_t s1 = vld1q_u16(src16_ptr + 8);
+ uint16x8_t r1 = vld1q_u16(ref16_ptr + 8);
+ uint16x8_t diff1 = vabdq_u16(s1, r1);
+ sum[1] = vpadalq_u16(sum[1], diff1);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ } while (--i != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE uint32_t highbd_sadwxh_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int w, int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src16_ptr + j);
+ uint16x8_t r0 = vld1q_u16(ref16_ptr + j);
+ uint16x8_t diff0 = vabdq_u16(s0, r0);
+ sum[0] = vpadalq_u16(sum[0], diff0);
+
+ uint16x8_t s1 = vld1q_u16(src16_ptr + j + 8);
+ uint16x8_t r1 = vld1q_u16(ref16_ptr + j + 8);
+ uint16x8_t diff1 = vabdq_u16(s1, r1);
+ sum[1] = vpadalq_u16(sum[1], diff1);
+
+ uint16x8_t s2 = vld1q_u16(src16_ptr + j + 16);
+ uint16x8_t r2 = vld1q_u16(ref16_ptr + j + 16);
+ uint16x8_t diff2 = vabdq_u16(s2, r2);
+ sum[2] = vpadalq_u16(sum[2], diff2);
+
+ uint16x8_t s3 = vld1q_u16(src16_ptr + j + 24);
+ uint16x8_t r3 = vld1q_u16(ref16_ptr + j + 24);
+ uint16x8_t diff3 = vabdq_u16(s3, r3);
+ sum[3] = vpadalq_u16(sum[3], diff3);
+
+ j += 32;
+ } while (j < w);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ } while (--i != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ sum[2] = vaddq_u32(sum[2], sum[3]);
+ sum[0] = vaddq_u32(sum[0], sum[2]);
+
+ return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE unsigned int highbd_sad128xh_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ return highbd_sadwxh_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128,
+ h);
+}
+
+static INLINE unsigned int highbd_sad64xh_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ return highbd_sadwxh_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64,
+ h);
+}
+
+static INLINE unsigned int highbd_sad32xh_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ return highbd_sadwxh_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32,
+ h);
+}
+
+#define HBD_SAD_WXH_SMALL_NEON(w, h) \
+ unsigned int aom_highbd_sad##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return highbd_sad##w##xh_small_neon(src, src_stride, ref, ref_stride, \
+ (h)); \
+ }
+
+#define HBD_SAD_WXH_LARGE_NEON(w, h) \
+ unsigned int aom_highbd_sad##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return highbd_sad##w##xh_large_neon(src, src_stride, ref, ref_stride, \
+ (h)); \
+ }
+
+HBD_SAD_WXH_SMALL_NEON(4, 4)
+HBD_SAD_WXH_SMALL_NEON(4, 8)
+
+HBD_SAD_WXH_SMALL_NEON(8, 4)
+HBD_SAD_WXH_SMALL_NEON(8, 8)
+HBD_SAD_WXH_SMALL_NEON(8, 16)
+
+HBD_SAD_WXH_LARGE_NEON(16, 8)
+HBD_SAD_WXH_LARGE_NEON(16, 16)
+HBD_SAD_WXH_LARGE_NEON(16, 32)
+
+HBD_SAD_WXH_LARGE_NEON(32, 16)
+HBD_SAD_WXH_LARGE_NEON(32, 32)
+HBD_SAD_WXH_LARGE_NEON(32, 64)
+
+HBD_SAD_WXH_LARGE_NEON(64, 32)
+HBD_SAD_WXH_LARGE_NEON(64, 64)
+HBD_SAD_WXH_LARGE_NEON(64, 128)
+
+HBD_SAD_WXH_LARGE_NEON(128, 64)
+HBD_SAD_WXH_LARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SAD_WXH_SMALL_NEON(4, 16)
+
+HBD_SAD_WXH_LARGE_NEON(8, 32)
+
+HBD_SAD_WXH_LARGE_NEON(16, 4)
+HBD_SAD_WXH_LARGE_NEON(16, 64)
+
+HBD_SAD_WXH_LARGE_NEON(32, 8)
+
+HBD_SAD_WXH_LARGE_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
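+// The 'skip' variants compute the SAD over every other row (stride doubled)
+// and scale the result by 2 to approximate the full-height SAD.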
+#define HBD_SAD_SKIP_WXH_SMALL_NEON(w, h) \
+ unsigned int aom_highbd_sad_skip_##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * highbd_sad##w##xh_small_neon(src, 2 * src_stride, ref, \
+ 2 * ref_stride, (h) / 2); \
+ }
+
+#define HBD_SAD_SKIP_WXH_LARGE_NEON(w, h) \
+ unsigned int aom_highbd_sad_skip_##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * highbd_sad##w##xh_large_neon(src, 2 * src_stride, ref, \
+ 2 * ref_stride, (h) / 2); \
+ }
+
+HBD_SAD_SKIP_WXH_SMALL_NEON(4, 4)
+HBD_SAD_SKIP_WXH_SMALL_NEON(4, 8)
+
+HBD_SAD_SKIP_WXH_SMALL_NEON(8, 4)
+HBD_SAD_SKIP_WXH_SMALL_NEON(8, 8)
+HBD_SAD_SKIP_WXH_SMALL_NEON(8, 16)
+
+HBD_SAD_SKIP_WXH_LARGE_NEON(16, 8)
+HBD_SAD_SKIP_WXH_LARGE_NEON(16, 16)
+HBD_SAD_SKIP_WXH_LARGE_NEON(16, 32)
+
+HBD_SAD_SKIP_WXH_LARGE_NEON(32, 16)
+HBD_SAD_SKIP_WXH_LARGE_NEON(32, 32)
+HBD_SAD_SKIP_WXH_LARGE_NEON(32, 64)
+
+HBD_SAD_SKIP_WXH_LARGE_NEON(64, 32)
+HBD_SAD_SKIP_WXH_LARGE_NEON(64, 64)
+HBD_SAD_SKIP_WXH_LARGE_NEON(64, 128)
+
+HBD_SAD_SKIP_WXH_LARGE_NEON(128, 64)
+HBD_SAD_SKIP_WXH_LARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SAD_SKIP_WXH_SMALL_NEON(4, 16)
+
+HBD_SAD_SKIP_WXH_SMALL_NEON(8, 32)
+
+HBD_SAD_SKIP_WXH_LARGE_NEON(16, 4)
+HBD_SAD_SKIP_WXH_LARGE_NEON(16, 64)
+
+HBD_SAD_SKIP_WXH_LARGE_NEON(32, 8)
+
+HBD_SAD_SKIP_WXH_LARGE_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
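+// The 'avg' helpers measure the SAD between the source block and the rounding
+// average (vrhadd) of the reference block and a second predictor.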
+static INLINE uint32_t highbd_sad4xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint16x4_t s = vld1_u16(src16_ptr);
+ uint16x4_t r = vld1_u16(ref16_ptr);
+ uint16x4_t p = vld1_u16(pred16_ptr);
+
+ uint16x4_t avg = vrhadd_u16(r, p);
+ sum = vabal_u16(sum, s, avg);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ pred16_ptr += 4;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(sum);
+}
+
+static INLINE uint32_t highbd_sad8xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint16x8_t s = vld1q_u16(src16_ptr);
+ uint16x8_t r = vld1q_u16(ref16_ptr);
+ uint16x8_t p = vld1q_u16(pred16_ptr);
+
+ uint16x8_t avg = vrhaddq_u16(r, p);
+ uint16x8_t diff = vabdq_u16(s, avg);
+ sum = vpadalq_u16(sum, diff);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ pred16_ptr += 8;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(sum);
+}
+
+static INLINE uint32_t highbd_sad16xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ uint16x8_t s0, s1, r0, r1, p0, p1;
+ uint16x8_t avg0, avg1, diff0, diff1;
+
+ s0 = vld1q_u16(src16_ptr);
+ r0 = vld1q_u16(ref16_ptr);
+ p0 = vld1q_u16(pred16_ptr);
+ avg0 = vrhaddq_u16(r0, p0);
+ diff0 = vabdq_u16(s0, avg0);
+ sum[0] = vpadalq_u16(sum[0], diff0);
+
+ s1 = vld1q_u16(src16_ptr + 8);
+ r1 = vld1q_u16(ref16_ptr + 8);
+ p1 = vld1q_u16(pred16_ptr + 8);
+ avg1 = vrhaddq_u16(r1, p1);
+ diff1 = vabdq_u16(s1, avg1);
+ sum[1] = vpadalq_u16(sum[1], diff1);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ pred16_ptr += 16;
+ } while (--i != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE uint32_t highbd_sadwxh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int w, int h,
+ const uint8_t *second_pred) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr = CONVERT_TO_SHORTPTR(ref_ptr);
+ const uint16_t *pred16_ptr = CONVERT_TO_SHORTPTR(second_pred);
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3;
+ uint16x8_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3;
+
+ s0 = vld1q_u16(src16_ptr + j);
+ r0 = vld1q_u16(ref16_ptr + j);
+ p0 = vld1q_u16(pred16_ptr + j);
+ avg0 = vrhaddq_u16(r0, p0);
+ diff0 = vabdq_u16(s0, avg0);
+ sum[0] = vpadalq_u16(sum[0], diff0);
+
+ s1 = vld1q_u16(src16_ptr + j + 8);
+ r1 = vld1q_u16(ref16_ptr + j + 8);
+ p1 = vld1q_u16(pred16_ptr + j + 8);
+ avg1 = vrhaddq_u16(r1, p1);
+ diff1 = vabdq_u16(s1, avg1);
+ sum[1] = vpadalq_u16(sum[1], diff1);
+
+ s2 = vld1q_u16(src16_ptr + j + 16);
+ r2 = vld1q_u16(ref16_ptr + j + 16);
+ p2 = vld1q_u16(pred16_ptr + j + 16);
+ avg2 = vrhaddq_u16(r2, p2);
+ diff2 = vabdq_u16(s2, avg2);
+ sum[2] = vpadalq_u16(sum[2], diff2);
+
+ s3 = vld1q_u16(src16_ptr + j + 24);
+ r3 = vld1q_u16(ref16_ptr + j + 24);
+ p3 = vld1q_u16(pred16_ptr + j + 24);
+ avg3 = vrhaddq_u16(r3, p3);
+ diff3 = vabdq_u16(s3, avg3);
+ sum[3] = vpadalq_u16(sum[3], diff3);
+
+ j += 32;
+ } while (j < w);
+
+ src16_ptr += src_stride;
+ ref16_ptr += ref_stride;
+ pred16_ptr += w;
+ } while (--i != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ sum[2] = vaddq_u32(sum[2], sum[3]);
+ sum[0] = vaddq_u32(sum[0], sum[2]);
+
+ return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE unsigned int highbd_sad128xh_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred) {
+ return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 128,
+ h, second_pred);
+}
+
+static INLINE unsigned int highbd_sad64xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 64, h,
+ second_pred);
+}
+
+static INLINE unsigned int highbd_sad32xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ return highbd_sadwxh_avg_neon(src_ptr, src_stride, ref_ptr, ref_stride, 32, h,
+ second_pred);
+}
+
+#define HBD_SAD_WXH_AVG_NEON(w, h) \
+ uint32_t aom_highbd_sad##w##x##h##_avg_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ return highbd_sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \
+ second_pred); \
+ }
+
+HBD_SAD_WXH_AVG_NEON(4, 4)
+HBD_SAD_WXH_AVG_NEON(4, 8)
+
+HBD_SAD_WXH_AVG_NEON(8, 4)
+HBD_SAD_WXH_AVG_NEON(8, 8)
+HBD_SAD_WXH_AVG_NEON(8, 16)
+
+HBD_SAD_WXH_AVG_NEON(16, 8)
+HBD_SAD_WXH_AVG_NEON(16, 16)
+HBD_SAD_WXH_AVG_NEON(16, 32)
+
+HBD_SAD_WXH_AVG_NEON(32, 16)
+HBD_SAD_WXH_AVG_NEON(32, 32)
+HBD_SAD_WXH_AVG_NEON(32, 64)
+
+HBD_SAD_WXH_AVG_NEON(64, 32)
+HBD_SAD_WXH_AVG_NEON(64, 64)
+HBD_SAD_WXH_AVG_NEON(64, 128)
+
+HBD_SAD_WXH_AVG_NEON(128, 64)
+HBD_SAD_WXH_AVG_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SAD_WXH_AVG_NEON(4, 16)
+
+HBD_SAD_WXH_AVG_NEON(8, 32)
+
+HBD_SAD_WXH_AVG_NEON(16, 4)
+HBD_SAD_WXH_AVG_NEON(16, 64)
+
+HBD_SAD_WXH_AVG_NEON(32, 8)
+
+HBD_SAD_WXH_AVG_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/third_party/aom/aom_dsp/arm/highbd_sadxd_neon.c b/third_party/aom/aom_dsp/arm/highbd_sadxd_neon.c
new file mode 100644
index 0000000000..85ca6732a8
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_sadxd_neon.c
@@ -0,0 +1,617 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
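+// Each xNd helper below computes the SAD of one source block against several
+// candidate reference blocks (four for x4d, three for x3d) in a single pass.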
+static INLINE void highbd_sad4xhx4d_small_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+ const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ int i = 0;
+ do {
+ uint16x4_t s = vld1_u16(src16_ptr + i * src_stride);
+ uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride);
+ uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride);
+ uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride);
+ uint16x4_t r3 = vld1_u16(ref16_ptr3 + i * ref_stride);
+
+ sum[0] = vabal_u16(sum[0], s, r0);
+ sum[1] = vabal_u16(sum[1], s, r1);
+ sum[2] = vabal_u16(sum[2], s, r2);
+ sum[3] = vabal_u16(sum[3], s, r3);
+
+ } while (++i < h);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+
+static INLINE void highbd_sad8xhx4d_small_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+ const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint32x4_t sum_u32[4];
+
+ int i = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
+
+ sum[0] = vabaq_u16(sum[0], s, vld1q_u16(ref16_ptr0 + i * ref_stride));
+ sum[1] = vabaq_u16(sum[1], s, vld1q_u16(ref16_ptr1 + i * ref_stride));
+ sum[2] = vabaq_u16(sum[2], s, vld1q_u16(ref16_ptr2 + i * ref_stride));
+ sum[3] = vabaq_u16(sum[3], s, vld1q_u16(ref16_ptr3 + i * ref_stride));
+
+ } while (++i < h);
+
+ sum_u32[0] = vpaddlq_u16(sum[0]);
+ sum_u32[1] = vpaddlq_u16(sum[1]);
+ sum_u32[2] = vpaddlq_u16(sum[2]);
+ sum_u32[3] = vpaddlq_u16(sum[3]);
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32));
+}
+
+static INLINE void sad8_neon(uint16x8_t src, uint16x8_t ref,
+ uint32x4_t *const sad_sum) {
+ uint16x8_t abs_diff = vabdq_u16(src, ref);
+ *sad_sum = vpadalq_u16(*sad_sum, abs_diff);
+}
+
+#if !CONFIG_REALTIME_ONLY
+static INLINE void highbd_sad8xhx4d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+ const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ int i = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
+ sad8_neon(s, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum[0]);
+ sad8_neon(s, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum[1]);
+ sad8_neon(s, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum[2]);
+ sad8_neon(s, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum[3]);
+
+ } while (++i < h);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static INLINE void highbd_sad16xhx4d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+ const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+ uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum[4];
+
+ int i = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride);
+ sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride), &sum_lo[3]);
+
+ uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + 8);
+ sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + 8), &sum_hi[3]);
+
+ } while (++i < h);
+
+ sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+ sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+ sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+ sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+
+static INLINE void highbd_sadwxhx4d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int w, int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+ const uint16_t *ref16_ptr3 = CONVERT_TO_SHORTPTR(ref_ptr[3]);
+
+ uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum[4];
+
+ int i = 0;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride + j);
+ sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr3 + i * ref_stride + j), &sum_lo[3]);
+
+ uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + j + 8);
+ sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 8), &sum_hi[3]);
+
+ uint16x8_t s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16);
+ sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16),
+ &sum_lo[0]);
+ sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16),
+ &sum_lo[1]);
+ sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16),
+ &sum_lo[2]);
+ sad8_neon(s2, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 16),
+ &sum_lo[3]);
+
+ uint16x8_t s3 = vld1q_u16(src16_ptr + i * src_stride + j + 24);
+ sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24),
+ &sum_hi[0]);
+ sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24),
+ &sum_hi[1]);
+ sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24),
+ &sum_hi[2]);
+ sad8_neon(s3, vld1q_u16(ref16_ptr3 + i * ref_stride + j + 24),
+ &sum_hi[3]);
+
+ j += 32;
+ } while (j < w);
+
+ } while (++i < h);
+
+ sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+ sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+ sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+ sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+
+static INLINE void highbd_sad128xhx4d_large_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4], int h) {
+ highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res,
+ 128, h);
+}
+
+static INLINE void highbd_sad64xhx4d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64,
+ h);
+}
+
+static INLINE void highbd_sad32xhx4d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ highbd_sadwxhx4d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32,
+ h);
+}
+
+#define HBD_SAD_WXH_4D_SMALL_NEON(w, h) \
+ void aom_highbd_sad##w##x##h##x4d_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad##w##xhx4d_small_neon(src, src_stride, ref_array, ref_stride, \
+ sad_array, (h)); \
+ }
+
+#define HBD_SAD_WXH_4D_LARGE_NEON(w, h) \
+ void aom_highbd_sad##w##x##h##x4d_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad##w##xhx4d_large_neon(src, src_stride, ref_array, ref_stride, \
+ sad_array, (h)); \
+ }
+
+HBD_SAD_WXH_4D_SMALL_NEON(4, 4)
+HBD_SAD_WXH_4D_SMALL_NEON(4, 8)
+
+HBD_SAD_WXH_4D_SMALL_NEON(8, 4)
+HBD_SAD_WXH_4D_SMALL_NEON(8, 8)
+HBD_SAD_WXH_4D_SMALL_NEON(8, 16)
+
+HBD_SAD_WXH_4D_LARGE_NEON(16, 8)
+HBD_SAD_WXH_4D_LARGE_NEON(16, 16)
+HBD_SAD_WXH_4D_LARGE_NEON(16, 32)
+
+HBD_SAD_WXH_4D_LARGE_NEON(32, 16)
+HBD_SAD_WXH_4D_LARGE_NEON(32, 32)
+HBD_SAD_WXH_4D_LARGE_NEON(32, 64)
+
+HBD_SAD_WXH_4D_LARGE_NEON(64, 32)
+HBD_SAD_WXH_4D_LARGE_NEON(64, 64)
+HBD_SAD_WXH_4D_LARGE_NEON(64, 128)
+
+HBD_SAD_WXH_4D_LARGE_NEON(128, 64)
+HBD_SAD_WXH_4D_LARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SAD_WXH_4D_SMALL_NEON(4, 16)
+
+HBD_SAD_WXH_4D_LARGE_NEON(8, 32)
+
+HBD_SAD_WXH_4D_LARGE_NEON(16, 4)
+HBD_SAD_WXH_4D_LARGE_NEON(16, 64)
+
+HBD_SAD_WXH_4D_LARGE_NEON(32, 8)
+
+HBD_SAD_WXH_4D_LARGE_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#define HBD_SAD_SKIP_WXH_4D_SMALL_NEON(w, h) \
+ void aom_highbd_sad_skip_##w##x##h##x4d_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad##w##xhx4d_small_neon(src, 2 * src_stride, ref_array, \
+ 2 * ref_stride, sad_array, ((h) >> 1)); \
+ sad_array[0] <<= 1; \
+ sad_array[1] <<= 1; \
+ sad_array[2] <<= 1; \
+ sad_array[3] <<= 1; \
+ }
+
+#define HBD_SAD_SKIP_WXH_4D_LARGE_NEON(w, h) \
+ void aom_highbd_sad_skip_##w##x##h##x4d_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad##w##xhx4d_large_neon(src, 2 * src_stride, ref_array, \
+ 2 * ref_stride, sad_array, ((h) >> 1)); \
+ sad_array[0] <<= 1; \
+ sad_array[1] <<= 1; \
+ sad_array[2] <<= 1; \
+ sad_array[3] <<= 1; \
+ }
+
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(4, 4)
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(4, 8)
+
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 4)
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 8)
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 16)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 8)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 16)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 32)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 16)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 32)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 64)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 32)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 64)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 128)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(128, 64)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(4, 16)
+
+HBD_SAD_SKIP_WXH_4D_SMALL_NEON(8, 32)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 4)
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(16, 64)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(32, 8)
+
+HBD_SAD_SKIP_WXH_4D_LARGE_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+static INLINE void highbd_sad4xhx3d_small_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+ uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = 0;
+ do {
+ uint16x4_t s = vld1_u16(src16_ptr + i * src_stride);
+ uint16x4_t r0 = vld1_u16(ref16_ptr0 + i * ref_stride);
+ uint16x4_t r1 = vld1_u16(ref16_ptr1 + i * ref_stride);
+ uint16x4_t r2 = vld1_u16(ref16_ptr2 + i * ref_stride);
+
+ sum[0] = vabal_u16(sum[0], s, r0);
+ sum[1] = vabal_u16(sum[1], s, r1);
+ sum[2] = vabal_u16(sum[2], s, r2);
+
+ } while (++i < h);
+
+ res[0] = horizontal_add_u32x4(sum[0]);
+ res[1] = horizontal_add_u32x4(sum[1]);
+ res[2] = horizontal_add_u32x4(sum[2]);
+}
+
+static INLINE void highbd_sad8xhx3d_small_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+ uint16x8_t sum[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
+
+ int i = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
+
+ sum[0] = vabaq_u16(sum[0], s, vld1q_u16(ref16_ptr0 + i * ref_stride));
+ sum[1] = vabaq_u16(sum[1], s, vld1q_u16(ref16_ptr1 + i * ref_stride));
+ sum[2] = vabaq_u16(sum[2], s, vld1q_u16(ref16_ptr2 + i * ref_stride));
+
+ } while (++i < h);
+
+ res[0] = horizontal_add_u32x4(vpaddlq_u16(sum[0]));
+ res[1] = horizontal_add_u32x4(vpaddlq_u16(sum[1]));
+ res[2] = horizontal_add_u32x4(vpaddlq_u16(sum[2]));
+}
+
+#if !CONFIG_REALTIME_ONLY
+static INLINE void highbd_sad8xhx3d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+ uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src16_ptr + i * src_stride);
+ uint16x8_t r0 = vld1q_u16(ref16_ptr0 + i * ref_stride);
+ uint16x8_t r1 = vld1q_u16(ref16_ptr1 + i * ref_stride);
+ uint16x8_t r2 = vld1q_u16(ref16_ptr2 + i * ref_stride);
+
+ sad8_neon(s, r0, &sum[0]);
+ sad8_neon(s, r1, &sum[1]);
+ sad8_neon(s, r2, &sum[2]);
+
+ } while (++i < h);
+
+ res[0] = horizontal_add_u32x4(sum[0]);
+ res[1] = horizontal_add_u32x4(sum[1]);
+ res[2] = horizontal_add_u32x4(sum[2]);
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+static INLINE void highbd_sad16xhx3d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+ uint32x4_t sum_lo[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+ uint32x4_t sum_hi[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride);
+ sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride), &sum_lo[0]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride), &sum_lo[1]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride), &sum_lo[2]);
+
+ uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + 8);
+ sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + 8), &sum_hi[0]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + 8), &sum_hi[1]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + 8), &sum_hi[2]);
+
+ } while (++i < h);
+
+ res[0] = horizontal_add_u32x4(vaddq_u32(sum_lo[0], sum_hi[0]));
+ res[1] = horizontal_add_u32x4(vaddq_u32(sum_lo[1], sum_hi[1]));
+ res[2] = horizontal_add_u32x4(vaddq_u32(sum_lo[2], sum_hi[2]));
+}
+
+static INLINE void highbd_sadwxhx3d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int w, int h) {
+ const uint16_t *src16_ptr = CONVERT_TO_SHORTPTR(src_ptr);
+ const uint16_t *ref16_ptr0 = CONVERT_TO_SHORTPTR(ref_ptr[0]);
+ const uint16_t *ref16_ptr1 = CONVERT_TO_SHORTPTR(ref_ptr[1]);
+ const uint16_t *ref16_ptr2 = CONVERT_TO_SHORTPTR(ref_ptr[2]);
+
+ uint32x4_t sum_lo[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+ uint32x4_t sum_hi[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+ uint32x4_t sum[3];
+
+ int i = 0;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src16_ptr + i * src_stride + j);
+ sad8_neon(s0, vld1q_u16(ref16_ptr0 + i * ref_stride + j), &sum_lo[0]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr1 + i * ref_stride + j), &sum_lo[1]);
+ sad8_neon(s0, vld1q_u16(ref16_ptr2 + i * ref_stride + j), &sum_lo[2]);
+
+ uint16x8_t s1 = vld1q_u16(src16_ptr + i * src_stride + j + 8);
+ sad8_neon(s1, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 8), &sum_hi[0]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 8), &sum_hi[1]);
+ sad8_neon(s1, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 8), &sum_hi[2]);
+
+ uint16x8_t s2 = vld1q_u16(src16_ptr + i * src_stride + j + 16);
+ sad8_neon(s2, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 16),
+ &sum_lo[0]);
+ sad8_neon(s2, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 16),
+ &sum_lo[1]);
+ sad8_neon(s2, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 16),
+ &sum_lo[2]);
+
+ uint16x8_t s3 = vld1q_u16(src16_ptr + i * src_stride + j + 24);
+ sad8_neon(s3, vld1q_u16(ref16_ptr0 + i * ref_stride + j + 24),
+ &sum_hi[0]);
+ sad8_neon(s3, vld1q_u16(ref16_ptr1 + i * ref_stride + j + 24),
+ &sum_hi[1]);
+ sad8_neon(s3, vld1q_u16(ref16_ptr2 + i * ref_stride + j + 24),
+ &sum_hi[2]);
+
+ j += 32;
+ } while (j < w);
+
+ } while (++i < h);
+
+ sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+ sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+ sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+
+ res[0] = horizontal_add_u32x4(sum[0]);
+ res[1] = horizontal_add_u32x4(sum[1]);
+ res[2] = horizontal_add_u32x4(sum[2]);
+}
+
+static INLINE void highbd_sad128xhx3d_large_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4], int h) {
+ highbd_sadwxhx3d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res,
+ 128, h);
+}
+
+static INLINE void highbd_sad64xhx3d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ highbd_sadwxhx3d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 64,
+ h);
+}
+
+static INLINE void highbd_sad32xhx3d_large_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *const ref_ptr[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ highbd_sadwxhx3d_large_neon(src_ptr, src_stride, ref_ptr, ref_stride, res, 32,
+ h);
+}
+
+#define HBD_SAD_WXH_3D_SMALL_NEON(w, h) \
+ void aom_highbd_sad##w##x##h##x3d_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad##w##xhx3d_small_neon(src, src_stride, ref_array, ref_stride, \
+ sad_array, (h)); \
+ }
+
+#define HBD_SAD_WXH_3D_LARGE_NEON(w, h) \
+ void aom_highbd_sad##w##x##h##x3d_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ highbd_sad##w##xhx3d_large_neon(src, src_stride, ref_array, ref_stride, \
+ sad_array, (h)); \
+ }
+
+HBD_SAD_WXH_3D_SMALL_NEON(4, 4)
+HBD_SAD_WXH_3D_SMALL_NEON(4, 8)
+
+HBD_SAD_WXH_3D_SMALL_NEON(8, 4)
+HBD_SAD_WXH_3D_SMALL_NEON(8, 8)
+HBD_SAD_WXH_3D_SMALL_NEON(8, 16)
+
+HBD_SAD_WXH_3D_LARGE_NEON(16, 8)
+HBD_SAD_WXH_3D_LARGE_NEON(16, 16)
+HBD_SAD_WXH_3D_LARGE_NEON(16, 32)
+
+HBD_SAD_WXH_3D_LARGE_NEON(32, 16)
+HBD_SAD_WXH_3D_LARGE_NEON(32, 32)
+HBD_SAD_WXH_3D_LARGE_NEON(32, 64)
+
+HBD_SAD_WXH_3D_LARGE_NEON(64, 32)
+HBD_SAD_WXH_3D_LARGE_NEON(64, 64)
+HBD_SAD_WXH_3D_LARGE_NEON(64, 128)
+
+HBD_SAD_WXH_3D_LARGE_NEON(128, 64)
+HBD_SAD_WXH_3D_LARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SAD_WXH_3D_SMALL_NEON(4, 16)
+
+HBD_SAD_WXH_3D_LARGE_NEON(8, 32)
+
+HBD_SAD_WXH_3D_LARGE_NEON(16, 4)
+HBD_SAD_WXH_3D_LARGE_NEON(16, 64)
+
+HBD_SAD_WXH_3D_LARGE_NEON(32, 8)
+
+HBD_SAD_WXH_3D_LARGE_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/third_party/aom/aom_dsp/arm/highbd_sse_neon.c b/third_party/aom/aom_dsp/arm/highbd_sse_neon.c
new file mode 100644
index 0000000000..184e9f9bef
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_sse_neon.c
@@ -0,0 +1,284 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/arm/sum_neon.h"
+
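+// The *_init_neon helper seeds the accumulators with vmull on the first row;
+// subsequent rows accumulate with vmlal, so no separate zeroing pass is needed.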
+static INLINE void highbd_sse_8x1_init_neon(const uint16_t *src,
+ const uint16_t *ref,
+ uint32x4_t *sse_acc0,
+ uint32x4_t *sse_acc1) {
+ uint16x8_t s = vld1q_u16(src);
+ uint16x8_t r = vld1q_u16(ref);
+
+ uint16x8_t abs_diff = vabdq_u16(s, r);
+ uint16x4_t abs_diff_lo = vget_low_u16(abs_diff);
+ uint16x4_t abs_diff_hi = vget_high_u16(abs_diff);
+
+ *sse_acc0 = vmull_u16(abs_diff_lo, abs_diff_lo);
+ *sse_acc1 = vmull_u16(abs_diff_hi, abs_diff_hi);
+}
+
+static INLINE void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref,
+ uint32x4_t *sse_acc0,
+ uint32x4_t *sse_acc1) {
+ uint16x8_t s = vld1q_u16(src);
+ uint16x8_t r = vld1q_u16(ref);
+
+ uint16x8_t abs_diff = vabdq_u16(s, r);
+ uint16x4_t abs_diff_lo = vget_low_u16(abs_diff);
+ uint16x4_t abs_diff_hi = vget_high_u16(abs_diff);
+
+ *sse_acc0 = vmlal_u16(*sse_acc0, abs_diff_lo, abs_diff_lo);
+ *sse_acc1 = vmlal_u16(*sse_acc1, abs_diff_hi, abs_diff_hi);
+}
+
+static INLINE int64_t highbd_sse_128xh_neon(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[16];
+ highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+ highbd_sse_8x1_init_neon(src + 4 * 8, ref + 4 * 8, &sse[8], &sse[9]);
+ highbd_sse_8x1_init_neon(src + 5 * 8, ref + 5 * 8, &sse[10], &sse[11]);
+ highbd_sse_8x1_init_neon(src + 6 * 8, ref + 6 * 8, &sse[12], &sse[13]);
+ highbd_sse_8x1_init_neon(src + 7 * 8, ref + 7 * 8, &sse[14], &sse[15]);
+ highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[6], &sse[7]);
+ highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[8], &sse[9]);
+ highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[10], &sse[11]);
+ highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[12], &sse[13]);
+ highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[14], &sse[15]);
+
+ src += src_stride;
+ ref += ref_stride;
+
+ while (--height != 0) {
+ highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+ highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[8], &sse[9]);
+ highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[10], &sse[11]);
+ highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[12], &sse[13]);
+ highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[14], &sse[15]);
+ highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[6], &sse[7]);
+ highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[8], &sse[9]);
+ highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[10], &sse[11]);
+ highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[12], &sse[13]);
+ highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[14], &sse[15]);
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+
+ return horizontal_long_add_u32x4_x16(sse);
+}
+
+static INLINE int64_t highbd_sse_64xh_neon(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[8];
+ highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+ highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[6], &sse[7]);
+
+ src += src_stride;
+ ref += ref_stride;
+
+ while (--height != 0) {
+ highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+ highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[6], &sse[7]);
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+
+ return horizontal_long_add_u32x4_x8(sse);
+}
+
+static INLINE int64_t highbd_sse_32xh_neon(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[8];
+ highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_init_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_init_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+
+ src += src_stride;
+ ref += ref_stride;
+
+ while (--height != 0) {
+ highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+ highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[4], &sse[5]);
+ highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[6], &sse[7]);
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+
+ return horizontal_long_add_u32x4_x8(sse);
+}
+
+static INLINE int64_t highbd_sse_16xh_neon(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[4];
+ highbd_sse_8x1_init_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_init_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+
+ src += src_stride;
+ ref += ref_stride;
+
+ while (--height != 0) {
+ highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0], &sse[1]);
+ highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[2], &sse[3]);
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+
+ return horizontal_long_add_u32x4_x4(sse);
+}
+
+static INLINE int64_t highbd_sse_8xh_neon(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[2];
+ highbd_sse_8x1_init_neon(src, ref, &sse[0], &sse[1]);
+
+ src += src_stride;
+ ref += ref_stride;
+
+ while (--height != 0) {
+ highbd_sse_8x1_neon(src, ref, &sse[0], &sse[1]);
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+
+ return horizontal_long_add_u32x4_x2(sse);
+}
+
+static INLINE int64_t highbd_sse_4xh_neon(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ // Peel the first loop iteration.
+ uint16x4_t s = vld1_u16(src);
+ uint16x4_t r = vld1_u16(ref);
+
+ uint16x4_t abs_diff = vabd_u16(s, r);
+ uint32x4_t sse = vmull_u16(abs_diff, abs_diff);
+
+ src += src_stride;
+ ref += ref_stride;
+
+ while (--height != 0) {
+ s = vld1_u16(src);
+ r = vld1_u16(ref);
+
+ abs_diff = vabd_u16(s, r);
+ sse = vmlal_u16(sse, abs_diff, abs_diff);
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+
+ return horizontal_long_add_u32x4(sse);
+}
+
+static INLINE int64_t highbd_sse_wxh_neon(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int width, int height) {
+ // { 0, 1, 2, 3, 4, 5, 6, 7 }
+ uint16x8_t k01234567 = vmovl_u8(vcreate_u8(0x0706050403020100));
+ uint16x8_t remainder_mask = vcltq_u16(k01234567, vdupq_n_u16(width & 7));
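+  // remainder_mask is all ones in the first (width % 8) lanes and zero
+  // elsewhere, so out-of-range lanes contribute nothing to the sum.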
+ uint64_t sse = 0;
+
+ do {
+ int w = width;
+ int offset = 0;
+
+ do {
+ uint16x8_t s = vld1q_u16(src + offset);
+ uint16x8_t r = vld1q_u16(ref + offset);
+
+ if (w < 8) {
+ // Mask out-of-range elements.
+ s = vandq_u16(s, remainder_mask);
+ r = vandq_u16(r, remainder_mask);
+ }
+
+ uint16x8_t abs_diff = vabdq_u16(s, r);
+ uint16x4_t abs_diff_lo = vget_low_u16(abs_diff);
+ uint16x4_t abs_diff_hi = vget_high_u16(abs_diff);
+
+ uint32x4_t sse_u32 = vmull_u16(abs_diff_lo, abs_diff_lo);
+ sse_u32 = vmlal_u16(sse_u32, abs_diff_hi, abs_diff_hi);
+
+ sse += horizontal_long_add_u32x4(sse_u32);
+
+ offset += 8;
+ w -= 8;
+ } while (w > 0);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--height != 0);
+
+ return sse;
+}
+
+int64_t aom_highbd_sse_neon(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride, int width,
+ int height) {
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+
+ switch (width) {
+ case 4:
+ return highbd_sse_4xh_neon(src, src_stride, ref, ref_stride, height);
+ case 8:
+ return highbd_sse_8xh_neon(src, src_stride, ref, ref_stride, height);
+ case 16:
+ return highbd_sse_16xh_neon(src, src_stride, ref, ref_stride, height);
+ case 32:
+ return highbd_sse_32xh_neon(src, src_stride, ref, ref_stride, height);
+ case 64:
+ return highbd_sse_64xh_neon(src, src_stride, ref, ref_stride, height);
+ case 128:
+ return highbd_sse_128xh_neon(src, src_stride, ref, ref_stride, height);
+ default:
+ return highbd_sse_wxh_neon(src, src_stride, ref, ref_stride, width,
+ height);
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/highbd_sse_sve.c b/third_party/aom/aom_dsp/arm/highbd_sse_sve.c
new file mode 100644
index 0000000000..b267da5cfb
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_sse_sve.c
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "config/aom_dsp_rtcd.h"
+
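+// Accumulate the sum of squared differences for 8 pixels directly into
+// 64-bit lanes using the SVE dot-product helper aom_udotq_u16 from
+// dot_sve.h.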
+static INLINE void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref,
+ uint64x2_t *sse) {
+ uint16x8_t s = vld1q_u16(src);
+ uint16x8_t r = vld1q_u16(ref);
+
+ uint16x8_t abs_diff = vabdq_u16(s, r);
+
+ *sse = aom_udotq_u16(*sse, abs_diff, abs_diff);
+}
+
+static INLINE int64_t highbd_sse_128xh_sve(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ uint64x2_t sse[4] = { vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0),
+ vdupq_n_u64(0) };
+
+ do {
+ highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0]);
+ highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[1]);
+ highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[2]);
+ highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[3]);
+ highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0]);
+ highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[1]);
+ highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[2]);
+ highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[3]);
+ highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0]);
+ highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[1]);
+ highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[2]);
+ highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[3]);
+ highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[0]);
+ highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[1]);
+ highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[2]);
+ highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[3]);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--height != 0);
+
+ sse[0] = vaddq_u64(sse[0], sse[1]);
+ sse[2] = vaddq_u64(sse[2], sse[3]);
+ sse[0] = vaddq_u64(sse[0], sse[2]);
+ return vaddvq_u64(sse[0]);
+}
+
+static INLINE int64_t highbd_sse_64xh_sve(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ uint64x2_t sse[4] = { vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0),
+ vdupq_n_u64(0) };
+
+ do {
+ highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0]);
+ highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[1]);
+ highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[2]);
+ highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[3]);
+ highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0]);
+ highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[1]);
+ highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[2]);
+ highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[3]);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--height != 0);
+
+ sse[0] = vaddq_u64(sse[0], sse[1]);
+ sse[2] = vaddq_u64(sse[2], sse[3]);
+ sse[0] = vaddq_u64(sse[0], sse[2]);
+ return vaddvq_u64(sse[0]);
+}
+
+static INLINE int64_t highbd_sse_32xh_sve(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ uint64x2_t sse[4] = { vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0),
+ vdupq_n_u64(0) };
+
+ do {
+ highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0]);
+ highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[1]);
+ highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[2]);
+ highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[3]);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--height != 0);
+
+ sse[0] = vaddq_u64(sse[0], sse[1]);
+ sse[2] = vaddq_u64(sse[2], sse[3]);
+ sse[0] = vaddq_u64(sse[0], sse[2]);
+ return vaddvq_u64(sse[0]);
+}
+
+static INLINE int64_t highbd_sse_16xh_sve(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ uint64x2_t sse[2] = { vdupq_n_u64(0), vdupq_n_u64(0) };
+
+ do {
+ highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0]);
+ highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[1]);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--height != 0);
+
+ return vaddvq_u64(vaddq_u64(sse[0], sse[1]));
+}
+
+static INLINE int64_t highbd_sse_8xh_sve(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ uint64x2_t sse[2] = { vdupq_n_u64(0), vdupq_n_u64(0) };
+
+ do {
+ highbd_sse_8x1_neon(src + 0 * src_stride, ref + 0 * ref_stride, &sse[0]);
+ highbd_sse_8x1_neon(src + 1 * src_stride, ref + 1 * ref_stride, &sse[1]);
+
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ height -= 2;
+ } while (height != 0);
+
+ return vaddvq_u64(vaddq_u64(sse[0], sse[1]));
+}
+
+static INLINE int64_t highbd_sse_4xh_sve(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int height) {
+ uint64x2_t sse = vdupq_n_u64(0);
+
+ do {
+ uint16x8_t s = load_unaligned_u16_4x2(src, src_stride);
+ uint16x8_t r = load_unaligned_u16_4x2(ref, ref_stride);
+
+ uint16x8_t abs_diff = vabdq_u16(s, r);
+ sse = aom_udotq_u16(sse, abs_diff, abs_diff);
+
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ height -= 2;
+ } while (height != 0);
+
+ return vaddvq_u64(sse);
+}
+
+static INLINE int64_t highbd_sse_wxh_sve(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int width, int height) {
+ svuint64_t sse = svdup_n_u64(0);
+ uint64_t step = svcnth();
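+ // svcnth() gives the number of 16-bit lanes per SVE vector, so each inner
+ // iteration consumes one vector of pixels. svwhilelt_b16_u32() builds a
+ // predicate that deactivates the tail lanes on the last iteration of a row,
+ // and svabd_u16_z() zeroes the inactive lanes so they add nothing to the
+ // dot product.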
+
+ do {
+ int w = 0;
+ const uint16_t *src_ptr = src;
+ const uint16_t *ref_ptr = ref;
+
+ do {
+ svbool_t pred = svwhilelt_b16_u32(w, width);
+ svuint16_t s = svld1_u16(pred, src_ptr);
+ svuint16_t r = svld1_u16(pred, ref_ptr);
+
+ svuint16_t abs_diff = svabd_u16_z(pred, s, r);
+
+ sse = svdot_u64(sse, abs_diff, abs_diff);
+
+ src_ptr += step;
+ ref_ptr += step;
+ w += step;
+ } while (w < width);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--height != 0);
+
+ return svaddv_u64(svptrue_b64(), sse);
+}
+
+int64_t aom_highbd_sse_sve(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride, int width,
+ int height) {
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+
+ switch (width) {
+ case 4: return highbd_sse_4xh_sve(src, src_stride, ref, ref_stride, height);
+ case 8: return highbd_sse_8xh_sve(src, src_stride, ref, ref_stride, height);
+ case 16:
+ return highbd_sse_16xh_sve(src, src_stride, ref, ref_stride, height);
+ case 32:
+ return highbd_sse_32xh_sve(src, src_stride, ref, ref_stride, height);
+ case 64:
+ return highbd_sse_64xh_sve(src, src_stride, ref, ref_stride, height);
+ case 128:
+ return highbd_sse_128xh_sve(src, src_stride, ref, ref_stride, height);
+ default:
+ return highbd_sse_wxh_sve(src, src_stride, ref, ref_stride, width,
+ height);
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/highbd_subpel_variance_neon.c b/third_party/aom/aom_dsp/arm/highbd_subpel_variance_neon.c
new file mode 100644
index 0000000000..686fa5f226
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_subpel_variance_neon.c
@@ -0,0 +1,1497 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/variance.h"
+
+// The bilinear filters look like this:
+//
+// {{ 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+// { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }}
+//
+// We can factor out the highest common factor, such that the sum of both
+// weights will be 8 instead of 128. The benefits of this are two-fold:
+//
+// 1) We can infer the filter values from the filter_offset parameter in the
+// bilinear filter functions below - we don't have to actually load the values
+// from memory:
+// f0 = 8 - filter_offset
+// f1 = filter_offset
+//
+// 2) Scaling the pixel values by 8, instead of 128, enables us to operate on
+// 16-bit data types at all times, rather than widening out to 32-bit and
+// requiring double the number of data processing instructions. (12-bit * 8 =
+// 15-bit.)
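+//
+// For example, with filter_offset == 3 the scaled weights are f0 = 5 and
+// f1 = 3, so each output pixel is (5 * s0 + 3 * s1 + 4) >> 3, which gives
+// the same result as the original filter's (80 * s0 + 48 * s1 + 64) >> 7.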
+
+// Process a block exactly 4 wide and any height.
+static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
+ const uint16x4_t f1 = vdup_n_u16(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint16x4_t s0 = load_unaligned_u16_4x1(src_ptr);
+ uint16x4_t s1 = load_unaligned_u16_4x1(src_ptr + pixel_step);
+
+ uint16x4_t blend = vmul_u16(s0, f0);
+ blend = vmla_u16(blend, s1, f1);
+ blend = vrshr_n_u16(blend, 3);
+
+ vst1_u16(dst_ptr, blend);
+
+ src_ptr += src_stride;
+ dst_ptr += 4;
+ } while (--i != 0);
+}
+
+// Process a block which is a multiple of 8 and any height.
+static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr,
+ uint16_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_width, int dst_height,
+ int filter_offset) {
+ const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
+ const uint16x8_t f1 = vdupq_n_u16(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+
+ uint16x8_t blend = vmulq_u16(s0, f0);
+ blend = vmlaq_u16(blend, s1, f1);
+ blend = vrshrq_n_u16(blend, 3);
+
+ vst1q_u16(dst_ptr + j, blend);
+
+ j += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+static void highbd_var_filter_block2d_bil_w8(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 8, dst_height, filter_offset);
+}
+
+static void highbd_var_filter_block2d_bil_w16(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 16, dst_height, filter_offset);
+}
+
+static void highbd_var_filter_block2d_bil_w32(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 32, dst_height, filter_offset);
+}
+
+static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 64, dst_height, filter_offset);
+}
+
+static void highbd_var_filter_block2d_bil_w128(const uint16_t *src_ptr,
+ uint16_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height,
+ int filter_offset) {
+ highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
+ 128, dst_height, filter_offset);
+}
+
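+// Half-pel filtering (offset == 4) uses equal weights, so the bilinear blend
+// reduces to a rounding average of two source pixels. pixel_step selects the
+// direction: 1 averages horizontal neighbors, src_stride averages vertical
+// neighbors.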
+static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr,
+ uint16_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_width,
+ int dst_height) {
+ int i = dst_height;
+
+ // We only specialize on the filter values for large block sizes (>= 16x16.)
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+ uint16x8_t avg = vrhaddq_u16(s0, s1);
+ vst1q_u16(dst_ptr + j, avg);
+
+ j += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
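+// Generic two-pass subpel variance: the first pass filters (h + 1) rows
+// horizontally (pixel_step 1) into tmp0, the second pass filters tmp0
+// vertically (pixel_step w) into tmp1, and the result is fed to the
+// bitdepth-specific variance kernel. For example,
+// HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8) defines
+// aom_highbd_10_sub_pixel_variance8x8_neon(), which filters an 8x9 block
+// into an 8x8 block and then calls aom_highbd_10_variance8x8().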
+#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int aom_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * h]; \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
+ xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ \
+ return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \
+ w, ref, ref_stride, sse); \
+ }
+
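+// Specialized variant for blocks of width >= 16: an offset of 0 needs no
+// filtering in that direction and an offset of 4 (half-pel) takes the cheaper
+// rounding-average path, so only the remaining offsets pay for the full
+// bilinear multiply-accumulate.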
+#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int aom_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, unsigned int *sse) { \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ \
+ if (xoffset == 0) { \
+ if (yoffset == 0) { \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp[w * h]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \
+ h); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride, \
+ src_stride, h, yoffset); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
+ (h + 1)); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
+ (h + 1)); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \
+ xoffset); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
+
+// 8-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 10-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 12-bit
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)
+
+HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// Combine bilinear filter with aom_highbd_comp_avg_pred for blocks having
+// width 4.
+static void highbd_avg_pred_var_filter_block2d_bil_w4(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
+ const uint16x4_t f1 = vdup_n_u16(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint16x4_t s0 = load_unaligned_u16_4x1(src_ptr);
+ uint16x4_t s1 = load_unaligned_u16_4x1(src_ptr + pixel_step);
+ uint16x4_t p = vld1_u16(second_pred);
+
+ uint16x4_t blend = vmul_u16(s0, f0);
+ blend = vmla_u16(blend, s1, f1);
+ blend = vrshr_n_u16(blend, 3);
+
+ vst1_u16(dst_ptr, vrhadd_u16(blend, p));
+
+ src_ptr += src_stride;
+ dst_ptr += 4;
+ second_pred += 4;
+ } while (--i != 0);
+}
+
+// Combine bilinear filter with aom_highbd_comp_avg_pred for large blocks.
+static void highbd_avg_pred_var_filter_block2d_bil_large(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, int filter_offset,
+ const uint16_t *second_pred) {
+ const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
+ const uint16x8_t f1 = vdupq_n_u16(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+ uint16x8_t p = vld1q_u16(second_pred);
+
+ uint16x8_t blend = vmulq_u16(s0, f0);
+ blend = vmlaq_u16(blend, s1, f1);
+ blend = vrshrq_n_u16(blend, 3);
+
+ vst1q_u16(dst_ptr + j, vrhaddq_u16(blend, p));
+
+ j += 8;
+ second_pred += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w8(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 8, dst_height,
+ filter_offset, second_pred);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w16(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 16, dst_height,
+ filter_offset, second_pred);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w32(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 32, dst_height,
+ filter_offset, second_pred);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w64(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 64, dst_height,
+ filter_offset, second_pred);
+}
+
+static void highbd_avg_pred_var_filter_block2d_bil_w128(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred) {
+ highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 128, dst_height,
+ filter_offset, second_pred);
+}
+
+// Combine averaging subpel filter with aom_highbd_comp_avg_pred.
+static void highbd_avg_pred_var_filter_block2d_avg(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, const uint16_t *second_pred) {
+ int i = dst_height;
+
+ // We only specialize on the filter values for large block sizes (>= 16x16.)
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+ uint16x8_t avg = vrhaddq_u16(s0, s1);
+
+ uint16x8_t p = vld1q_u16(second_pred);
+ avg = vrhaddq_u16(avg, p);
+
+ vst1q_u16(dst_ptr + j, avg);
+
+ j += 8;
+ second_pred += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+// Implementation of aom_highbd_comp_avg_pred for blocks having width >= 16.
+static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
+ int src_stride, int dst_width, int dst_height,
+ const uint16_t *second_pred) {
+ int i = dst_height;
+
+ // We only specialize on the filter values for large block sizes (>= 16x16.)
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src_ptr + j);
+ uint16x8_t p = vld1q_u16(second_pred);
+
+ uint16x8_t avg = vrhaddq_u16(s, p);
+
+ vst1q_u16(dst_ptr + j, avg);
+
+ j += 8;
+ second_pred += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
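+// As above, but the second (vertical) pass also averages the filtered block
+// with second_pred, so the compound prediction needs no extra pass over the
+// intermediate buffer.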
+#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ uint32_t aom_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * h]; \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
+ xoffset); \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
+ \
+ return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \
+ w, ref, ref_stride, sse); \
+ }
+
+#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int aom_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ \
+ if (xoffset == 0) { \
+ uint16_t tmp[w * h]; \
+ if (yoffset == 0) { \
+ highbd_avg_pred(src_ptr, tmp, source_stride, w, h, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ highbd_avg_pred_var_filter_block2d_avg( \
+ src_ptr, tmp, source_stride, source_stride, w, h, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } else { \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ src_ptr, tmp, source_stride, source_stride, h, yoffset, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_avg_pred_var_filter_block2d_avg( \
+ src_ptr, tmp0, source_stride, 1, w, h, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \
+ (h + 1)); \
+ highbd_avg_pred_var_filter_block2d_avg( \
+ tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w, \
+ (h + 1)); \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ src_ptr, tmp0, source_stride, 1, h, xoffset, \
+ CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_avg_pred_var_filter_block2d_avg( \
+ tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
+
+// 8-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 16)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 4)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 10-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 16)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 4)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 12-bit
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 128)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 64)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 16)
+
+HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 32)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 4)
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 64)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 8)
+
+HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
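+// Masked subpel variance: the block is bilinearly filtered as above, blended
+// with second_pred under the mask by aom_highbd_comp_mask_pred_neon(), and
+// the variance of the blended result against ref is returned.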
+#define HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int \
+ aom_highbd_##bitdepth##_masked_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * (h + 1)]; \
+ uint16_t tmp2[w * h]; \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
+ xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, w, \
+ h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp2), \
+ w, ref, ref_stride, sse); \
+ }
+
+#define HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int \
+ aom_highbd_##bitdepth##_masked_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \
+ if (xoffset == 0) { \
+ uint16_t tmp0[w * h]; \
+ if (yoffset == 0) { \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp0), second_pred, \
+ w, h, src, src_stride, msk, msk_stride, \
+ invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, src_stride, \
+ w, h); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, \
+ src_stride, h, yoffset); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ uint16_t tmp2[w * h]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
+ (h + 1)); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ uint16_t tmp2[w * h]; \
+ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \
+ (h + 1)); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ if (yoffset == 0) { \
+ uint16_t tmp0[w * h]; \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \
+ xoffset); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * h]; \
+ uint16_t tmp2[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \
+ } else { \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * (h + 1)]; \
+ uint16_t tmp2[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \
+ (h + 1), xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \
+ w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
+ msk_stride, invert_mask); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
+
+// 8-bit
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 10-bit
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 12-bit
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)
+
+HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)
+
+HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#if !CONFIG_REALTIME_ONLY
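+// OBMC subpel variance: the prediction block is bilinearly filtered as above
+// and then passed to the OBMC variance kernel, which compares it against the
+// weighted source (wsrc) under the overlap mask.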
+#define HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int \
+ aom_highbd_##bitdepth##_obmc_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre); \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, h + 1, \
+ xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \
+ }
+
+#define SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int \
+ aom_highbd_##bitdepth##_obmc_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre); \
+ if (xoffset == 0) { \
+ if (yoffset == 0) { \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ pre, pre_stride, wsrc, mask, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp[w * h]; \
+ highbd_var_filter_block2d_avg(pre_ptr, tmp, pre_stride, pre_stride, w, \
+ h); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp), w, wsrc, mask, sse); \
+ } else { \
+ uint16_t tmp[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp, pre_stride, \
+ pre_stride, h, yoffset); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp), w, wsrc, mask, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp0), w, wsrc, mask, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h + 1); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \
+ } else { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h + 1); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \
+ } \
+ } else { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, h, \
+ xoffset); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp0), w, wsrc, mask, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, \
+ h + 1, xoffset); \
+ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, \
+ h + 1, xoffset); \
+ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon( \
+ CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse); \
+ } \
+ } \
+ }
+
+// 8-bit
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)
+
+// 10-bit
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)
+
+// 12-bit
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)
+
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
+HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)
+
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
+SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)
+#endif // !CONFIG_REALTIME_ONLY
+
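+// Distance-weighted compound averaging: rather than an equal-weight average,
+// dist_wtd_avg_u16x8() combines the two predictions using the fwd_offset and
+// bck_offset weights supplied in jcp_param.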
+static void highbd_dist_wtd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
+ int src_stride, int dst_width,
+ int dst_height,
+ const uint16_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ // We only specialize on the filter values for large block sizes (>= 16x16.)
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+ const uint16x8_t fwd_offset = vdupq_n_u16(jcp_param->fwd_offset);
+ const uint16x8_t bck_offset = vdupq_n_u16(jcp_param->bck_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src_ptr + j);
+ uint16x8_t p = vld1q_u16(second_pred);
+
+ uint16x8_t avg = dist_wtd_avg_u16x8(s, p, fwd_offset, bck_offset);
+
+ vst1q_u16(dst_ptr + j, avg);
+
+ second_pred += 8;
+ j += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_avg(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, const uint16_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ // We only specialize on the filter values for large block sizes (>= 16x16.)
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+ const uint16x8_t fwd_offset = vdupq_n_u16(jcp_param->fwd_offset);
+ const uint16x8_t bck_offset = vdupq_n_u16(jcp_param->bck_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+ uint16x8_t p = vld1q_u16(second_pred);
+ uint16x8_t avg = vrhaddq_u16(s0, s1);
+ avg = dist_wtd_avg_u16x8(avg, p, fwd_offset, bck_offset);
+
+ vst1q_u16(dst_ptr + j, avg);
+
+ second_pred += 8;
+ j += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w4(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint16x4_t fwd_offset = vdup_n_u16(jcp_param->fwd_offset);
+ const uint16x4_t bck_offset = vdup_n_u16(jcp_param->bck_offset);
+ const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
+ const uint16x4_t f1 = vdup_n_u16(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint16x4_t s0 = load_unaligned_u16_4x1(src_ptr);
+ uint16x4_t s1 = load_unaligned_u16_4x1(src_ptr + pixel_step);
+ uint16x4_t p = vld1_u16(second_pred);
+
+ uint16x4_t blend = vmul_u16(s0, f0);
+ blend = vmla_u16(blend, s1, f1);
+ blend = vrshr_n_u16(blend, 3);
+
+ uint16x4_t avg = dist_wtd_avg_u16x4(blend, p, fwd_offset, bck_offset);
+
+ vst1_u16(dst_ptr, avg);
+
+ src_ptr += src_stride;
+ dst_ptr += 4;
+ second_pred += 4;
+ } while (--i != 0);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for large blocks.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, int filter_offset,
+ const uint16_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint16x8_t fwd_offset = vdupq_n_u16(jcp_param->fwd_offset);
+ const uint16x8_t bck_offset = vdupq_n_u16(jcp_param->bck_offset);
+ const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
+ const uint16x8_t f1 = vdupq_n_u16(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
+ uint16x8_t p = vld1q_u16(second_pred);
+
+ uint16x8_t blend = vmulq_u16(s0, f0);
+ blend = vmlaq_u16(blend, s1, f1);
+ blend = vrshrq_n_u16(blend, 3);
+
+ uint16x8_t avg = dist_wtd_avg_u16x8(blend, p, fwd_offset, bck_offset);
+
+ vst1q_u16(dst_ptr + j, avg);
+
+ second_pred += 8;
+ j += 8;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w8(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 8, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 16.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w16(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 16, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 32.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w32(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 32, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 64.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w64(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 64, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 128.
+static void highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w128(
+ const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint16_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 128, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+#define HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int \
+ aom_highbd_##bitdepth##_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *second = CONVERT_TO_SHORTPTR(second_pred); \
+ uint16_t tmp0[w * (h + 1)]; \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h + 1, \
+ xoffset); \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \
+ }
+
+#define SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \
+ unsigned int \
+ aom_highbd_##bitdepth##_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *second = CONVERT_TO_SHORTPTR(second_pred); \
+ if (xoffset == 0) { \
+ uint16_t tmp[w * h]; \
+ if (yoffset == 0) { \
+ highbd_dist_wtd_avg_pred(src, tmp, source_stride, w, h, second, \
+ jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref_ptr, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_avg( \
+ src, tmp, source_stride, source_stride, w, h, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref_ptr, ref_stride, sse); \
+ } else { \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ src, tmp, source_stride, source_stride, h, yoffset, second, \
+ jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp), w, ref_ptr, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_avg( \
+ src, tmp0, source_stride, 1, w, h, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref_ptr, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h + 1); \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, \
+ h, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * (h + 1)]; \
+ highbd_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h + 1); \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \
+ } \
+ } else { \
+ uint16_t tmp0[w * (h + 1)]; \
+ if (yoffset == 0) { \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ src, tmp0, source_stride, 1, h, xoffset, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp0), w, ref_ptr, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h + 1, \
+ xoffset); \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, \
+ h, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \
+ } else { \
+ uint16_t tmp1[w * h]; \
+ highbd_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h + 1, \
+ xoffset); \
+ highbd_dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, second, jcp_param); \
+ return aom_highbd_##bitdepth##_variance##w##x##h( \
+ CONVERT_TO_BYTEPTR(tmp1), w, ref_ptr, ref_stride, sse); \
+ } \
+ } \
+ }
+
+// 8-bit
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 128)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 16)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 4)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 8)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 10-bit
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 128)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 16)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 4)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 8)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+// 12-bit
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8)
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 128)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 64)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 16)
+
+HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 32)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 4)
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 64)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 8)
+
+SPECIALIZED_HBD_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 16)
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/third_party/aom/aom_dsp/arm/highbd_variance_neon.c b/third_party/aom/aom_dsp/arm/highbd_variance_neon.c
new file mode 100644
index 0000000000..18b8efff4c
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_variance_neon.c
@@ -0,0 +1,502 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/variance.h"
+
+// Process a block of width 4 two rows at a time.
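+// The 16-bit sum and 32-bit sse accumulators below have sufficient headroom
+// even for 12-bit input: the 4xh blocks instantiated in this file have
+// h <= 16, so each lane gathers at most 8 differences (|sum| <= 8 * 4095 <
+// 2^15) and at most 16 squared differences (16 * 4095^2 < 2^31).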
+static INLINE void highbd_variance_4xh_neon(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int h,
+ uint64_t *sse, int64_t *sum) {
+ int16x8_t sum_s16 = vdupq_n_s16(0);
+ int32x4_t sse_s32 = vdupq_n_s32(0);
+
+ int i = h;
+ do {
+ const uint16x8_t s = load_unaligned_u16_4x2(src_ptr, src_stride);
+ const uint16x8_t r = load_unaligned_u16_4x2(ref_ptr, ref_stride);
+
+ int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r));
+ sum_s16 = vaddq_s16(sum_s16, diff);
+
+ sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff), vget_low_s16(diff));
+ sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff), vget_high_s16(diff));
+
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ *sum = horizontal_add_s16x8(sum_s16);
+ *sse = horizontal_add_s32x4(sse_s32);
+}
+
+// For 8-bit and 10-bit data, since we're using two int32x4 accumulators, all
+// block sizes can be processed in 32-bit elements (1023*1023*128*32 =
+// 4286582784 for a 128x128 block).
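+// (Per 32-bit lane the worst case for 10-bit input is 128 * 128 / 8 = 2048
+// squared differences of at most 1023 * 1023, i.e. 2143291392, which also
+// stays within a signed 32-bit lane before the final unsigned reduction.)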
+static INLINE void highbd_variance_large_neon(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int w, int h,
+ uint64_t *sse, int64_t *sum) {
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+ int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ const uint16x8_t s = vld1q_u16(src_ptr + j);
+ const uint16x8_t r = vld1q_u16(ref_ptr + j);
+
+ const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r));
+ sum_s32 = vpadalq_s16(sum_s32, diff);
+
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff));
+
+ j += 8;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ *sum = horizontal_add_s32x4(sum_s32);
+ *sse = horizontal_long_add_u32x4(vaddq_u32(
+ vreinterpretq_u32_s32(sse_s32[0]), vreinterpretq_u32_s32(sse_s32[1])));
+}
+
+static INLINE void highbd_variance_8xh_neon(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int h, uint64_t *sse,
+ int64_t *sum) {
+ highbd_variance_large_neon(src, src_stride, ref, ref_stride, 8, h, sse, sum);
+}
+
+static INLINE void highbd_variance_16xh_neon(const uint16_t *src,
+ int src_stride,
+ const uint16_t *ref,
+ int ref_stride, int h,
+ uint64_t *sse, int64_t *sum) {
+ highbd_variance_large_neon(src, src_stride, ref, ref_stride, 16, h, sse, sum);
+}
+
+static INLINE void highbd_variance_32xh_neon(const uint16_t *src,
+ int src_stride,
+ const uint16_t *ref,
+ int ref_stride, int h,
+ uint64_t *sse, int64_t *sum) {
+ highbd_variance_large_neon(src, src_stride, ref, ref_stride, 32, h, sse, sum);
+}
+
+static INLINE void highbd_variance_64xh_neon(const uint16_t *src,
+ int src_stride,
+ const uint16_t *ref,
+ int ref_stride, int h,
+ uint64_t *sse, int64_t *sum) {
+ highbd_variance_large_neon(src, src_stride, ref, ref_stride, 64, h, sse, sum);
+}
+
+static INLINE void highbd_variance_128xh_neon(const uint16_t *src,
+ int src_stride,
+ const uint16_t *ref,
+ int ref_stride, int h,
+ uint64_t *sse, int64_t *sum) {
+ highbd_variance_large_neon(src, src_stride, ref, ref_stride, 128, h, sse,
+ sum);
+}
+
+// For 12-bit data, we can only accumulate up to 128 elements in the sum of
+// squares (4095*4095*128 = 2146435200), and because we're using two int32x4
+// accumulators, we can only process up to 32 32-element rows (32*32/8 = 128)
+// or 16 64-element rows before we have to accumulate into 64-bit elements.
+// Therefore blocks of size 32x64, 64x32, 64x64, 64x128, 128x64, 128x128 are
+// processed in a different helper function.
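+// For 128-element rows the same bound allows 8 rows (8*128/8 = 128), hence
+// the h_limit values of 32, 16 and 8 passed by the wrappers below.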
+
+// Process a block of any size where the width is divisible by 8, with
+// accumulation into 64-bit elements.
+static INLINE void highbd_variance_xlarge_neon(
+ const uint16_t *src_ptr, int src_stride, const uint16_t *ref_ptr,
+ int ref_stride, int w, int h, int h_limit, uint64_t *sse, int64_t *sum) {
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+ int64x2_t sse_s64 = vdupq_n_s64(0);
+
+ // 'h_limit' is the number of 'w'-width rows we can process before our 32-bit
+ // accumulator overflows. After hitting this limit we accumulate into 64-bit
+ // elements.
+ int h_tmp = h > h_limit ? h_limit : h;
+
+ int i = 0;
+ do {
+ int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ do {
+ int j = 0;
+ do {
+ const uint16x8_t s0 = vld1q_u16(src_ptr + j);
+ const uint16x8_t r0 = vld1q_u16(ref_ptr + j);
+
+ const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s0, r0));
+ sum_s32 = vpadalq_s16(sum_s32, diff);
+
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff));
+
+ j += 8;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ i++;
+ } while (i < h_tmp);
+
+ sse_s64 = vpadalq_s32(sse_s64, sse_s32[0]);
+ sse_s64 = vpadalq_s32(sse_s64, sse_s32[1]);
+ h_tmp += h_limit;
+ } while (i < h);
+
+ *sum = horizontal_add_s32x4(sum_s32);
+ *sse = (uint64_t)horizontal_add_s64x2(sse_s64);
+}
+
+static INLINE void highbd_variance_32xh_xlarge_neon(
+ const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride,
+ int h, uint64_t *sse, int64_t *sum) {
+ highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 32, h, 32, sse,
+ sum);
+}
+
+static INLINE void highbd_variance_64xh_xlarge_neon(
+ const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride,
+ int h, uint64_t *sse, int64_t *sum) {
+ highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 64, h, 16, sse,
+ sum);
+}
+
+static INLINE void highbd_variance_128xh_xlarge_neon(
+ const uint16_t *src, int src_stride, const uint16_t *ref, int ref_stride,
+ int h, uint64_t *sse, int64_t *sum) {
+ highbd_variance_xlarge_neon(src, src_stride, ref, ref_stride, 128, h, 8, sse,
+ sum);
+}
+
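+// The 10-bit and 12-bit wrappers below scale sse and sum back to the 8-bit
+// domain with rounding right shifts (by 4/2 and 8/4 bits respectively) before
+// computing variance = sse - sum^2 / (w * h), clamping a negative rounded
+// result to zero.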
+#define HBD_VARIANCE_WXH_8_NEON(w, h) \
+ uint32_t aom_highbd_8_variance##w##x##h##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)sse_long; \
+ sum = (int)sum_long; \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (w * h)); \
+ }
+
+#define HBD_VARIANCE_WXH_10_NEON(w, h) \
+ uint32_t aom_highbd_10_variance##w##x##h##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \
+ sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+#define HBD_VARIANCE_WXH_12_NEON(w, h) \
+ uint32_t aom_highbd_12_variance##w##x##h##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \
+ sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+#define HBD_VARIANCE_WXH_12_XLARGE_NEON(w, h) \
+ uint32_t aom_highbd_12_variance##w##x##h##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##w##xh_xlarge_neon(src, src_stride, ref, ref_stride, h, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \
+ sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+// 8-bit
+HBD_VARIANCE_WXH_8_NEON(4, 4)
+HBD_VARIANCE_WXH_8_NEON(4, 8)
+
+HBD_VARIANCE_WXH_8_NEON(8, 4)
+HBD_VARIANCE_WXH_8_NEON(8, 8)
+HBD_VARIANCE_WXH_8_NEON(8, 16)
+
+HBD_VARIANCE_WXH_8_NEON(16, 8)
+HBD_VARIANCE_WXH_8_NEON(16, 16)
+HBD_VARIANCE_WXH_8_NEON(16, 32)
+
+HBD_VARIANCE_WXH_8_NEON(32, 16)
+HBD_VARIANCE_WXH_8_NEON(32, 32)
+HBD_VARIANCE_WXH_8_NEON(32, 64)
+
+HBD_VARIANCE_WXH_8_NEON(64, 32)
+HBD_VARIANCE_WXH_8_NEON(64, 64)
+HBD_VARIANCE_WXH_8_NEON(64, 128)
+
+HBD_VARIANCE_WXH_8_NEON(128, 64)
+HBD_VARIANCE_WXH_8_NEON(128, 128)
+
+// 10-bit
+HBD_VARIANCE_WXH_10_NEON(4, 4)
+HBD_VARIANCE_WXH_10_NEON(4, 8)
+
+HBD_VARIANCE_WXH_10_NEON(8, 4)
+HBD_VARIANCE_WXH_10_NEON(8, 8)
+HBD_VARIANCE_WXH_10_NEON(8, 16)
+
+HBD_VARIANCE_WXH_10_NEON(16, 8)
+HBD_VARIANCE_WXH_10_NEON(16, 16)
+HBD_VARIANCE_WXH_10_NEON(16, 32)
+
+HBD_VARIANCE_WXH_10_NEON(32, 16)
+HBD_VARIANCE_WXH_10_NEON(32, 32)
+HBD_VARIANCE_WXH_10_NEON(32, 64)
+
+HBD_VARIANCE_WXH_10_NEON(64, 32)
+HBD_VARIANCE_WXH_10_NEON(64, 64)
+HBD_VARIANCE_WXH_10_NEON(64, 128)
+
+HBD_VARIANCE_WXH_10_NEON(128, 64)
+HBD_VARIANCE_WXH_10_NEON(128, 128)
+
+// 12-bit
+HBD_VARIANCE_WXH_12_NEON(4, 4)
+HBD_VARIANCE_WXH_12_NEON(4, 8)
+
+HBD_VARIANCE_WXH_12_NEON(8, 4)
+HBD_VARIANCE_WXH_12_NEON(8, 8)
+HBD_VARIANCE_WXH_12_NEON(8, 16)
+
+HBD_VARIANCE_WXH_12_NEON(16, 8)
+HBD_VARIANCE_WXH_12_NEON(16, 16)
+HBD_VARIANCE_WXH_12_NEON(16, 32)
+
+HBD_VARIANCE_WXH_12_NEON(32, 16)
+HBD_VARIANCE_WXH_12_NEON(32, 32)
+HBD_VARIANCE_WXH_12_XLARGE_NEON(32, 64)
+
+HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 32)
+HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 64)
+HBD_VARIANCE_WXH_12_XLARGE_NEON(64, 128)
+
+HBD_VARIANCE_WXH_12_XLARGE_NEON(128, 64)
+HBD_VARIANCE_WXH_12_XLARGE_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+// 8-bit
+HBD_VARIANCE_WXH_8_NEON(4, 16)
+
+HBD_VARIANCE_WXH_8_NEON(8, 32)
+
+HBD_VARIANCE_WXH_8_NEON(16, 4)
+HBD_VARIANCE_WXH_8_NEON(16, 64)
+
+HBD_VARIANCE_WXH_8_NEON(32, 8)
+
+HBD_VARIANCE_WXH_8_NEON(64, 16)
+
+// 10-bit
+HBD_VARIANCE_WXH_10_NEON(4, 16)
+
+HBD_VARIANCE_WXH_10_NEON(8, 32)
+
+HBD_VARIANCE_WXH_10_NEON(16, 4)
+HBD_VARIANCE_WXH_10_NEON(16, 64)
+
+HBD_VARIANCE_WXH_10_NEON(32, 8)
+
+HBD_VARIANCE_WXH_10_NEON(64, 16)
+
+// 12-bit
+HBD_VARIANCE_WXH_12_NEON(4, 16)
+
+HBD_VARIANCE_WXH_12_NEON(8, 32)
+
+HBD_VARIANCE_WXH_12_NEON(16, 4)
+HBD_VARIANCE_WXH_12_NEON(16, 64)
+
+HBD_VARIANCE_WXH_12_NEON(32, 8)
+
+HBD_VARIANCE_WXH_12_NEON(64, 16)
+
+#endif // !CONFIG_REALTIME_ONLY
+
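+// Only 8x8, 8x16, 16x8 and 16x16 blocks are instantiated from this helper, so
+// for 12-bit input each 32-bit lane accumulates at most 16 * 16 / 8 = 32
+// squared differences of at most 4095 * 4095, well within a uint32 lane.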
+static INLINE uint32_t highbd_mse_wxh_neon(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int w, int h,
+ unsigned int *sse) {
+ uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src_ptr + j);
+ uint16x8_t r = vld1q_u16(ref_ptr + j);
+
+ uint16x8_t diff = vabdq_u16(s, r);
+
+ sse_u32[0] =
+ vmlal_u16(sse_u32[0], vget_low_u16(diff), vget_low_u16(diff));
+ sse_u32[1] =
+ vmlal_u16(sse_u32[1], vget_high_u16(diff), vget_high_u16(diff));
+
+ j += 8;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ *sse = horizontal_add_u32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+ return *sse;
+}
+
+#define HIGHBD_MSE_WXH_NEON(w, h) \
+ uint32_t aom_highbd_8_mse##w##x##h##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \
+ return *sse; \
+ } \
+ \
+ uint32_t aom_highbd_10_mse##w##x##h##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \
+ *sse = ROUND_POWER_OF_TWO(*sse, 4); \
+ return *sse; \
+ } \
+ \
+ uint32_t aom_highbd_12_mse##w##x##h##_neon( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \
+ *sse = ROUND_POWER_OF_TWO(*sse, 8); \
+ return *sse; \
+ }
+
+HIGHBD_MSE_WXH_NEON(16, 16)
+HIGHBD_MSE_WXH_NEON(16, 8)
+HIGHBD_MSE_WXH_NEON(8, 16)
+HIGHBD_MSE_WXH_NEON(8, 8)
+
+#undef HIGHBD_MSE_WXH_NEON
+
+static INLINE uint64x2_t mse_accumulate_u16_8x2(uint64x2_t sum, uint16x8_t s0,
+ uint16x8_t s1, uint16x8_t d0,
+ uint16x8_t d1) {
+ uint16x8_t e0 = vabdq_u16(s0, d0);
+ uint16x8_t e1 = vabdq_u16(s1, d1);
+
+ uint32x4_t mse = vmull_u16(vget_low_u16(e0), vget_low_u16(e0));
+ mse = vmlal_u16(mse, vget_high_u16(e0), vget_high_u16(e0));
+ mse = vmlal_u16(mse, vget_low_u16(e1), vget_low_u16(e1));
+ mse = vmlal_u16(mse, vget_high_u16(e1), vget_high_u16(e1));
+
+ return vpadalq_u32(sum, mse);
+}
+
+uint64_t aom_mse_wxh_16bit_highbd_neon(uint16_t *dst, int dstride,
+ uint16_t *src, int sstride, int w,
+ int h) {
+ assert((w == 8 || w == 4) && (h == 8 || h == 4));
+
+ uint64x2_t sum = vdupq_n_u64(0);
+
+ if (w == 8) {
+ do {
+ uint16x8_t d0 = vld1q_u16(dst + 0 * dstride);
+ uint16x8_t d1 = vld1q_u16(dst + 1 * dstride);
+ uint16x8_t s0 = vld1q_u16(src + 0 * sstride);
+ uint16x8_t s1 = vld1q_u16(src + 1 * sstride);
+
+ sum = mse_accumulate_u16_8x2(sum, s0, s1, d0, d1);
+
+ dst += 2 * dstride;
+ src += 2 * sstride;
+ h -= 2;
+ } while (h != 0);
+ } else { // w == 4
+ do {
+ uint16x8_t d0 = load_unaligned_u16_4x2(dst + 0 * dstride, dstride);
+ uint16x8_t d1 = load_unaligned_u16_4x2(dst + 2 * dstride, dstride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src + 0 * sstride, sstride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src + 2 * sstride, sstride);
+
+ sum = mse_accumulate_u16_8x2(sum, s0, s1, d0, d1);
+
+ dst += 4 * dstride;
+ src += 4 * sstride;
+ h -= 4;
+ } while (h != 0);
+ }
+
+ return horizontal_add_u64x2(sum);
+}
diff --git a/third_party/aom/aom_dsp/arm/highbd_variance_neon_dotprod.c b/third_party/aom/aom_dsp/arm/highbd_variance_neon_dotprod.c
new file mode 100644
index 0000000000..d56ae97571
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_variance_neon_dotprod.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/arm/sum_neon.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
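+// These dot-product helpers are only instantiated for aom_highbd_8_mse*, i.e.
+// 8-bit content stored in 16-bit buffers, so the vmovn_u16 narrowing below is
+// lossless.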
+static INLINE uint32_t highbd_mse8_8xh_neon_dotprod(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int h,
+ unsigned int *sse) {
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = h / 2;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr);
+ src_ptr += src_stride;
+ uint16x8_t s1 = vld1q_u16(src_ptr);
+ src_ptr += src_stride;
+ uint16x8_t r0 = vld1q_u16(ref_ptr);
+ ref_ptr += ref_stride;
+ uint16x8_t r1 = vld1q_u16(ref_ptr);
+ ref_ptr += ref_stride;
+
+ uint8x16_t s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
+ uint8x16_t r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
+
+ uint8x16_t diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, diff, diff);
+ } while (--i != 0);
+
+ *sse = horizontal_add_u32x4(sse_u32);
+ return *sse;
+}
+
+static INLINE uint32_t highbd_mse8_16xh_neon_dotprod(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int h,
+ unsigned int *sse) {
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr);
+ uint16x8_t s1 = vld1q_u16(src_ptr + 8);
+ uint16x8_t r0 = vld1q_u16(ref_ptr);
+ uint16x8_t r1 = vld1q_u16(ref_ptr + 8);
+
+ uint8x16_t s = vcombine_u8(vmovn_u16(s0), vmovn_u16(s1));
+ uint8x16_t r = vcombine_u8(vmovn_u16(r0), vmovn_u16(r1));
+
+ uint8x16_t diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, diff, diff);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ *sse = horizontal_add_u32x4(sse_u32);
+ return *sse;
+}
+
+#define HIGHBD_MSE_WXH_NEON_DOTPROD(w, h) \
+ uint32_t aom_highbd_8_mse##w##x##h##_neon_dotprod( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_mse8_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h, \
+ sse); \
+ return *sse; \
+ }
+
+HIGHBD_MSE_WXH_NEON_DOTPROD(16, 16)
+HIGHBD_MSE_WXH_NEON_DOTPROD(16, 8)
+HIGHBD_MSE_WXH_NEON_DOTPROD(8, 16)
+HIGHBD_MSE_WXH_NEON_DOTPROD(8, 8)
+
+#undef HIGHBD_MSE_WXH_NEON_DOTPROD
diff --git a/third_party/aom/aom_dsp/arm/highbd_variance_sve.c b/third_party/aom/aom_dsp/arm/highbd_variance_sve.c
new file mode 100644
index 0000000000..d0058bfa90
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/highbd_variance_sve.c
@@ -0,0 +1,430 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/variance.h"
+
+// Process a block of width 4 two rows at a time.
+static INLINE void highbd_variance_4xh_sve(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int h, uint64_t *sse,
+ int64_t *sum) {
+ int16x8_t sum_s16 = vdupq_n_s16(0);
+ int64x2_t sse_s64 = vdupq_n_s64(0);
+
+ do {
+ const uint16x8_t s = load_unaligned_u16_4x2(src_ptr, src_stride);
+ const uint16x8_t r = load_unaligned_u16_4x2(ref_ptr, ref_stride);
+
+ int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r));
+ sum_s16 = vaddq_s16(sum_s16, diff);
+
+ sse_s64 = aom_sdotq_s16(sse_s64, diff, diff);
+
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
+ h -= 2;
+ } while (h != 0);
+
+ *sum = vaddlvq_s16(sum_s16);
+ *sse = vaddvq_s64(sse_s64);
+}
+
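+// Accumulate one row of 8 pixel differences: the sum via a widening pairwise
+// add into 32-bit lanes, the sum of squares via the SVE signed dot product
+// into 64-bit lanes.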
+static INLINE void variance_8x1_sve(const uint16_t *src, const uint16_t *ref,
+ int32x4_t *sum, int64x2_t *sse) {
+ const uint16x8_t s = vld1q_u16(src);
+ const uint16x8_t r = vld1q_u16(ref);
+
+ const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r));
+ *sum = vpadalq_s16(*sum, diff);
+
+ *sse = aom_sdotq_s16(*sse, diff, diff);
+}
+
+static INLINE void highbd_variance_8xh_sve(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int h, uint64_t *sse,
+ int64_t *sum) {
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+ int64x2_t sse_s64 = vdupq_n_s64(0);
+
+ do {
+ variance_8x1_sve(src_ptr, ref_ptr, &sum_s32, &sse_s64);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--h != 0);
+
+ *sum = vaddlvq_s32(sum_s32);
+ *sse = vaddvq_s64(sse_s64);
+}
+
+static INLINE void highbd_variance_16xh_sve(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int h,
+ uint64_t *sse, int64_t *sum) {
+ int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ int64x2_t sse_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+ do {
+ variance_8x1_sve(src_ptr, ref_ptr, &sum_s32[0], &sse_s64[0]);
+ variance_8x1_sve(src_ptr + 8, ref_ptr + 8, &sum_s32[1], &sse_s64[1]);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--h != 0);
+
+ *sum = vaddlvq_s32(vaddq_s32(sum_s32[0], sum_s32[1]));
+ *sse = vaddvq_s64(vaddq_s64(sse_s64[0], sse_s64[1]));
+}
+
+static INLINE void highbd_variance_large_sve(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int w, int h,
+ uint64_t *sse, int64_t *sum) {
+ int32x4_t sum_s32[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+ vdupq_n_s32(0) };
+ int64x2_t sse_s64[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
+ vdupq_n_s64(0) };
+
+ do {
+ int j = 0;
+ do {
+ variance_8x1_sve(src_ptr + j, ref_ptr + j, &sum_s32[0], &sse_s64[0]);
+ variance_8x1_sve(src_ptr + j + 8, ref_ptr + j + 8, &sum_s32[1],
+ &sse_s64[1]);
+ variance_8x1_sve(src_ptr + j + 16, ref_ptr + j + 16, &sum_s32[2],
+ &sse_s64[2]);
+ variance_8x1_sve(src_ptr + j + 24, ref_ptr + j + 24, &sum_s32[3],
+ &sse_s64[3]);
+
+ j += 32;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--h != 0);
+
+ sum_s32[0] = vaddq_s32(sum_s32[0], sum_s32[1]);
+ sum_s32[2] = vaddq_s32(sum_s32[2], sum_s32[3]);
+ *sum = vaddlvq_s32(vaddq_s32(sum_s32[0], sum_s32[2]));
+ sse_s64[0] = vaddq_s64(sse_s64[0], sse_s64[1]);
+ sse_s64[2] = vaddq_s64(sse_s64[2], sse_s64[3]);
+ *sse = vaddvq_s64(vaddq_s64(sse_s64[0], sse_s64[2]));
+}
+
+static INLINE void highbd_variance_32xh_sve(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int h, uint64_t *sse,
+ int64_t *sum) {
+ highbd_variance_large_sve(src, src_stride, ref, ref_stride, 32, h, sse, sum);
+}
+
+static INLINE void highbd_variance_64xh_sve(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ int h, uint64_t *sse,
+ int64_t *sum) {
+ highbd_variance_large_sve(src, src_stride, ref, ref_stride, 64, h, sse, sum);
+}
+
+static INLINE void highbd_variance_128xh_sve(const uint16_t *src,
+ int src_stride,
+ const uint16_t *ref,
+ int ref_stride, int h,
+ uint64_t *sse, int64_t *sum) {
+ highbd_variance_large_sve(src, src_stride, ref, ref_stride, 128, h, sse, sum);
+}
+
+#define HBD_VARIANCE_WXH_8_SVE(w, h) \
+ uint32_t aom_highbd_8_variance##w##x##h##_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)sse_long; \
+ sum = (int)sum_long; \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (w * h)); \
+ }
+
+#define HBD_VARIANCE_WXH_10_SVE(w, h) \
+ uint32_t aom_highbd_10_variance##w##x##h##_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \
+ sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+#define HBD_VARIANCE_WXH_12_SVE(w, h) \
+ uint32_t aom_highbd_12_variance##w##x##h##_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint64_t sse_long = 0; \
+ int64_t sum_long = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \
+ &sse_long, &sum_long); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \
+ sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+// 8-bit
+HBD_VARIANCE_WXH_8_SVE(4, 4)
+HBD_VARIANCE_WXH_8_SVE(4, 8)
+
+HBD_VARIANCE_WXH_8_SVE(8, 4)
+HBD_VARIANCE_WXH_8_SVE(8, 8)
+HBD_VARIANCE_WXH_8_SVE(8, 16)
+
+HBD_VARIANCE_WXH_8_SVE(16, 8)
+HBD_VARIANCE_WXH_8_SVE(16, 16)
+HBD_VARIANCE_WXH_8_SVE(16, 32)
+
+HBD_VARIANCE_WXH_8_SVE(32, 16)
+HBD_VARIANCE_WXH_8_SVE(32, 32)
+HBD_VARIANCE_WXH_8_SVE(32, 64)
+
+HBD_VARIANCE_WXH_8_SVE(64, 32)
+HBD_VARIANCE_WXH_8_SVE(64, 64)
+HBD_VARIANCE_WXH_8_SVE(64, 128)
+
+HBD_VARIANCE_WXH_8_SVE(128, 64)
+HBD_VARIANCE_WXH_8_SVE(128, 128)
+
+// 10-bit
+HBD_VARIANCE_WXH_10_SVE(4, 4)
+HBD_VARIANCE_WXH_10_SVE(4, 8)
+
+HBD_VARIANCE_WXH_10_SVE(8, 4)
+HBD_VARIANCE_WXH_10_SVE(8, 8)
+HBD_VARIANCE_WXH_10_SVE(8, 16)
+
+HBD_VARIANCE_WXH_10_SVE(16, 8)
+HBD_VARIANCE_WXH_10_SVE(16, 16)
+HBD_VARIANCE_WXH_10_SVE(16, 32)
+
+HBD_VARIANCE_WXH_10_SVE(32, 16)
+HBD_VARIANCE_WXH_10_SVE(32, 32)
+HBD_VARIANCE_WXH_10_SVE(32, 64)
+
+HBD_VARIANCE_WXH_10_SVE(64, 32)
+HBD_VARIANCE_WXH_10_SVE(64, 64)
+HBD_VARIANCE_WXH_10_SVE(64, 128)
+
+HBD_VARIANCE_WXH_10_SVE(128, 64)
+HBD_VARIANCE_WXH_10_SVE(128, 128)
+
+// 12-bit
+HBD_VARIANCE_WXH_12_SVE(4, 4)
+HBD_VARIANCE_WXH_12_SVE(4, 8)
+
+HBD_VARIANCE_WXH_12_SVE(8, 4)
+HBD_VARIANCE_WXH_12_SVE(8, 8)
+HBD_VARIANCE_WXH_12_SVE(8, 16)
+
+HBD_VARIANCE_WXH_12_SVE(16, 8)
+HBD_VARIANCE_WXH_12_SVE(16, 16)
+HBD_VARIANCE_WXH_12_SVE(16, 32)
+
+HBD_VARIANCE_WXH_12_SVE(32, 16)
+HBD_VARIANCE_WXH_12_SVE(32, 32)
+HBD_VARIANCE_WXH_12_SVE(32, 64)
+
+HBD_VARIANCE_WXH_12_SVE(64, 32)
+HBD_VARIANCE_WXH_12_SVE(64, 64)
+HBD_VARIANCE_WXH_12_SVE(64, 128)
+
+HBD_VARIANCE_WXH_12_SVE(128, 64)
+HBD_VARIANCE_WXH_12_SVE(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+// 8-bit
+HBD_VARIANCE_WXH_8_SVE(4, 16)
+
+HBD_VARIANCE_WXH_8_SVE(8, 32)
+
+HBD_VARIANCE_WXH_8_SVE(16, 4)
+HBD_VARIANCE_WXH_8_SVE(16, 64)
+
+HBD_VARIANCE_WXH_8_SVE(32, 8)
+
+HBD_VARIANCE_WXH_8_SVE(64, 16)
+
+// 10-bit
+HBD_VARIANCE_WXH_10_SVE(4, 16)
+
+HBD_VARIANCE_WXH_10_SVE(8, 32)
+
+HBD_VARIANCE_WXH_10_SVE(16, 4)
+HBD_VARIANCE_WXH_10_SVE(16, 64)
+
+HBD_VARIANCE_WXH_10_SVE(32, 8)
+
+HBD_VARIANCE_WXH_10_SVE(64, 16)
+
+// 12-bit
+HBD_VARIANCE_WXH_12_SVE(4, 16)
+
+HBD_VARIANCE_WXH_12_SVE(8, 32)
+
+HBD_VARIANCE_WXH_12_SVE(16, 4)
+HBD_VARIANCE_WXH_12_SVE(16, 64)
+
+HBD_VARIANCE_WXH_12_SVE(32, 8)
+
+HBD_VARIANCE_WXH_12_SVE(64, 16)
+
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef HBD_VARIANCE_WXH_8_SVE
+#undef HBD_VARIANCE_WXH_10_SVE
+#undef HBD_VARIANCE_WXH_12_SVE
+
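+// The SVE unsigned dot product accumulates the squared differences directly
+// into 64-bit lanes, so no intermediate 32-bit overflow handling is needed
+// for any of the block sizes or bit depths handled here.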
+static INLINE uint32_t highbd_mse_wxh_sve(const uint16_t *src_ptr,
+ int src_stride,
+ const uint16_t *ref_ptr,
+ int ref_stride, int w, int h,
+ unsigned int *sse) {
+ uint64x2_t sse_u64 = vdupq_n_u64(0);
+
+ do {
+ int j = 0;
+ do {
+ uint16x8_t s = vld1q_u16(src_ptr + j);
+ uint16x8_t r = vld1q_u16(ref_ptr + j);
+
+ uint16x8_t diff = vabdq_u16(s, r);
+
+ sse_u64 = aom_udotq_u16(sse_u64, diff, diff);
+
+ j += 8;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--h != 0);
+
+ *sse = (uint32_t)vaddvq_u64(sse_u64);
+ return *sse;
+}
+
+#define HIGHBD_MSE_WXH_SVE(w, h) \
+ uint32_t aom_highbd_8_mse##w##x##h##_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h, sse); \
+ return *sse; \
+ } \
+ \
+ uint32_t aom_highbd_10_mse##w##x##h##_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h, sse); \
+ *sse = ROUND_POWER_OF_TWO(*sse, 4); \
+ return *sse; \
+ } \
+ \
+ uint32_t aom_highbd_12_mse##w##x##h##_sve( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, uint32_t *sse) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \
+ highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h, sse); \
+ *sse = ROUND_POWER_OF_TWO(*sse, 8); \
+ return *sse; \
+ }
+
+HIGHBD_MSE_WXH_SVE(16, 16)
+HIGHBD_MSE_WXH_SVE(16, 8)
+HIGHBD_MSE_WXH_SVE(8, 16)
+HIGHBD_MSE_WXH_SVE(8, 8)
+
+#undef HIGHBD_MSE_WXH_SVE
+
+uint64_t aom_mse_wxh_16bit_highbd_sve(uint16_t *dst, int dstride, uint16_t *src,
+ int sstride, int w, int h) {
+ assert((w == 8 || w == 4) && (h == 8 || h == 4));
+
+ uint64x2_t sum = vdupq_n_u64(0);
+
+ if (w == 8) {
+ do {
+ uint16x8_t d0 = vld1q_u16(dst + 0 * dstride);
+ uint16x8_t d1 = vld1q_u16(dst + 1 * dstride);
+ uint16x8_t s0 = vld1q_u16(src + 0 * sstride);
+ uint16x8_t s1 = vld1q_u16(src + 1 * sstride);
+
+ uint16x8_t abs_diff0 = vabdq_u16(s0, d0);
+ uint16x8_t abs_diff1 = vabdq_u16(s1, d1);
+
+ sum = aom_udotq_u16(sum, abs_diff0, abs_diff0);
+ sum = aom_udotq_u16(sum, abs_diff1, abs_diff1);
+
+ dst += 2 * dstride;
+ src += 2 * sstride;
+ h -= 2;
+ } while (h != 0);
+ } else { // w == 4
+ do {
+ uint16x8_t d0 = load_unaligned_u16_4x2(dst + 0 * dstride, dstride);
+ uint16x8_t d1 = load_unaligned_u16_4x2(dst + 2 * dstride, dstride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src + 0 * sstride, sstride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src + 2 * sstride, sstride);
+
+ uint16x8_t abs_diff0 = vabdq_u16(s0, d0);
+ uint16x8_t abs_diff1 = vabdq_u16(s1, d1);
+
+ sum = aom_udotq_u16(sum, abs_diff0, abs_diff0);
+ sum = aom_udotq_u16(sum, abs_diff1, abs_diff1);
+
+ dst += 4 * dstride;
+ src += 4 * sstride;
+ h -= 4;
+ } while (h != 0);
+ }
+
+ return vaddvq_u64(sum);
+}
diff --git a/third_party/aom/aom_dsp/arm/intrapred_neon.c b/third_party/aom/aom_dsp/arm/intrapred_neon.c
new file mode 100644
index 0000000000..d8dc60c1fe
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/intrapred_neon.c
@@ -0,0 +1,3110 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/reinterpret_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+#include "aom_dsp/intrapred_common.h"
+
+//------------------------------------------------------------------------------
+// DC 4x4
+
+static INLINE uint16x8_t dc_load_sum_4(const uint8_t *in) {
+ const uint8x8_t a = load_u8_4x1(in);
+ const uint16x4_t p0 = vpaddl_u8(a);
+ const uint16x4_t p1 = vpadd_u16(p0, p0);
+ return vcombine_u16(p1, vdup_n_u16(0));
+}
+
+static INLINE void dc_store_4xh(uint8_t *dst, ptrdiff_t stride, int h,
+ uint8x8_t dc) {
+ for (int i = 0; i < h; ++i) {
+ store_u8_4x1(dst + i * stride, dc);
+ }
+}
+
+void aom_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint16x8_t sum_top = dc_load_sum_4(above);
+ const uint16x8_t sum_left = dc_load_sum_4(left);
+ const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+ const uint8x8_t dc0 = vrshrn_n_u16(sum, 3);
+ dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
+}
+
+void aom_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint16x8_t sum_left = dc_load_sum_4(left);
+ const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 2);
+ (void)above;
+ dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
+}
+
+void aom_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint16x8_t sum_top = dc_load_sum_4(above);
+ const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 2);
+ (void)left;
+ dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
+}
+
+void aom_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t dc0 = vdup_n_u8(0x80);
+ (void)above;
+ (void)left;
+ dc_store_4xh(dst, stride, 4, dc0);
+}
+
+//------------------------------------------------------------------------------
+// DC 8x8
+
+static INLINE uint16x8_t dc_load_sum_8(const uint8_t *in) {
+ // This isn't used in the case where we want to load both above and left
+ // vectors, since we want to avoid performing the reduction twice.
+ const uint8x8_t a = vld1_u8(in);
+ const uint16x4_t p0 = vpaddl_u8(a);
+ const uint16x4_t p1 = vpadd_u16(p0, p0);
+ const uint16x4_t p2 = vpadd_u16(p1, p1);
+ return vcombine_u16(p2, vdup_n_u16(0));
+}
+
+static INLINE uint16x8_t horizontal_add_and_broadcast_u16x8(uint16x8_t a) {
+#if AOM_ARCH_AARCH64
+ // On AArch64 we could also use vdupq_n_u16(vaddvq_u16(a)) here to save an
+ // instruction, however the addv instruction is usually slightly more
+ // expensive than a pairwise addition, so the need for immediately
+ // broadcasting the result again seems to negate any benefit.
+ const uint16x8_t b = vpaddq_u16(a, a);
+ const uint16x8_t c = vpaddq_u16(b, b);
+ return vpaddq_u16(c, c);
+#else
+ const uint16x4_t b = vadd_u16(vget_low_u16(a), vget_high_u16(a));
+ const uint16x4_t c = vpadd_u16(b, b);
+ const uint16x4_t d = vpadd_u16(c, c);
+ return vcombine_u16(d, d);
+#endif
+}
+
+static INLINE void dc_store_8xh(uint8_t *dst, ptrdiff_t stride, int h,
+ uint8x8_t dc) {
+ for (int i = 0; i < h; ++i) {
+ vst1_u8(dst + i * stride, dc);
+ }
+}
+
+void aom_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t sum_top = vld1_u8(above);
+ const uint8x8_t sum_left = vld1_u8(left);
+ uint16x8_t sum = vaddl_u8(sum_left, sum_top);
+ sum = horizontal_add_and_broadcast_u16x8(sum);
+ const uint8x8_t dc0 = vrshrn_n_u16(sum, 4);
+ dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0));
+}
+
+void aom_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint16x8_t sum_left = dc_load_sum_8(left);
+ const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 3);
+ (void)above;
+ dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0));
+}
+
+void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint16x8_t sum_top = dc_load_sum_8(above);
+ const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 3);
+ (void)left;
+ dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0));
+}
+
+void aom_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t dc0 = vdup_n_u8(0x80);
+ (void)above;
+ (void)left;
+ dc_store_8xh(dst, stride, 8, dc0);
+}
+
+//------------------------------------------------------------------------------
+// DC 16x16
+
+static INLINE uint16x8_t dc_load_partial_sum_16(const uint8_t *in) {
+ const uint8x16_t a = vld1q_u8(in);
+ // delay the remainder of the reduction until
+ // horizontal_add_and_broadcast_u16x8, since we want to do it once rather
+ // than twice in the case we are loading both above and left.
+ return vpaddlq_u8(a);
+}
+
+static INLINE uint16x8_t dc_load_sum_16(const uint8_t *in) {
+ return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_16(in));
+}
+
+static INLINE void dc_store_16xh(uint8_t *dst, ptrdiff_t stride, int h,
+ uint8x16_t dc) {
+ for (int i = 0; i < h; ++i) {
+ vst1q_u8(dst + i * stride, dc);
+ }
+}
+
+void aom_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint16x8_t sum_top = dc_load_partial_sum_16(above);
+ const uint16x8_t sum_left = dc_load_partial_sum_16(left);
+ uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+ sum = horizontal_add_and_broadcast_u16x8(sum);
+ const uint8x8_t dc0 = vrshrn_n_u16(sum, 5);
+ dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0));
+}
+
+void aom_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const uint16x8_t sum_left = dc_load_sum_16(left);
+ const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 4);
+ (void)above;
+ dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0));
+}
+
+void aom_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const uint16x8_t sum_top = dc_load_sum_16(above);
+ const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 4);
+ (void)left;
+ dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0));
+}
+
+void aom_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const uint8x16_t dc0 = vdupq_n_u8(0x80);
+ (void)above;
+ (void)left;
+ dc_store_16xh(dst, stride, 16, dc0);
+}
+
+//------------------------------------------------------------------------------
+// DC 32x32
+
+static INLINE uint16x8_t dc_load_partial_sum_32(const uint8_t *in) {
+ const uint8x16_t a0 = vld1q_u8(in);
+ const uint8x16_t a1 = vld1q_u8(in + 16);
+ // delay the remainder of the reduction until
+ // horizontal_add_and_broadcast_u16x8, since we want to do it once rather
+ // than twice in the case we are loading both above and left.
+ return vpadalq_u8(vpaddlq_u8(a0), a1);
+}
+
+static INLINE uint16x8_t dc_load_sum_32(const uint8_t *in) {
+ return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_32(in));
+}
+
+static INLINE void dc_store_32xh(uint8_t *dst, ptrdiff_t stride, int h,
+ uint8x16_t dc) {
+ for (int i = 0; i < h; ++i) {
+ vst1q_u8(dst + i * stride, dc);
+ vst1q_u8(dst + i * stride + 16, dc);
+ }
+}
+
+void aom_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint16x8_t sum_top = dc_load_partial_sum_32(above);
+ const uint16x8_t sum_left = dc_load_partial_sum_32(left);
+ uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+ sum = horizontal_add_and_broadcast_u16x8(sum);
+ const uint8x8_t dc0 = vrshrn_n_u16(sum, 6);
+ dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0));
+}
+
+void aom_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const uint16x8_t sum_left = dc_load_sum_32(left);
+ const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 5);
+ (void)above;
+ dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0));
+}
+
+void aom_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const uint16x8_t sum_top = dc_load_sum_32(above);
+ const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 5);
+ (void)left;
+ dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0));
+}
+
+void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const uint8x16_t dc0 = vdupq_n_u8(0x80);
+ (void)above;
+ (void)left;
+ dc_store_32xh(dst, stride, 32, dc0);
+}
+
+//------------------------------------------------------------------------------
+// DC 64x64
+
+static INLINE uint16x8_t dc_load_partial_sum_64(const uint8_t *in) {
+ const uint8x16_t a0 = vld1q_u8(in);
+ const uint8x16_t a1 = vld1q_u8(in + 16);
+ const uint8x16_t a2 = vld1q_u8(in + 32);
+ const uint8x16_t a3 = vld1q_u8(in + 48);
+ const uint16x8_t p01 = vpadalq_u8(vpaddlq_u8(a0), a1);
+ const uint16x8_t p23 = vpadalq_u8(vpaddlq_u8(a2), a3);
+ // delay the remainder of the reduction until
+ // horizontal_add_and_broadcast_u16x8, since we want to do it once rather
+ // than twice in the case we are loading both above and left.
+ return vaddq_u16(p01, p23);
+}
+
+static INLINE uint16x8_t dc_load_sum_64(const uint8_t *in) {
+ return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_64(in));
+}
+
+static INLINE void dc_store_64xh(uint8_t *dst, ptrdiff_t stride, int h,
+ uint8x16_t dc) {
+ for (int i = 0; i < h; ++i) {
+ vst1q_u8(dst + i * stride, dc);
+ vst1q_u8(dst + i * stride + 16, dc);
+ vst1q_u8(dst + i * stride + 32, dc);
+ vst1q_u8(dst + i * stride + 48, dc);
+ }
+}
+
+void aom_dc_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint16x8_t sum_top = dc_load_partial_sum_64(above);
+ const uint16x8_t sum_left = dc_load_partial_sum_64(left);
+ uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+ sum = horizontal_add_and_broadcast_u16x8(sum);
+ const uint8x8_t dc0 = vrshrn_n_u16(sum, 7);
+ dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0));
+}
+
+void aom_dc_left_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const uint16x8_t sum_left = dc_load_sum_64(left);
+ const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 6);
+ (void)above;
+ dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0));
+}
+
+void aom_dc_top_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const uint16x8_t sum_top = dc_load_sum_64(above);
+ const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 6);
+ (void)left;
+ dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0));
+}
+
+void aom_dc_128_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const uint8x16_t dc0 = vdupq_n_u8(0x80);
+ (void)above;
+ (void)left;
+ dc_store_64xh(dst, stride, 64, dc0);
+}
+
+//------------------------------------------------------------------------------
+// DC rectangular cases
+
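+// DC_MULTIPLIER_1X2 and DC_MULTIPLIER_1X4 are Q16 fixed-point reciprocals of
+// 3 and 5; combined with the power-of-two pre-shift in
+// divide_using_multiply_shift() they divide the rounded sum by (bw + bh) for
+// the 1:2 and 1:4 rectangular block shapes.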
+#define DC_MULTIPLIER_1X2 0x5556
+#define DC_MULTIPLIER_1X4 0x3334
+
+#define DC_SHIFT2 16
+
+static INLINE int divide_using_multiply_shift(int num, int shift1,
+ int multiplier, int shift2) {
+ const int interm = num >> shift1;
+ return interm * multiplier >> shift2;
+}
+
+static INLINE int calculate_dc_from_sum(int bw, int bh, uint32_t sum,
+ int shift1, int multiplier) {
+ const int expected_dc = divide_using_multiply_shift(
+ sum + ((bw + bh) >> 1), shift1, multiplier, DC_SHIFT2);
+ assert(expected_dc < (1 << 8));
+ return expected_dc;
+}
+
+#undef DC_SHIFT2
+
+void aom_dc_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t a = load_u8_4x1(above);
+ uint8x8_t l = vld1_u8(left);
+ uint32_t sum = horizontal_add_u16x8(vaddl_u8(a, l));
+ uint32_t dc = calculate_dc_from_sum(4, 8, sum, 2, DC_MULTIPLIER_1X2);
+ dc_store_4xh(dst, stride, 8, vdup_n_u8(dc));
+}
+
+void aom_dc_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t a = vld1_u8(above);
+ uint8x8_t l = load_u8_4x1(left);
+ uint32_t sum = horizontal_add_u16x8(vaddl_u8(a, l));
+ uint32_t dc = calculate_dc_from_sum(8, 4, sum, 2, DC_MULTIPLIER_1X2);
+ dc_store_8xh(dst, stride, 4, vdup_n_u8(dc));
+}
+
+void aom_dc_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t a = load_u8_4x1(above);
+ uint8x16_t l = vld1q_u8(left);
+ uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(l), a);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(4, 16, sum, 2, DC_MULTIPLIER_1X4);
+ dc_store_4xh(dst, stride, 16, vdup_n_u8(dc));
+}
+
+void aom_dc_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x16_t a = vld1q_u8(above);
+ uint8x8_t l = load_u8_4x1(left);
+ uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(a), l);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(16, 4, sum, 2, DC_MULTIPLIER_1X4);
+ dc_store_16xh(dst, stride, 4, vdupq_n_u8(dc));
+}
+
+void aom_dc_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t a = vld1_u8(above);
+ uint8x16_t l = vld1q_u8(left);
+ uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(l), a);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(8, 16, sum, 3, DC_MULTIPLIER_1X2);
+ dc_store_8xh(dst, stride, 16, vdup_n_u8(dc));
+}
+
+void aom_dc_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x16_t a = vld1q_u8(above);
+ uint8x8_t l = vld1_u8(left);
+ uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(a), l);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(16, 8, sum, 3, DC_MULTIPLIER_1X2);
+ dc_store_16xh(dst, stride, 8, vdupq_n_u8(dc));
+}
+
+void aom_dc_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t a = vld1_u8(above);
+ uint16x8_t sum_left = dc_load_partial_sum_32(left);
+ uint16x8_t sum_al = vaddw_u8(sum_left, a);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(8, 32, sum, 3, DC_MULTIPLIER_1X4);
+ dc_store_8xh(dst, stride, 32, vdup_n_u8(dc));
+}
+
+void aom_dc_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint16x8_t sum_top = dc_load_partial_sum_32(above);
+ uint8x8_t l = vld1_u8(left);
+ uint16x8_t sum_al = vaddw_u8(sum_top, l);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(32, 8, sum, 3, DC_MULTIPLIER_1X4);
+ dc_store_32xh(dst, stride, 8, vdupq_n_u8(dc));
+}
+
+void aom_dc_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint16x8_t sum_above = dc_load_partial_sum_16(above);
+ uint16x8_t sum_left = dc_load_partial_sum_32(left);
+ uint16x8_t sum_al = vaddq_u16(sum_left, sum_above);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(16, 32, sum, 4, DC_MULTIPLIER_1X2);
+ dc_store_16xh(dst, stride, 32, vdupq_n_u8(dc));
+}
+
+void aom_dc_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint16x8_t sum_above = dc_load_partial_sum_32(above);
+ uint16x8_t sum_left = dc_load_partial_sum_16(left);
+ uint16x8_t sum_al = vaddq_u16(sum_left, sum_above);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(32, 16, sum, 4, DC_MULTIPLIER_1X2);
+ dc_store_32xh(dst, stride, 16, vdupq_n_u8(dc));
+}
+
+void aom_dc_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint16x8_t sum_above = dc_load_partial_sum_16(above);
+ uint16x8_t sum_left = dc_load_partial_sum_64(left);
+ uint16x8_t sum_al = vaddq_u16(sum_left, sum_above);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(16, 64, sum, 4, DC_MULTIPLIER_1X4);
+ dc_store_16xh(dst, stride, 64, vdupq_n_u8(dc));
+}
+
+void aom_dc_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint16x8_t sum_above = dc_load_partial_sum_64(above);
+ uint16x8_t sum_left = dc_load_partial_sum_16(left);
+ uint16x8_t sum_al = vaddq_u16(sum_above, sum_left);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(64, 16, sum, 4, DC_MULTIPLIER_1X4);
+ dc_store_64xh(dst, stride, 16, vdupq_n_u8(dc));
+}
+
+void aom_dc_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint16x8_t sum_above = dc_load_partial_sum_32(above);
+ uint16x8_t sum_left = dc_load_partial_sum_64(left);
+ uint16x8_t sum_al = vaddq_u16(sum_above, sum_left);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(32, 64, sum, 5, DC_MULTIPLIER_1X2);
+ dc_store_32xh(dst, stride, 64, vdupq_n_u8(dc));
+}
+
+void aom_dc_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint16x8_t sum_above = dc_load_partial_sum_64(above);
+ uint16x8_t sum_left = dc_load_partial_sum_32(left);
+ uint16x8_t sum_al = vaddq_u16(sum_above, sum_left);
+ uint32_t sum = horizontal_add_u16x8(sum_al);
+ uint32_t dc = calculate_dc_from_sum(64, 32, sum, 5, DC_MULTIPLIER_1X2);
+ dc_store_64xh(dst, stride, 32, vdupq_n_u8(dc));
+}
+
+#undef DC_MULTIPLIER_1X2
+#undef DC_MULTIPLIER_1X4
+
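+// DC_128: no reference pixels are used; the block is filled with the
+// mid-grey value 0x80 (128 for 8-bit content).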
+#define DC_PREDICTOR_128(w, h, q) \
+ void aom_dc_128_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \
+ const uint8_t *above, \
+ const uint8_t *left) { \
+ (void)above; \
+ (void)left; \
+ dc_store_##w##xh(dst, stride, (h), vdup##q##_n_u8(0x80)); \
+ }
+
+DC_PREDICTOR_128(4, 8, )
+DC_PREDICTOR_128(4, 16, )
+DC_PREDICTOR_128(8, 4, )
+DC_PREDICTOR_128(8, 16, )
+DC_PREDICTOR_128(8, 32, )
+DC_PREDICTOR_128(16, 4, q)
+DC_PREDICTOR_128(16, 8, q)
+DC_PREDICTOR_128(16, 32, q)
+DC_PREDICTOR_128(16, 64, q)
+DC_PREDICTOR_128(32, 8, q)
+DC_PREDICTOR_128(32, 16, q)
+DC_PREDICTOR_128(32, 64, q)
+DC_PREDICTOR_128(64, 32, q)
+DC_PREDICTOR_128(64, 16, q)
+
+#undef DC_PREDICTOR_128
+
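+// DC_LEFT: predict from the rounded average of the left column only, so the
+// shift parameter is log2(h).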
+#define DC_PREDICTOR_LEFT(w, h, shift, q) \
+ void aom_dc_left_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \
+ const uint8_t *above, \
+ const uint8_t *left) { \
+ (void)above; \
+ const uint16x8_t sum = dc_load_sum_##h(left); \
+ const uint8x8_t dc0 = vrshrn_n_u16(sum, (shift)); \
+ dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u8(dc0, 0)); \
+ }
+
+DC_PREDICTOR_LEFT(4, 8, 3, )
+DC_PREDICTOR_LEFT(8, 4, 2, )
+DC_PREDICTOR_LEFT(8, 16, 4, )
+DC_PREDICTOR_LEFT(16, 8, 3, q)
+DC_PREDICTOR_LEFT(16, 32, 5, q)
+DC_PREDICTOR_LEFT(32, 16, 4, q)
+DC_PREDICTOR_LEFT(32, 64, 6, q)
+DC_PREDICTOR_LEFT(64, 32, 5, q)
+DC_PREDICTOR_LEFT(4, 16, 4, )
+DC_PREDICTOR_LEFT(16, 4, 2, q)
+DC_PREDICTOR_LEFT(8, 32, 5, )
+DC_PREDICTOR_LEFT(32, 8, 3, q)
+DC_PREDICTOR_LEFT(16, 64, 6, q)
+DC_PREDICTOR_LEFT(64, 16, 4, q)
+
+#undef DC_PREDICTOR_LEFT
+
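+// DC_TOP: predict from the rounded average of the above row only, so the
+// shift parameter is log2(w).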
+#define DC_PREDICTOR_TOP(w, h, shift, q) \
+ void aom_dc_top_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \
+ const uint8_t *above, \
+ const uint8_t *left) { \
+ (void)left; \
+ const uint16x8_t sum = dc_load_sum_##w(above); \
+ const uint8x8_t dc0 = vrshrn_n_u16(sum, (shift)); \
+ dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u8(dc0, 0)); \
+ }
+
+DC_PREDICTOR_TOP(4, 8, 2, )
+DC_PREDICTOR_TOP(4, 16, 2, )
+DC_PREDICTOR_TOP(8, 4, 3, )
+DC_PREDICTOR_TOP(8, 16, 3, )
+DC_PREDICTOR_TOP(8, 32, 3, )
+DC_PREDICTOR_TOP(16, 4, 4, q)
+DC_PREDICTOR_TOP(16, 8, 4, q)
+DC_PREDICTOR_TOP(16, 32, 4, q)
+DC_PREDICTOR_TOP(16, 64, 4, q)
+DC_PREDICTOR_TOP(32, 8, 5, q)
+DC_PREDICTOR_TOP(32, 16, 5, q)
+DC_PREDICTOR_TOP(32, 64, 5, q)
+DC_PREDICTOR_TOP(64, 16, 6, q)
+DC_PREDICTOR_TOP(64, 32, 6, q)
+
+#undef DC_PREDICTOR_TOP
+
+// -----------------------------------------------------------------------------
+
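+// Vertical prediction: every row of the block is a copy of the above row.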
+static INLINE void v_store_4xh(uint8_t *dst, ptrdiff_t stride, int h,
+ uint8x8_t d0) {
+ for (int i = 0; i < h; ++i) {
+ store_u8_4x1(dst + i * stride, d0);
+ }
+}
+
+static INLINE void v_store_8xh(uint8_t *dst, ptrdiff_t stride, int h,
+ uint8x8_t d0) {
+ for (int i = 0; i < h; ++i) {
+ vst1_u8(dst + i * stride, d0);
+ }
+}
+
+static INLINE void v_store_16xh(uint8_t *dst, ptrdiff_t stride, int h,
+ uint8x16_t d0) {
+ for (int i = 0; i < h; ++i) {
+ vst1q_u8(dst + i * stride, d0);
+ }
+}
+
+static INLINE void v_store_32xh(uint8_t *dst, ptrdiff_t stride, int h,
+ uint8x16_t d0, uint8x16_t d1) {
+ for (int i = 0; i < h; ++i) {
+ vst1q_u8(dst + 0, d0);
+ vst1q_u8(dst + 16, d1);
+ dst += stride;
+ }
+}
+
+static INLINE void v_store_64xh(uint8_t *dst, ptrdiff_t stride, int h,
+ uint8x16_t d0, uint8x16_t d1, uint8x16_t d2,
+ uint8x16_t d3) {
+ for (int i = 0; i < h; ++i) {
+ vst1q_u8(dst + 0, d0);
+ vst1q_u8(dst + 16, d1);
+ vst1q_u8(dst + 32, d2);
+ vst1q_u8(dst + 48, d3);
+ dst += stride;
+ }
+}
+
+void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_4xh(dst, stride, 4, load_u8_4x1(above));
+}
+
+void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_8xh(dst, stride, 8, vld1_u8(above));
+}
+
+void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_16xh(dst, stride, 16, vld1q_u8(above));
+}
+
+void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(above);
+ const uint8x16_t d1 = vld1q_u8(above + 16);
+ (void)left;
+ v_store_32xh(dst, stride, 32, d0, d1);
+}
+
+void aom_v_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_4xh(dst, stride, 8, load_u8_4x1(above));
+}
+
+void aom_v_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_4xh(dst, stride, 16, load_u8_4x1(above));
+}
+
+void aom_v_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_8xh(dst, stride, 4, vld1_u8(above));
+}
+
+void aom_v_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_8xh(dst, stride, 16, vld1_u8(above));
+}
+
+void aom_v_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_8xh(dst, stride, 32, vld1_u8(above));
+}
+
+void aom_v_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_16xh(dst, stride, 4, vld1q_u8(above));
+}
+
+void aom_v_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_16xh(dst, stride, 8, vld1q_u8(above));
+}
+
+void aom_v_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_16xh(dst, stride, 32, vld1q_u8(above));
+}
+
+void aom_v_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_store_16xh(dst, stride, 64, vld1q_u8(above));
+}
+
+void aom_v_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(above);
+ const uint8x16_t d1 = vld1q_u8(above + 16);
+ (void)left;
+ v_store_32xh(dst, stride, 8, d0, d1);
+}
+
+void aom_v_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(above);
+ const uint8x16_t d1 = vld1q_u8(above + 16);
+ (void)left;
+ v_store_32xh(dst, stride, 16, d0, d1);
+}
+
+void aom_v_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(above);
+ const uint8x16_t d1 = vld1q_u8(above + 16);
+ (void)left;
+ v_store_32xh(dst, stride, 64, d0, d1);
+}
+
+void aom_v_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(above);
+ const uint8x16_t d1 = vld1q_u8(above + 16);
+ const uint8x16_t d2 = vld1q_u8(above + 32);
+ const uint8x16_t d3 = vld1q_u8(above + 48);
+ (void)left;
+ v_store_64xh(dst, stride, 16, d0, d1, d2, d3);
+}
+
+void aom_v_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(above);
+ const uint8x16_t d1 = vld1q_u8(above + 16);
+ const uint8x16_t d2 = vld1q_u8(above + 32);
+ const uint8x16_t d3 = vld1q_u8(above + 48);
+ (void)left;
+ v_store_64xh(dst, stride, 32, d0, d1, d2, d3);
+}
+
+void aom_v_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(above);
+ const uint8x16_t d1 = vld1q_u8(above + 16);
+ const uint8x16_t d2 = vld1q_u8(above + 32);
+ const uint8x16_t d3 = vld1q_u8(above + 48);
+ (void)left;
+ v_store_64xh(dst, stride, 64, d0, d1, d2, d3);
+}
+
+// -----------------------------------------------------------------------------
+
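+// Horizontal prediction: row i of the block is filled with left[i].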
+static INLINE void h_store_4x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
+ store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0));
+ store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1));
+ store_u8_4x1(dst + 2 * stride, vdup_lane_u8(d0, 2));
+ store_u8_4x1(dst + 3 * stride, vdup_lane_u8(d0, 3));
+ store_u8_4x1(dst + 4 * stride, vdup_lane_u8(d0, 4));
+ store_u8_4x1(dst + 5 * stride, vdup_lane_u8(d0, 5));
+ store_u8_4x1(dst + 6 * stride, vdup_lane_u8(d0, 6));
+ store_u8_4x1(dst + 7 * stride, vdup_lane_u8(d0, 7));
+}
+
+static INLINE void h_store_8x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
+ vst1_u8(dst + 0 * stride, vdup_lane_u8(d0, 0));
+ vst1_u8(dst + 1 * stride, vdup_lane_u8(d0, 1));
+ vst1_u8(dst + 2 * stride, vdup_lane_u8(d0, 2));
+ vst1_u8(dst + 3 * stride, vdup_lane_u8(d0, 3));
+ vst1_u8(dst + 4 * stride, vdup_lane_u8(d0, 4));
+ vst1_u8(dst + 5 * stride, vdup_lane_u8(d0, 5));
+ vst1_u8(dst + 6 * stride, vdup_lane_u8(d0, 6));
+ vst1_u8(dst + 7 * stride, vdup_lane_u8(d0, 7));
+}
+
+static INLINE void h_store_16x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
+ vst1q_u8(dst + 0 * stride, vdupq_lane_u8(d0, 0));
+ vst1q_u8(dst + 1 * stride, vdupq_lane_u8(d0, 1));
+ vst1q_u8(dst + 2 * stride, vdupq_lane_u8(d0, 2));
+ vst1q_u8(dst + 3 * stride, vdupq_lane_u8(d0, 3));
+ vst1q_u8(dst + 4 * stride, vdupq_lane_u8(d0, 4));
+ vst1q_u8(dst + 5 * stride, vdupq_lane_u8(d0, 5));
+ vst1q_u8(dst + 6 * stride, vdupq_lane_u8(d0, 6));
+ vst1q_u8(dst + 7 * stride, vdupq_lane_u8(d0, 7));
+}
+
+static INLINE void h_store_32x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 0));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 0));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 1));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 1));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 2));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 2));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 3));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 3));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 4));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 4));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 5));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 5));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 6));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 6));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 7));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 7));
+}
+
+static INLINE void h_store_64x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 0));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 0));
+ vst1q_u8(dst + 32, vdupq_lane_u8(d0, 0));
+ vst1q_u8(dst + 48, vdupq_lane_u8(d0, 0));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 1));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 1));
+ vst1q_u8(dst + 32, vdupq_lane_u8(d0, 1));
+ vst1q_u8(dst + 48, vdupq_lane_u8(d0, 1));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 2));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 2));
+ vst1q_u8(dst + 32, vdupq_lane_u8(d0, 2));
+ vst1q_u8(dst + 48, vdupq_lane_u8(d0, 2));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 3));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 3));
+ vst1q_u8(dst + 32, vdupq_lane_u8(d0, 3));
+ vst1q_u8(dst + 48, vdupq_lane_u8(d0, 3));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 4));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 4));
+ vst1q_u8(dst + 32, vdupq_lane_u8(d0, 4));
+ vst1q_u8(dst + 48, vdupq_lane_u8(d0, 4));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 5));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 5));
+ vst1q_u8(dst + 32, vdupq_lane_u8(d0, 5));
+ vst1q_u8(dst + 48, vdupq_lane_u8(d0, 5));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 6));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 6));
+ vst1q_u8(dst + 32, vdupq_lane_u8(d0, 6));
+ vst1q_u8(dst + 48, vdupq_lane_u8(d0, 6));
+ dst += stride;
+ vst1q_u8(dst + 0, vdupq_lane_u8(d0, 7));
+ vst1q_u8(dst + 16, vdupq_lane_u8(d0, 7));
+ vst1q_u8(dst + 32, vdupq_lane_u8(d0, 7));
+ vst1q_u8(dst + 48, vdupq_lane_u8(d0, 7));
+}
+
+void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t d0 = load_u8_4x1(left);
+ (void)above;
+ store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0));
+ store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1));
+ store_u8_4x1(dst + 2 * stride, vdup_lane_u8(d0, 2));
+ store_u8_4x1(dst + 3 * stride, vdup_lane_u8(d0, 3));
+}
+
+void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t d0 = vld1_u8(left);
+ (void)above;
+ h_store_8x8(dst, stride, d0);
+}
+
+void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(left);
+ (void)above;
+ h_store_16x8(dst, stride, vget_low_u8(d0));
+ h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0));
+}
+
+void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(left);
+ const uint8x16_t d1 = vld1q_u8(left + 16);
+ (void)above;
+ h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0));
+ h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0));
+ h_store_32x8(dst + 16 * stride, stride, vget_low_u8(d1));
+ h_store_32x8(dst + 24 * stride, stride, vget_high_u8(d1));
+}
+
+void aom_h_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t d0 = vld1_u8(left);
+ (void)above;
+ h_store_4x8(dst, stride, d0);
+}
+
+void aom_h_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(left);
+ (void)above;
+ h_store_4x8(dst + 0 * stride, stride, vget_low_u8(d0));
+ h_store_4x8(dst + 8 * stride, stride, vget_high_u8(d0));
+}
+
+void aom_h_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t d0 = load_u8_4x1(left);
+ (void)above;
+ vst1_u8(dst + 0 * stride, vdup_lane_u8(d0, 0));
+ vst1_u8(dst + 1 * stride, vdup_lane_u8(d0, 1));
+ vst1_u8(dst + 2 * stride, vdup_lane_u8(d0, 2));
+ vst1_u8(dst + 3 * stride, vdup_lane_u8(d0, 3));
+}
+
+void aom_h_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(left);
+ (void)above;
+ h_store_8x8(dst + 0 * stride, stride, vget_low_u8(d0));
+ h_store_8x8(dst + 8 * stride, stride, vget_high_u8(d0));
+}
+
+void aom_h_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(left);
+ const uint8x16_t d1 = vld1q_u8(left + 16);
+ (void)above;
+ h_store_8x8(dst + 0 * stride, stride, vget_low_u8(d0));
+ h_store_8x8(dst + 8 * stride, stride, vget_high_u8(d0));
+ h_store_8x8(dst + 16 * stride, stride, vget_low_u8(d1));
+ h_store_8x8(dst + 24 * stride, stride, vget_high_u8(d1));
+}
+
+void aom_h_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t d0 = load_u8_4x1(left);
+ (void)above;
+ vst1q_u8(dst + 0 * stride, vdupq_lane_u8(d0, 0));
+ vst1q_u8(dst + 1 * stride, vdupq_lane_u8(d0, 1));
+ vst1q_u8(dst + 2 * stride, vdupq_lane_u8(d0, 2));
+ vst1q_u8(dst + 3 * stride, vdupq_lane_u8(d0, 3));
+}
+
+void aom_h_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t d0 = vld1_u8(left);
+ (void)above;
+ h_store_16x8(dst, stride, d0);
+}
+
+void aom_h_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(left);
+ const uint8x16_t d1 = vld1q_u8(left + 16);
+ (void)above;
+ h_store_16x8(dst + 0 * stride, stride, vget_low_u8(d0));
+ h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0));
+ h_store_16x8(dst + 16 * stride, stride, vget_low_u8(d1));
+ h_store_16x8(dst + 24 * stride, stride, vget_high_u8(d1));
+}
+
+void aom_h_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(left);
+ const uint8x16_t d1 = vld1q_u8(left + 16);
+ const uint8x16_t d2 = vld1q_u8(left + 32);
+ const uint8x16_t d3 = vld1q_u8(left + 48);
+ (void)above;
+ h_store_16x8(dst + 0 * stride, stride, vget_low_u8(d0));
+ h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0));
+ h_store_16x8(dst + 16 * stride, stride, vget_low_u8(d1));
+ h_store_16x8(dst + 24 * stride, stride, vget_high_u8(d1));
+ h_store_16x8(dst + 32 * stride, stride, vget_low_u8(d2));
+ h_store_16x8(dst + 40 * stride, stride, vget_high_u8(d2));
+ h_store_16x8(dst + 48 * stride, stride, vget_low_u8(d3));
+ h_store_16x8(dst + 56 * stride, stride, vget_high_u8(d3));
+}
+
+void aom_h_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x8_t d0 = vld1_u8(left);
+ (void)above;
+ h_store_32x8(dst, stride, d0);
+}
+
+void aom_h_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(left);
+ (void)above;
+ h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0));
+ h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0));
+}
+
+void aom_h_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(left + 0);
+ const uint8x16_t d1 = vld1q_u8(left + 16);
+ const uint8x16_t d2 = vld1q_u8(left + 32);
+ const uint8x16_t d3 = vld1q_u8(left + 48);
+ (void)above;
+ h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0));
+ h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0));
+ h_store_32x8(dst + 16 * stride, stride, vget_low_u8(d1));
+ h_store_32x8(dst + 24 * stride, stride, vget_high_u8(d1));
+ h_store_32x8(dst + 32 * stride, stride, vget_low_u8(d2));
+ h_store_32x8(dst + 40 * stride, stride, vget_high_u8(d2));
+ h_store_32x8(dst + 48 * stride, stride, vget_low_u8(d3));
+ h_store_32x8(dst + 56 * stride, stride, vget_high_u8(d3));
+}
+
+void aom_h_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint8x16_t d0 = vld1q_u8(left);
+ (void)above;
+ h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0));
+ h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0));
+}
+
+void aom_h_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ for (int i = 0; i < 2; ++i) {
+ const uint8x16_t d0 = vld1q_u8(left);
+ h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0));
+ h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0));
+ left += 16;
+ dst += 16 * stride;
+ }
+}
+
+void aom_h_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ for (int i = 0; i < 4; ++i) {
+ const uint8x16_t d0 = vld1q_u8(left);
+ h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0));
+ h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0));
+ left += 16;
+ dst += 16 * stride;
+ }
+}
+
+/* ---------------------P R E D I C T I O N Z 1--------------------------- */
+
+// Low bit depth functions
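+// BaseMask[i] has its first i bytes set to 0xff and the rest zero. It is used
+// to select the interpolated value for lanes still inside the above buffer
+// and the replicated above[max_base_x] pixel for lanes past it.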
+static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+};
+
+static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon_64(
+ int H, int W, uint8x8_t *dst, const uint8_t *above, int upsample_above,
+ int dx) {
+ const int frac_bits = 6 - upsample_above;
+ const int max_base_x = ((W + H) - 1) << upsample_above;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
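+ // e.g. with above[x] = 100, above[x+1] = 132 and shift = 12 (a fractional
+ // position of 12/32), the result is
+ // (100 * 32 + 16 + (132 - 100) * 12) >> 5 = 3600 >> 5 = 112.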
+
+ const uint8x8_t a_mbase_x = vdup_n_u8(above[max_base_x]);
+
+ int x = dx;
+ for (int r = 0; r < W; r++) {
+ int base = x >> frac_bits;
+ int base_max_diff = (max_base_x - base) >> upsample_above;
+ if (base_max_diff <= 0) {
+ for (int i = r; i < W; ++i) {
+ dst[i] = a_mbase_x; // fill the remaining rows with above[max_base_x]
+ }
+ return;
+ }
+
+ if (base_max_diff > H) base_max_diff = H;
+
+ uint8x8x2_t a01_128;
+ uint16x8_t shift;
+ if (upsample_above) {
+ a01_128 = vld2_u8(above + base);
+ shift = vdupq_n_u16(((x << upsample_above) & 0x3f) >> 1);
+ } else {
+ a01_128.val[0] = vld1_u8(above + base);
+ a01_128.val[1] = vld1_u8(above + base + 1);
+ shift = vdupq_n_u16((x & 0x3f) >> 1);
+ }
+ uint16x8_t diff = vsubl_u8(a01_128.val[1], a01_128.val[0]);
+ uint16x8_t a32 = vmlal_u8(vdupq_n_u16(16), a01_128.val[0], vdup_n_u8(32));
+ uint16x8_t res = vmlaq_u16(a32, diff, shift);
+
+ uint8x8_t mask = vld1_u8(BaseMask[base_max_diff]);
+ dst[r] = vbsl_u8(mask, vshrn_n_u16(res, 5), a_mbase_x);
+
+ x += dx;
+ }
+}
+
+static void dr_prediction_z1_4xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int upsample_above,
+ int dx) {
+ uint8x8_t dstvec[16];
+
+ dr_prediction_z1_HxW_internal_neon_64(4, N, dstvec, above, upsample_above,
+ dx);
+ for (int i = 0; i < N; i++) {
+ vst1_lane_u32((uint32_t *)(dst + stride * i),
+ vreinterpret_u32_u8(dstvec[i]), 0);
+ }
+}
+
+static void dr_prediction_z1_8xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int upsample_above,
+ int dx) {
+ uint8x8_t dstvec[32];
+
+ dr_prediction_z1_HxW_internal_neon_64(8, N, dstvec, above, upsample_above,
+ dx);
+ for (int i = 0; i < N; i++) {
+ vst1_u8(dst + stride * i, dstvec[i]);
+ }
+}
+
+static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon(
+ int H, int W, uint8x16_t *dst, const uint8_t *above, int upsample_above,
+ int dx) {
+ const int frac_bits = 6 - upsample_above;
+ const int max_base_x = ((W + H) - 1) << upsample_above;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+
+ const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]);
+
+ int x = dx;
+ for (int r = 0; r < W; r++) {
+ int base = x >> frac_bits;
+ int base_max_diff = (max_base_x - base) >> upsample_above;
+ if (base_max_diff <= 0) {
+ for (int i = r; i < W; ++i) {
+ dst[i] = a_mbase_x; // fill the remaining rows with above[max_base_x]
+ }
+ return;
+ }
+
+ if (base_max_diff > H) base_max_diff = H;
+
+ uint16x8_t shift;
+ uint8x16_t a0_128, a1_128;
+ if (upsample_above) {
+ uint8x8x2_t v_tmp_a0_128 = vld2_u8(above + base);
+ a0_128 = vcombine_u8(v_tmp_a0_128.val[0], v_tmp_a0_128.val[1]);
+ a1_128 = vextq_u8(a0_128, vdupq_n_u8(0), 8);
+ shift = vdupq_n_u16(x & 0x1f);
+ } else {
+ a0_128 = vld1q_u8(above + base);
+ a1_128 = vld1q_u8(above + base + 1);
+ shift = vdupq_n_u16((x & 0x3f) >> 1);
+ }
+ uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128));
+ uint16x8_t diff_hi = vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128));
+ uint16x8_t a32_lo =
+ vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32));
+ uint16x8_t a32_hi =
+ vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32));
+ uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift);
+ uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift);
+ uint8x16_t v_temp =
+ vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5));
+
+ uint8x16_t mask = vld1q_u8(BaseMask[base_max_diff]);
+ dst[r] = vbslq_u8(mask, v_temp, a_mbase_x);
+
+ x += dx;
+ }
+}
+
+static void dr_prediction_z1_16xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int upsample_above,
+ int dx) {
+ uint8x16_t dstvec[64];
+
+ dr_prediction_z1_HxW_internal_neon(16, N, dstvec, above, upsample_above, dx);
+ for (int i = 0; i < N; i++) {
+ vst1q_u8(dst + stride * i, dstvec[i]);
+ }
+}
+
+static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_neon(
+ int N, uint8x16x2_t *dstvec, const uint8_t *above, int dx) {
+ const int frac_bits = 6;
+ const int max_base_x = ((32 + N) - 1);
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+
+ const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]);
+
+ int x = dx;
+ for (int r = 0; r < N; r++) {
+ int base = x >> frac_bits;
+ int base_max_diff = (max_base_x - base);
+ if (base_max_diff <= 0) {
+ for (int i = r; i < N; ++i) {
+ dstvec[i].val[0] = a_mbase_x; // save 32 values
+ dstvec[i].val[1] = a_mbase_x;
+ }
+ return;
+ }
+ if (base_max_diff > 32) base_max_diff = 32;
+
+ uint16x8_t shift = vdupq_n_u16((x & 0x3f) >> 1);
+
+ uint8x16_t res16[2];
+ for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
+ int mdiff = base_max_diff - j;
+ if (mdiff <= 0) {
+ res16[jj] = a_mbase_x;
+ } else {
+ uint8x16_t a0_128 = vld1q_u8(above + base + j);
+ uint8x16_t a1_128 = vld1q_u8(above + base + j + 1);
+ uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128));
+ uint16x8_t diff_hi =
+ vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128));
+ uint16x8_t a32_lo =
+ vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32));
+ uint16x8_t a32_hi =
+ vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32));
+ uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift);
+ uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift);
+
+ res16[jj] = vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5));
+ }
+ }
+
+ uint8x16_t mask_lo = vld1q_u8(BaseMask[base_max_diff]);
+ uint8x16_t mask_hi = vld1q_u8(BaseMask[base_max_diff] + 16);
+ dstvec[r].val[0] = vbslq_u8(mask_lo, res16[0], a_mbase_x);
+ dstvec[r].val[1] = vbslq_u8(mask_hi, res16[1], a_mbase_x);
+ x += dx;
+ }
+}
+
+static void dr_prediction_z1_32xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int dx) {
+ uint8x16x2_t dstvec[64];
+
+ dr_prediction_z1_32xN_internal_neon(N, dstvec, above, dx);
+ for (int i = 0; i < N; i++) {
+ vst1q_u8(dst + stride * i, dstvec[i].val[0]);
+ vst1q_u8(dst + stride * i + 16, dstvec[i].val[1]);
+ }
+}
+
+static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int dx) {
+ const int frac_bits = 6;
+ const int max_base_x = ((64 + N) - 1);
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+
+ const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]);
+ const uint8x16_t max_base_x128 = vdupq_n_u8(max_base_x);
+
+ int x = dx;
+ for (int r = 0; r < N; r++, dst += stride) {
+ int base = x >> frac_bits;
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ vst1q_u8(dst, a_mbase_x);
+ vst1q_u8(dst + 16, a_mbase_x);
+ vst1q_u8(dst + 32, a_mbase_x);
+ vst1q_u8(dst + 48, a_mbase_x);
+ dst += stride;
+ }
+ return;
+ }
+
+ uint16x8_t shift = vdupq_n_u16((x & 0x3f) >> 1);
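+ // base_inc128 = base + {0, 1, ..., 15}: the above[] index of each lane.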
+ uint8x16_t base_inc128 =
+ vaddq_u8(vdupq_n_u8(base), vcombine_u8(vcreate_u8(0x0706050403020100),
+ vcreate_u8(0x0F0E0D0C0B0A0908)));
+
+ for (int j = 0; j < 64; j += 16) {
+ int mdiff = max_base_x - (base + j);
+ if (mdiff <= 0) {
+ vst1q_u8(dst + j, a_mbase_x);
+ } else {
+ uint8x16_t a0_128 = vld1q_u8(above + base + j);
+ uint8x16_t a1_128 = vld1q_u8(above + base + 1 + j);
+ uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128));
+ uint16x8_t diff_hi =
+ vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128));
+ uint16x8_t a32_lo =
+ vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32));
+ uint16x8_t a32_hi =
+ vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32));
+ uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift);
+ uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift);
+ uint8x16_t v_temp =
+ vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5));
+
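+ // Lanes whose above[] index is still below max_base_x keep the
+ // interpolated value; the rest take the replicated above[max_base_x] pixel.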
+ uint8x16_t mask128 =
+ vcgtq_u8(vqsubq_u8(max_base_x128, base_inc128), vdupq_n_u8(0));
+ uint8x16_t res128 = vbslq_u8(mask128, v_temp, a_mbase_x);
+ vst1q_u8(dst + j, res128);
+
+ base_inc128 = vaddq_u8(base_inc128, vdupq_n_u8(16));
+ }
+ }
+ x += dx;
+ }
+}
+
+// Directional prediction, zone 1: 0 < angle < 90
+void av1_dr_prediction_z1_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int dx, int dy) {
+ (void)left;
+ (void)dy;
+
+ switch (bw) {
+ case 4:
+ dr_prediction_z1_4xN_neon(bh, dst, stride, above, upsample_above, dx);
+ break;
+ case 8:
+ dr_prediction_z1_8xN_neon(bh, dst, stride, above, upsample_above, dx);
+ break;
+ case 16:
+ dr_prediction_z1_16xN_neon(bh, dst, stride, above, upsample_above, dx);
+ break;
+ case 32: dr_prediction_z1_32xN_neon(bh, dst, stride, above, dx); break;
+ case 64: dr_prediction_z1_64xN_neon(bh, dst, stride, above, dx); break;
+ default: break;
+ }
+}
+
+/* ---------------------P R E D I C T I O N Z 2--------------------------- */
+
+#if !AOM_ARCH_AARCH64
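+// LoadMaskz2[i] has its first 4 * (i + 1) bytes set to 0xff; it is indexed
+// with offset_diff / 4 below to zero out lanes beyond the span of left[]
+// values actually referenced.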
+static DECLARE_ALIGNED(16, uint8_t, LoadMaskz2[4][16]) = {
+ { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
+ 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff }
+};
+#endif // !AOM_ARCH_AARCH64
+
+static AOM_FORCE_INLINE void dr_prediction_z2_Nx4_above_neon(
+ const uint8_t *above, int upsample_above, int dx, int base_x, int y,
+ uint8x8_t *a0_x, uint8x8_t *a1_x, uint16x4_t *shift0) {
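+ // r6 holds {0, 64, 128, 192}, i.e. the column index c << 6 for the four
+ // output pixels.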
+ uint16x4_t r6 = vcreate_u16(0x00C0008000400000);
+ uint16x4_t ydx = vdup_n_u16(y * dx);
+ if (upsample_above) {
+ // Cannot use LD2 here since we only want to load eight bytes, but LD2 can
+ // only load either 16 or 32.
+ uint8x8_t v_tmp = vld1_u8(above + base_x);
+ *a0_x = vuzp_u8(v_tmp, vdup_n_u8(0)).val[0];
+ *a1_x = vuzp_u8(v_tmp, vdup_n_u8(0)).val[1];
+ *shift0 = vand_u16(vsub_u16(r6, ydx), vdup_n_u16(0x1f));
+ } else {
+ *a0_x = load_u8_4x1(above + base_x);
+ *a1_x = load_u8_4x1(above + base_x + 1);
+ *shift0 = vand_u16(vhsub_u16(r6, ydx), vdup_n_u16(0x1f));
+ }
+}
+
+static AOM_FORCE_INLINE void dr_prediction_z2_Nx4_left_neon(
+#if AOM_ARCH_AARCH64
+ uint8x16x2_t left_vals,
+#else
+ const uint8_t *left,
+#endif
+ int upsample_left, int dy, int r, int min_base_y, int frac_bits_y,
+ uint16x4_t *a0_y, uint16x4_t *a1_y, uint16x4_t *shift1) {
+ int16x4_t dy64 = vdup_n_s16(dy);
+ int16x4_t v_1234 = vcreate_s16(0x0004000300020001);
+ int16x4_t v_frac_bits_y = vdup_n_s16(-frac_bits_y);
+ int16x4_t min_base_y64 = vdup_n_s16(min_base_y);
+ int16x4_t v_r6 = vdup_n_s16(r << 6);
+ int16x4_t y_c64 = vmls_s16(v_r6, v_1234, dy64);
+ int16x4_t base_y_c64 = vshl_s16(y_c64, v_frac_bits_y);
+
+ // Values in base_y_c64 range from -2 through 14 inclusive.
+ base_y_c64 = vmax_s16(base_y_c64, min_base_y64);
+
+#if AOM_ARCH_AARCH64
+ uint8x8_t left_idx0 =
+ vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(2))); // [0, 16]
+ uint8x8_t left_idx1 =
+ vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(3))); // [1, 17]
+
+ *a0_y = vreinterpret_u16_u8(vqtbl2_u8(left_vals, left_idx0));
+ *a1_y = vreinterpret_u16_u8(vqtbl2_u8(left_vals, left_idx1));
+#else // !AOM_ARCH_AARCH64
+ DECLARE_ALIGNED(32, int16_t, base_y_c[4]);
+
+ vst1_s16(base_y_c, base_y_c64);
+ uint8x8_t a0_y_u8 = vdup_n_u8(0);
+ a0_y_u8 = vld1_lane_u8(left + base_y_c[0], a0_y_u8, 0);
+ a0_y_u8 = vld1_lane_u8(left + base_y_c[1], a0_y_u8, 2);
+ a0_y_u8 = vld1_lane_u8(left + base_y_c[2], a0_y_u8, 4);
+ a0_y_u8 = vld1_lane_u8(left + base_y_c[3], a0_y_u8, 6);
+
+ base_y_c64 = vadd_s16(base_y_c64, vdup_n_s16(1));
+ vst1_s16(base_y_c, base_y_c64);
+ uint8x8_t a1_y_u8 = vdup_n_u8(0);
+ a1_y_u8 = vld1_lane_u8(left + base_y_c[0], a1_y_u8, 0);
+ a1_y_u8 = vld1_lane_u8(left + base_y_c[1], a1_y_u8, 2);
+ a1_y_u8 = vld1_lane_u8(left + base_y_c[2], a1_y_u8, 4);
+ a1_y_u8 = vld1_lane_u8(left + base_y_c[3], a1_y_u8, 6);
+
+ *a0_y = vreinterpret_u16_u8(a0_y_u8);
+ *a1_y = vreinterpret_u16_u8(a1_y_u8);
+#endif // AOM_ARCH_AARCH64
+
+ if (upsample_left) {
+ *shift1 = vand_u16(vreinterpret_u16_s16(y_c64), vdup_n_u16(0x1f));
+ } else {
+ *shift1 =
+ vand_u16(vshr_n_u16(vreinterpret_u16_s16(y_c64), 1), vdup_n_u16(0x1f));
+ }
+}
+
+static AOM_FORCE_INLINE uint8x8_t dr_prediction_z2_Nx8_above_neon(
+ const uint8_t *above, int upsample_above, int dx, int base_x, int y) {
+ uint16x8_t c1234 = vcombine_u16(vcreate_u16(0x0004000300020001),
+ vcreate_u16(0x0008000700060005));
+ uint16x8_t ydx = vdupq_n_u16(y * dx);
+ uint16x8_t r6 = vshlq_n_u16(vextq_u16(c1234, vdupq_n_u16(0), 2), 6);
+
+ uint16x8_t shift0;
+ uint8x8_t a0_x0;
+ uint8x8_t a1_x0;
+ if (upsample_above) {
+ uint8x8x2_t v_tmp = vld2_u8(above + base_x);
+ a0_x0 = v_tmp.val[0];
+ a1_x0 = v_tmp.val[1];
+ shift0 = vandq_u16(vsubq_u16(r6, ydx), vdupq_n_u16(0x1f));
+ } else {
+ a0_x0 = vld1_u8(above + base_x);
+ a1_x0 = vld1_u8(above + base_x + 1);
+ shift0 = vandq_u16(vhsubq_u16(r6, ydx), vdupq_n_u16(0x1f));
+ }
+
+ uint16x8_t diff0 = vsubl_u8(a1_x0, a0_x0); // a[x+1] - a[x]
+ uint16x8_t a32 =
+ vmlal_u8(vdupq_n_u16(16), a0_x0, vdup_n_u8(32)); // a[x] * 32 + 16
+ uint16x8_t res = vmlaq_u16(a32, diff0, shift0);
+ return vshrn_n_u16(res, 5);
+}
+
+static AOM_FORCE_INLINE uint8x8_t dr_prediction_z2_Nx8_left_neon(
+#if AOM_ARCH_AARCH64
+ uint8x16x3_t left_vals,
+#else
+ const uint8_t *left,
+#endif
+ int upsample_left, int dy, int r, int min_base_y, int frac_bits_y) {
+ int16x8_t v_r6 = vdupq_n_s16(r << 6);
+ int16x8_t dy128 = vdupq_n_s16(dy);
+ int16x8_t v_frac_bits_y = vdupq_n_s16(-frac_bits_y);
+ int16x8_t min_base_y128 = vdupq_n_s16(min_base_y);
+
+ uint16x8_t c1234 = vcombine_u16(vcreate_u16(0x0004000300020001),
+ vcreate_u16(0x0008000700060005));
+ int16x8_t y_c128 = vmlsq_s16(v_r6, vreinterpretq_s16_u16(c1234), dy128);
+ int16x8_t base_y_c128 = vshlq_s16(y_c128, v_frac_bits_y);
+
+ // Values in base_y_c128 range from -2 through 31 inclusive.
+ base_y_c128 = vmaxq_s16(base_y_c128, min_base_y128);
+
+#if AOM_ARCH_AARCH64
+ uint8x16_t left_idx0 =
+ vreinterpretq_u8_s16(vaddq_s16(base_y_c128, vdupq_n_s16(2))); // [0, 33]
+ uint8x16_t left_idx1 =
+ vreinterpretq_u8_s16(vaddq_s16(base_y_c128, vdupq_n_s16(3))); // [1, 34]
+ uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1);
+
+ uint8x16_t a01_x = vqtbl3q_u8(left_vals, left_idx01);
+ uint8x8_t a0_x1 = vget_low_u8(a01_x);
+ uint8x8_t a1_x1 = vget_high_u8(a01_x);
+#else // !AOM_ARCH_AARCH64
+ uint8x8_t a0_x1 = load_u8_gather_s16_x8(left, base_y_c128);
+ uint8x8_t a1_x1 = load_u8_gather_s16_x8(left + 1, base_y_c128);
+#endif // AOM_ARCH_AARCH64
+
+ uint16x8_t shift1;
+ if (upsample_left) {
+ shift1 = vandq_u16(vreinterpretq_u16_s16(y_c128), vdupq_n_u16(0x1f));
+ } else {
+ shift1 = vshrq_n_u16(
+ vandq_u16(vreinterpretq_u16_s16(y_c128), vdupq_n_u16(0x3f)), 1);
+ }
+
+ uint16x8_t diff1 = vsubl_u8(a1_x1, a0_x1);
+ uint16x8_t a32 = vmlal_u8(vdupq_n_u16(16), a0_x1, vdup_n_u8(32));
+ uint16x8_t res = vmlaq_u16(a32, diff1, shift1);
+ return vshrn_n_u16(res, 5);
+}
+
+static AOM_FORCE_INLINE uint8x16_t dr_prediction_z2_NxW_above_neon(
+ const uint8_t *above, int dx, int base_x, int y, int j) {
+ uint16x8x2_t c0123 = { { vcombine_u16(vcreate_u16(0x0003000200010000),
+ vcreate_u16(0x0007000600050004)),
+ vcombine_u16(vcreate_u16(0x000B000A00090008),
+ vcreate_u16(0x000F000E000D000C)) } };
+ uint16x8_t j256 = vdupq_n_u16(j);
+ uint16x8_t ydx = vdupq_n_u16((uint16_t)(y * dx));
+
+ const uint8x16_t a0_x128 = vld1q_u8(above + base_x + j);
+ const uint8x16_t a1_x128 = vld1q_u8(above + base_x + j + 1);
+ uint16x8_t res6_0 = vshlq_n_u16(vaddq_u16(c0123.val[0], j256), 6);
+ uint16x8_t res6_1 = vshlq_n_u16(vaddq_u16(c0123.val[1], j256), 6);
+ uint16x8_t shift0 =
+ vshrq_n_u16(vandq_u16(vsubq_u16(res6_0, ydx), vdupq_n_u16(0x3f)), 1);
+ uint16x8_t shift1 =
+ vshrq_n_u16(vandq_u16(vsubq_u16(res6_1, ydx), vdupq_n_u16(0x3f)), 1);
+ // a[x+1] - a[x]
+ uint16x8_t diff0 = vsubl_u8(vget_low_u8(a1_x128), vget_low_u8(a0_x128));
+ uint16x8_t diff1 = vsubl_u8(vget_high_u8(a1_x128), vget_high_u8(a0_x128));
+ // a[x] * 32 + 16
+ uint16x8_t a32_0 =
+ vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_x128), vdup_n_u8(32));
+ uint16x8_t a32_1 =
+ vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_x128), vdup_n_u8(32));
+ uint16x8_t res0 = vmlaq_u16(a32_0, diff0, shift0);
+ uint16x8_t res1 = vmlaq_u16(a32_1, diff1, shift1);
+ return vcombine_u8(vshrn_n_u16(res0, 5), vshrn_n_u16(res1, 5));
+}
+
+static AOM_FORCE_INLINE uint8x16_t dr_prediction_z2_NxW_left_neon(
+#if AOM_ARCH_AARCH64
+ uint8x16x4_t left_vals0, uint8x16x4_t left_vals1,
+#else
+ const uint8_t *left,
+#endif
+ int dy, int r, int j) {
+ // here upsample_above and upsample_left are 0 by design of
+ // av1_use_intra_edge_upsample
+ const int min_base_y = -1;
+
+ int16x8_t min_base_y256 = vdupq_n_s16(min_base_y);
+ int16x8_t half_min_base_y256 = vdupq_n_s16(min_base_y >> 1);
+ int16x8_t dy256 = vdupq_n_s16(dy);
+ uint16x8_t j256 = vdupq_n_u16(j);
+
+ uint16x8x2_t c0123 = { { vcombine_u16(vcreate_u16(0x0003000200010000),
+ vcreate_u16(0x0007000600050004)),
+ vcombine_u16(vcreate_u16(0x000B000A00090008),
+ vcreate_u16(0x000F000E000D000C)) } };
+ uint16x8x2_t c1234 = { { vaddq_u16(c0123.val[0], vdupq_n_u16(1)),
+ vaddq_u16(c0123.val[1], vdupq_n_u16(1)) } };
+
+ int16x8_t v_r6 = vdupq_n_s16(r << 6);
+
+ int16x8_t c256_0 = vreinterpretq_s16_u16(vaddq_u16(j256, c1234.val[0]));
+ int16x8_t c256_1 = vreinterpretq_s16_u16(vaddq_u16(j256, c1234.val[1]));
+ int16x8_t mul16_lo = vreinterpretq_s16_u16(
+ vminq_u16(vreinterpretq_u16_s16(vmulq_s16(c256_0, dy256)),
+ vreinterpretq_u16_s16(half_min_base_y256)));
+ int16x8_t mul16_hi = vreinterpretq_s16_u16(
+ vminq_u16(vreinterpretq_u16_s16(vmulq_s16(c256_1, dy256)),
+ vreinterpretq_u16_s16(half_min_base_y256)));
+ int16x8_t y_c256_lo = vsubq_s16(v_r6, mul16_lo);
+ int16x8_t y_c256_hi = vsubq_s16(v_r6, mul16_hi);
+
+ int16x8_t base_y_c256_lo = vshrq_n_s16(y_c256_lo, 6);
+ int16x8_t base_y_c256_hi = vshrq_n_s16(y_c256_hi, 6);
+
+ base_y_c256_lo = vmaxq_s16(min_base_y256, base_y_c256_lo);
+ base_y_c256_hi = vmaxq_s16(min_base_y256, base_y_c256_hi);
+
+#if !AOM_ARCH_AARCH64
+ int16_t min_y = vgetq_lane_s16(base_y_c256_hi, 7);
+ int16_t max_y = vgetq_lane_s16(base_y_c256_lo, 0);
+ int16_t offset_diff = max_y - min_y;
+
+ uint8x8_t a0_y0;
+ uint8x8_t a0_y1;
+ uint8x8_t a1_y0;
+ uint8x8_t a1_y1;
+ if (offset_diff < 16) {
+ // Avoid gathers where the data we want is close together in memory.
+ // We don't need this for AArch64 since we can already use TBL to cover the
+ // full range of possible values.
+ assert(offset_diff >= 0);
+ int16x8_t min_y256 = vdupq_lane_s16(vget_high_s16(base_y_c256_hi), 3);
+
+ int16x8x2_t base_y_offset;
+ base_y_offset.val[0] = vsubq_s16(base_y_c256_lo, min_y256);
+ base_y_offset.val[1] = vsubq_s16(base_y_c256_hi, min_y256);
+
+ int8x16_t base_y_offset128 = vcombine_s8(vqmovn_s16(base_y_offset.val[0]),
+ vqmovn_s16(base_y_offset.val[1]));
+
+ uint8x16_t v_loadmaskz2 = vld1q_u8(LoadMaskz2[offset_diff / 4]);
+ uint8x16_t a0_y128 = vld1q_u8(left + min_y);
+ uint8x16_t a1_y128 = vld1q_u8(left + min_y + 1);
+ a0_y128 = vandq_u8(a0_y128, v_loadmaskz2);
+ a1_y128 = vandq_u8(a1_y128, v_loadmaskz2);
+
+ uint8x8_t v_index_low = vget_low_u8(vreinterpretq_u8_s8(base_y_offset128));
+ uint8x8_t v_index_high =
+ vget_high_u8(vreinterpretq_u8_s8(base_y_offset128));
+ uint8x8x2_t v_tmp, v_res;
+ v_tmp.val[0] = vget_low_u8(a0_y128);
+ v_tmp.val[1] = vget_high_u8(a0_y128);
+ v_res.val[0] = vtbl2_u8(v_tmp, v_index_low);
+ v_res.val[1] = vtbl2_u8(v_tmp, v_index_high);
+ a0_y128 = vcombine_u8(v_res.val[0], v_res.val[1]);
+ v_tmp.val[0] = vget_low_u8(a1_y128);
+ v_tmp.val[1] = vget_high_u8(a1_y128);
+ v_res.val[0] = vtbl2_u8(v_tmp, v_index_low);
+ v_res.val[1] = vtbl2_u8(v_tmp, v_index_high);
+ a1_y128 = vcombine_u8(v_res.val[0], v_res.val[1]);
+
+ a0_y0 = vget_low_u8(a0_y128);
+ a0_y1 = vget_high_u8(a0_y128);
+ a1_y0 = vget_low_u8(a1_y128);
+ a1_y1 = vget_high_u8(a1_y128);
+ } else {
+ a0_y0 = load_u8_gather_s16_x8(left, base_y_c256_lo);
+ a0_y1 = load_u8_gather_s16_x8(left, base_y_c256_hi);
+ a1_y0 = load_u8_gather_s16_x8(left + 1, base_y_c256_lo);
+ a1_y1 = load_u8_gather_s16_x8(left + 1, base_y_c256_hi);
+ }
+#else
+ // Values in left_idx{0,1} range from 0 through 63 inclusive.
+ uint8x16_t left_idx0 =
+ vreinterpretq_u8_s16(vaddq_s16(base_y_c256_lo, vdupq_n_s16(1)));
+ uint8x16_t left_idx1 =
+ vreinterpretq_u8_s16(vaddq_s16(base_y_c256_hi, vdupq_n_s16(1)));
+ uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1);
+
+ uint8x16_t a0_y01 = vqtbl4q_u8(left_vals0, left_idx01);
+ uint8x16_t a1_y01 = vqtbl4q_u8(left_vals1, left_idx01);
+
+ uint8x8_t a0_y0 = vget_low_u8(a0_y01);
+ uint8x8_t a0_y1 = vget_high_u8(a0_y01);
+ uint8x8_t a1_y0 = vget_low_u8(a1_y01);
+ uint8x8_t a1_y1 = vget_high_u8(a1_y01);
+#endif // !AOM_ARCH_AARCH64
+
+ uint16x8_t shifty_lo = vshrq_n_u16(
+ vandq_u16(vreinterpretq_u16_s16(y_c256_lo), vdupq_n_u16(0x3f)), 1);
+ uint16x8_t shifty_hi = vshrq_n_u16(
+ vandq_u16(vreinterpretq_u16_s16(y_c256_hi), vdupq_n_u16(0x3f)), 1);
+
+ // a[x+1] - a[x]
+ uint16x8_t diff_lo = vsubl_u8(a1_y0, a0_y0);
+ uint16x8_t diff_hi = vsubl_u8(a1_y1, a0_y1);
+ // a[x] * 32 + 16
+ uint16x8_t a32_lo = vmlal_u8(vdupq_n_u16(16), a0_y0, vdup_n_u8(32));
+ uint16x8_t a32_hi = vmlal_u8(vdupq_n_u16(16), a0_y1, vdup_n_u8(32));
+
+ uint16x8_t res0 = vmlaq_u16(a32_lo, diff_lo, shifty_lo);
+ uint16x8_t res1 = vmlaq_u16(a32_hi, diff_hi, shifty_hi);
+
+ return vcombine_u8(vshrn_n_u16(res0, 5), vshrn_n_u16(res1, 5));
+}
+
+static void dr_prediction_z2_Nx4_neon(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int upsample_left,
+ int dx, int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+
+#if AOM_ARCH_AARCH64
+ // Use ext rather than loading left + 14 directly to avoid over-read.
+ const uint8x16_t left_m2 = vld1q_u8(left - 2);
+ const uint8x16_t left_0 = vld1q_u8(left);
+ const uint8x16_t left_14 = vextq_u8(left_0, left_0, 14);
+ const uint8x16x2_t left_vals = { { left_m2, left_14 } };
+#define LEFT left_vals
+#else // !AOM_ARCH_AARCH64
+#define LEFT left
+#endif // AOM_ARCH_AARCH64
+
+ for (int r = 0; r < N; r++) {
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ const int base_min_diff =
+ (min_base_x - ((-y * dx) >> frac_bits_x) + upsample_above) >>
+ upsample_above;
+
+ if (base_min_diff <= 0) {
+ uint8x8_t a0_x_u8, a1_x_u8;
+ uint16x4_t shift0;
+ dr_prediction_z2_Nx4_above_neon(above, upsample_above, dx, base_x, y,
+ &a0_x_u8, &a1_x_u8, &shift0);
+ uint8x8_t a0_x = a0_x_u8;
+ uint8x8_t a1_x = a1_x_u8;
+
+ uint16x8_t diff = vsubl_u8(a1_x, a0_x); // a[x+1] - a[x]
+ uint16x8_t a32 =
+ vmlal_u8(vdupq_n_u16(16), a0_x, vdup_n_u8(32)); // a[x] * 32 + 16
+ uint16x8_t res =
+ vmlaq_u16(a32, diff, vcombine_u16(shift0, vdup_n_u16(0)));
+ uint8x8_t resx = vshrn_n_u16(res, 5);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(resx), 0);
+ } else if (base_min_diff < 4) {
+ uint8x8_t a0_x_u8, a1_x_u8;
+ uint16x4_t shift0;
+ dr_prediction_z2_Nx4_above_neon(above, upsample_above, dx, base_x, y,
+ &a0_x_u8, &a1_x_u8, &shift0);
+ uint16x8_t a0_x = vmovl_u8(a0_x_u8);
+ uint16x8_t a1_x = vmovl_u8(a1_x_u8);
+
+ uint16x4_t a0_y;
+ uint16x4_t a1_y;
+ uint16x4_t shift1;
+ dr_prediction_z2_Nx4_left_neon(LEFT, upsample_left, dy, r, min_base_y,
+ frac_bits_y, &a0_y, &a1_y, &shift1);
+ a0_x = vcombine_u16(vget_low_u16(a0_x), a0_y);
+ a1_x = vcombine_u16(vget_low_u16(a1_x), a1_y);
+
+ uint16x8_t shift = vcombine_u16(shift0, shift1);
+ uint16x8_t diff = vsubq_u16(a1_x, a0_x); // a[x+1] - a[x]
+ uint16x8_t a32 =
+ vmlaq_n_u16(vdupq_n_u16(16), a0_x, 32); // a[x] * 32 + 16
+ uint16x8_t res = vmlaq_u16(a32, diff, shift);
+ uint8x8_t resx = vshrn_n_u16(res, 5);
+ uint8x8_t resy = vext_u8(resx, vdup_n_u8(0), 4);
+
+ uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]);
+ uint8x8_t v_resxy = vbsl_u8(mask, resy, resx);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(v_resxy), 0);
+ } else {
+ uint16x4_t a0_y, a1_y;
+ uint16x4_t shift1;
+ dr_prediction_z2_Nx4_left_neon(LEFT, upsample_left, dy, r, min_base_y,
+ frac_bits_y, &a0_y, &a1_y, &shift1);
+ uint16x4_t diff = vsub_u16(a1_y, a0_y); // a[x+1] - a[x]
+ uint16x4_t a32 = vmla_n_u16(vdup_n_u16(16), a0_y, 32); // a[x] * 32 + 16
+ uint16x4_t res = vmla_u16(a32, diff, shift1);
+ uint8x8_t resy = vshrn_n_u16(vcombine_u16(res, vdup_n_u16(0)), 5);
+
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(resy), 0);
+ }
+
+ dst += stride;
+ }
+#undef LEFT
+}
+
+static void dr_prediction_z2_Nx8_neon(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int upsample_left,
+ int dx, int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+
+#if AOM_ARCH_AARCH64
+ // Use ext rather than loading left + 30 directly to avoid over-read.
+ const uint8x16_t left_m2 = vld1q_u8(left - 2);
+ const uint8x16_t left_0 = vld1q_u8(left + 0);
+ const uint8x16_t left_16 = vld1q_u8(left + 16);
+ const uint8x16_t left_14 = vextq_u8(left_0, left_16, 14);
+ const uint8x16_t left_30 = vextq_u8(left_16, left_16, 14);
+ const uint8x16x3_t left_vals = { { left_m2, left_14, left_30 } };
+#define LEFT left_vals
+#else // !AOM_ARCH_AARCH64
+#define LEFT left
+#endif // AOM_ARCH_AARCH64
+
+ for (int r = 0; r < N; r++) {
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ int base_min_diff =
+ (min_base_x - base_x + upsample_above) >> upsample_above;
+
+ if (base_min_diff <= 0) {
+ uint8x8_t resx =
+ dr_prediction_z2_Nx8_above_neon(above, upsample_above, dx, base_x, y);
+ vst1_u8(dst, resx);
+ } else if (base_min_diff < 8) {
+ uint8x8_t resx =
+ dr_prediction_z2_Nx8_above_neon(above, upsample_above, dx, base_x, y);
+ uint8x8_t resy = dr_prediction_z2_Nx8_left_neon(
+ LEFT, upsample_left, dy, r, min_base_y, frac_bits_y);
+ uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]);
+ uint8x8_t resxy = vbsl_u8(mask, resy, resx);
+ vst1_u8(dst, resxy);
+ } else {
+ uint8x8_t resy = dr_prediction_z2_Nx8_left_neon(
+ LEFT, upsample_left, dy, r, min_base_y, frac_bits_y);
+ vst1_u8(dst, resy);
+ }
+
+ dst += stride;
+ }
+#undef LEFT
+}
+
+static void dr_prediction_z2_HxW_neon(int H, int W, uint8_t *dst,
+ ptrdiff_t stride, const uint8_t *above,
+ const uint8_t *left, int dx, int dy) {
+ // here upsample_above and upsample_left are 0 by design of
+ // av1_use_intra_edge_upsample
+ const int min_base_x = -1;
+
+#if AOM_ARCH_AARCH64
+ const uint8x16_t left_m1 = vld1q_u8(left - 1);
+ const uint8x16_t left_0 = vld1q_u8(left + 0);
+ const uint8x16_t left_16 = vld1q_u8(left + 16);
+ const uint8x16_t left_32 = vld1q_u8(left + 32);
+ const uint8x16_t left_48 = vld1q_u8(left + 48);
+ const uint8x16_t left_15 = vextq_u8(left_0, left_16, 15);
+ const uint8x16_t left_31 = vextq_u8(left_16, left_32, 15);
+ const uint8x16_t left_47 = vextq_u8(left_32, left_48, 15);
+ const uint8x16x4_t left_vals0 = { { left_m1, left_15, left_31, left_47 } };
+ const uint8x16x4_t left_vals1 = { { left_0, left_16, left_32, left_48 } };
+#define LEFT left_vals0, left_vals1
+#else // !AOM_ARCH_AARCH64
+#define LEFT left
+#endif // AOM_ARCH_AARCH64
+
+ for (int r = 0; r < H; r++) {
+ int y = r + 1;
+ int base_x = (-y * dx) >> 6;
+ for (int j = 0; j < W; j += 16) {
+ const int base_min_diff = min_base_x - base_x - j;
+
+ if (base_min_diff <= 0) {
+ uint8x16_t resx =
+ dr_prediction_z2_NxW_above_neon(above, dx, base_x, y, j);
+ vst1q_u8(dst + j, resx);
+ } else if (base_min_diff < 16) {
+ uint8x16_t resx =
+ dr_prediction_z2_NxW_above_neon(above, dx, base_x, y, j);
+ uint8x16_t resy = dr_prediction_z2_NxW_left_neon(LEFT, dy, r, j);
+ uint8x16_t mask = vld1q_u8(BaseMask[base_min_diff]);
+ uint8x16_t resxy = vbslq_u8(mask, resy, resx);
+ vst1q_u8(dst + j, resxy);
+ } else {
+ uint8x16_t resy = dr_prediction_z2_NxW_left_neon(LEFT, dy, r, j);
+ vst1q_u8(dst + j, resy);
+ }
+ } // for j
+ dst += stride;
+ }
+#undef LEFT
+}
+
+// Directional prediction, zone 2: 90 < angle < 180
+void av1_dr_prediction_z2_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int upsample_left, int dx,
+ int dy) {
+ assert(dx > 0);
+ assert(dy > 0);
+
+ switch (bw) {
+ case 4:
+ dr_prediction_z2_Nx4_neon(bh, dst, stride, above, left, upsample_above,
+ upsample_left, dx, dy);
+ break;
+ case 8:
+ dr_prediction_z2_Nx8_neon(bh, dst, stride, above, left, upsample_above,
+ upsample_left, dx, dy);
+ break;
+ default:
+ dr_prediction_z2_HxW_neon(bh, bw, dst, stride, above, left, dx, dy);
+ break;
+ }
+}
+
+/* ---------------------P R E D I C T I O N Z 3--------------------------- */
+
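+// Zone 3 (180 < angle < 270) reuses the zone 1 kernels: the prediction is
+// computed along `left` and the result is transposed into the destination.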
+static AOM_FORCE_INLINE void z3_transpose_arrays_u8_16x4(const uint8x16_t *x,
+ uint8x16x2_t *d) {
+ uint8x16x2_t w0 = vzipq_u8(x[0], x[1]);
+ uint8x16x2_t w1 = vzipq_u8(x[2], x[3]);
+
+ d[0] = aom_reinterpretq_u8_u16_x2(vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
+ vreinterpretq_u16_u8(w1.val[0])));
+ d[1] = aom_reinterpretq_u8_u16_x2(vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
+ vreinterpretq_u16_u8(w1.val[1])));
+}
+
+static AOM_FORCE_INLINE void z3_transpose_arrays_u8_4x4(const uint8x8_t *x,
+ uint8x8x2_t *d) {
+ uint8x8x2_t w0 = vzip_u8(x[0], x[1]);
+ uint8x8x2_t w1 = vzip_u8(x[2], x[3]);
+
+ *d = aom_reinterpret_u8_u16_x2(
+ vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0])));
+}
+
+static AOM_FORCE_INLINE void z3_transpose_arrays_u8_8x4(const uint8x8_t *x,
+ uint8x8x2_t *d) {
+ uint8x8x2_t w0 = vzip_u8(x[0], x[1]);
+ uint8x8x2_t w1 = vzip_u8(x[2], x[3]);
+
+ d[0] = aom_reinterpret_u8_u16_x2(
+ vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0])));
+ d[1] = aom_reinterpret_u8_u16_x2(
+ vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1])));
+}
+
+static void z3_transpose_arrays_u8_16x16(const uint8_t *src, ptrdiff_t pitchSrc,
+ uint8_t *dst, ptrdiff_t pitchDst) {
+ // The same as the normal transposes in transpose_neon.h, but with a stride
+ // between consecutive vectors of elements.
+ uint8x16_t r[16];
+ uint8x16_t d[16];
+ for (int i = 0; i < 16; i++) {
+ r[i] = vld1q_u8(src + i * pitchSrc);
+ }
+ transpose_arrays_u8_16x16(r, d);
+ for (int i = 0; i < 16; i++) {
+ vst1q_u8(dst + i * pitchDst, d[i]);
+ }
+}
+
+static void z3_transpose_arrays_u8_16nx16n(const uint8_t *src,
+ ptrdiff_t pitchSrc, uint8_t *dst,
+ ptrdiff_t pitchDst, int width,
+ int height) {
+ for (int j = 0; j < height; j += 16) {
+ for (int i = 0; i < width; i += 16) {
+ z3_transpose_arrays_u8_16x16(src + i * pitchSrc + j, pitchSrc,
+ dst + j * pitchDst + i, pitchDst);
+ }
+ }
+}
+
+static void dr_prediction_z3_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8x8_t dstvec[4];
+ uint8x8x2_t dest;
+
+ dr_prediction_z1_HxW_internal_neon_64(4, 4, dstvec, left, upsample_left, dy);
+ z3_transpose_arrays_u8_4x4(dstvec, &dest);
+ store_u8x4_strided_x2(dst + stride * 0, stride, dest.val[0]);
+ store_u8x4_strided_x2(dst + stride * 2, stride, dest.val[1]);
+}
+
+static void dr_prediction_z3_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8x8_t dstvec[8];
+ uint8x8_t d[8];
+
+ dr_prediction_z1_HxW_internal_neon_64(8, 8, dstvec, left, upsample_left, dy);
+ transpose_arrays_u8_8x8(dstvec, d);
+ store_u8_8x8(dst, stride, d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]);
+}
+
+static void dr_prediction_z3_4x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8x8_t dstvec[4];
+ uint8x8x2_t d[2];
+
+ dr_prediction_z1_HxW_internal_neon_64(8, 4, dstvec, left, upsample_left, dy);
+ z3_transpose_arrays_u8_8x4(dstvec, d);
+ store_u8x4_strided_x2(dst + stride * 0, stride, d[0].val[0]);
+ store_u8x4_strided_x2(dst + stride * 2, stride, d[0].val[1]);
+ store_u8x4_strided_x2(dst + stride * 4, stride, d[1].val[0]);
+ store_u8x4_strided_x2(dst + stride * 6, stride, d[1].val[1]);
+}
+
+static void dr_prediction_z3_8x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8x8_t dstvec[8];
+ uint8x8_t d[8];
+
+ dr_prediction_z1_HxW_internal_neon_64(4, 8, dstvec, left, upsample_left, dy);
+ transpose_arrays_u8_8x8(dstvec, d);
+ store_u8_8x4(dst, stride, d[0], d[1], d[2], d[3]);
+}
+
+static void dr_prediction_z3_8x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8x16_t dstvec[8];
+ uint8x8_t d[16];
+
+ dr_prediction_z1_HxW_internal_neon(16, 8, dstvec, left, upsample_left, dy);
+ transpose_arrays_u8_16x8(dstvec, d);
+ for (int i = 0; i < 16; i++) {
+ vst1_u8(dst + i * stride, d[i]);
+ }
+}
+
+static void dr_prediction_z3_16x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8x8_t dstvec[16];
+ uint8x16_t d[8];
+
+ dr_prediction_z1_HxW_internal_neon_64(8, 16, dstvec, left, upsample_left, dy);
+ transpose_arrays_u8_8x16(dstvec, d);
+ for (int i = 0; i < 8; i++) {
+ vst1q_u8(dst + i * stride, d[i]);
+ }
+}
+
+static void dr_prediction_z3_4x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8x16_t dstvec[4];
+ uint8x16x2_t d[2];
+
+ dr_prediction_z1_HxW_internal_neon(16, 4, dstvec, left, upsample_left, dy);
+ z3_transpose_arrays_u8_16x4(dstvec, d);
+ store_u8x4_strided_x4(dst + stride * 0, stride, d[0].val[0]);
+ store_u8x4_strided_x4(dst + stride * 4, stride, d[0].val[1]);
+ store_u8x4_strided_x4(dst + stride * 8, stride, d[1].val[0]);
+ store_u8x4_strided_x4(dst + stride * 12, stride, d[1].val[1]);
+}
+
+static void dr_prediction_z3_16x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8x8_t dstvec[16];
+ uint8x16_t d[8];
+
+ dr_prediction_z1_HxW_internal_neon_64(4, 16, dstvec, left, upsample_left, dy);
+ transpose_arrays_u8_8x16(dstvec, d);
+ for (int i = 0; i < 4; i++) {
+ vst1q_u8(dst + i * stride, d[i]);
+ }
+}
+
+static void dr_prediction_z3_8x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ (void)upsample_left;
+ uint8x16x2_t dstvec[16];
+ uint8x16_t d[32];
+ uint8x16_t v_zero = vdupq_n_u8(0);
+
+ dr_prediction_z1_32xN_internal_neon(8, dstvec, left, dy);
+ for (int i = 8; i < 16; i++) {
+ dstvec[i].val[0] = v_zero;
+ dstvec[i].val[1] = v_zero;
+ }
+ transpose_arrays_u8_32x16(dstvec, d);
+ for (int i = 0; i < 32; i++) {
+ vst1_u8(dst + i * stride, vget_low_u8(d[i]));
+ }
+}
+
+static void dr_prediction_z3_32x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8x8_t dstvec[32];
+ uint8x16_t d[16];
+
+ dr_prediction_z1_HxW_internal_neon_64(8, 32, dstvec, left, upsample_left, dy);
+ transpose_arrays_u8_8x16(dstvec, d);
+ transpose_arrays_u8_8x16(dstvec + 16, d + 8);
+ for (int i = 0; i < 8; i++) {
+ vst1q_u8(dst + i * stride, d[i]);
+ vst1q_u8(dst + i * stride + 16, d[i + 8]);
+ }
+}
+
+static void dr_prediction_z3_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8x16_t dstvec[16];
+ uint8x16_t d[16];
+
+ dr_prediction_z1_HxW_internal_neon(16, 16, dstvec, left, upsample_left, dy);
+ transpose_arrays_u8_16x16(dstvec, d);
+ for (int i = 0; i < 16; i++) {
+ vst1q_u8(dst + i * stride, d[i]);
+ }
+}
+
+static void dr_prediction_z3_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ (void)upsample_left;
+ uint8x16x2_t dstvec[32];
+ uint8x16_t d[64];
+
+ dr_prediction_z1_32xN_internal_neon(32, dstvec, left, dy);
+ transpose_arrays_u8_32x16(dstvec, d);
+ transpose_arrays_u8_32x16(dstvec + 16, d + 32);
+ for (int i = 0; i < 32; i++) {
+ vst1q_u8(dst + i * stride, d[i]);
+ vst1q_u8(dst + i * stride + 16, d[i + 32]);
+ }
+}
+
+static void dr_prediction_z3_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ (void)upsample_left;
+ DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);
+
+ dr_prediction_z1_64xN_neon(64, dstT, 64, left, dy);
+ z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 64, 64);
+}
+
+static void dr_prediction_z3_16x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ (void)upsample_left;
+ uint8x16x2_t dstvec[16];
+ uint8x16_t d[32];
+
+ dr_prediction_z1_32xN_internal_neon(16, dstvec, left, dy);
+ transpose_arrays_u8_32x16(dstvec, d);
+ for (int i = 0; i < 16; i++) {
+ vst1q_u8(dst + 2 * i * stride, d[2 * i + 0]);
+ vst1q_u8(dst + (2 * i + 1) * stride, d[2 * i + 1]);
+ }
+}
+
+static void dr_prediction_z3_32x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8x16_t dstvec[32];
+
+ dr_prediction_z1_HxW_internal_neon(16, 32, dstvec, left, upsample_left, dy);
+ for (int i = 0; i < 32; i += 16) {
+ uint8x16_t d[16];
+ transpose_arrays_u8_16x16(dstvec + i, d);
+ for (int j = 0; j < 16; j++) {
+ vst1q_u8(dst + j * stride + i, d[j]);
+ }
+ }
+}
+
+static void dr_prediction_z3_32x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ (void)upsample_left;
+ uint8_t dstT[64 * 32];
+
+ dr_prediction_z1_64xN_neon(32, dstT, 64, left, dy);
+ z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 32, 64);
+}
+
+static void dr_prediction_z3_64x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ (void)upsample_left;
+ uint8_t dstT[32 * 64];
+
+ dr_prediction_z1_32xN_neon(64, dstT, 32, left, dy);
+ z3_transpose_arrays_u8_16nx16n(dstT, 32, dst, stride, 64, 32);
+}
+
+static void dr_prediction_z3_16x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ (void)upsample_left;
+ uint8_t dstT[64 * 16];
+
+ dr_prediction_z1_64xN_neon(16, dstT, 64, left, dy);
+ z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 16, 64);
+}
+
+static void dr_prediction_z3_64x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8x16_t dstvec[64];
+
+ dr_prediction_z1_HxW_internal_neon(16, 64, dstvec, left, upsample_left, dy);
+ for (int i = 0; i < 64; i += 16) {
+ uint8x16_t d[16];
+ transpose_arrays_u8_16x16(dstvec + i, d);
+ for (int j = 0; j < 16; ++j) {
+ vst1q_u8(dst + j * stride + i, d[j]);
+ }
+ }
+}
+
+typedef void (*dr_prediction_z3_fn)(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy);
+
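+// Lookup table of block-size specializations, indexed by
+// [get_msb(bw)][get_msb(bh)]. NULL entries correspond to block shapes that
+// are never used with this predictor.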
+static dr_prediction_z3_fn dr_prediction_z3_arr[7][7] = {
+ { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
+ { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
+ { NULL, NULL, dr_prediction_z3_4x4_neon, dr_prediction_z3_4x8_neon,
+ dr_prediction_z3_4x16_neon, NULL, NULL },
+ { NULL, NULL, dr_prediction_z3_8x4_neon, dr_prediction_z3_8x8_neon,
+ dr_prediction_z3_8x16_neon, dr_prediction_z3_8x32_neon, NULL },
+ { NULL, NULL, dr_prediction_z3_16x4_neon, dr_prediction_z3_16x8_neon,
+ dr_prediction_z3_16x16_neon, dr_prediction_z3_16x32_neon,
+ dr_prediction_z3_16x64_neon },
+ { NULL, NULL, NULL, dr_prediction_z3_32x8_neon, dr_prediction_z3_32x16_neon,
+ dr_prediction_z3_32x32_neon, dr_prediction_z3_32x64_neon },
+ { NULL, NULL, NULL, NULL, dr_prediction_z3_64x16_neon,
+ dr_prediction_z3_64x32_neon, dr_prediction_z3_64x64_neon },
+};
+
+void av1_dr_prediction_z3_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_left, int dx, int dy) {
+ (void)above;
+ (void)dx;
+ assert(dx == 1);
+ assert(dy > 0);
+
+ dr_prediction_z3_fn f = dr_prediction_z3_arr[get_msb(bw)][get_msb(bh)];
+ assert(f != NULL);
+ f(dst, stride, left, upsample_left, dy);
+}
+
+// -----------------------------------------------------------------------------
+// SMOOTH_PRED
+
+// 256 - v = vneg_s8(v): two's-complement negation modulo 256.
+static INLINE uint8x8_t negate_s8(const uint8x8_t v) {
+ return vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(v)));
+}
+
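+// SMOOTH_PRED blends four reference samples per pixel: w_y*top[x] +
+// (256 - w_y)*bottom_left is combined with w_x*left[y] + (256 - w_x)*top_right
+// using a halving add, then narrowed back to 8 bits with rounding.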
+static void smooth_4xh_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *const top_row,
+ const uint8_t *const left_column,
+ const int height) {
+ const uint8_t top_right = top_row[3];
+ const uint8_t bottom_left = left_column[height - 1];
+ const uint8_t *const weights_y = smooth_weights + height - 4;
+
+ uint8x8_t top_v = load_u8_4x1(top_row);
+ const uint8x8_t top_right_v = vdup_n_u8(top_right);
+ const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
+ uint8x8_t weights_x_v = load_u8_4x1(smooth_weights);
+ const uint8x8_t scaled_weights_x = negate_s8(weights_x_v);
+ const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
+
+ assert(height > 0);
+ int y = 0;
+ do {
+ const uint8x8_t left_v = vdup_n_u8(left_column[y]);
+ const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
+ const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);
+ const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
+ const uint16x8_t weighted_top_bl =
+ vmlal_u8(weighted_bl, weights_y_v, top_v);
+ const uint16x8_t weighted_left_tr =
+ vmlal_u8(weighted_tr, weights_x_v, left_v);
+ // Maximum value of each parameter: 0xFF00
+ const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
+ const uint8x8_t result = vrshrn_n_u16(avg, SMOOTH_WEIGHT_LOG2_SCALE);
+
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(result), 0);
+ dst += stride;
+ } while (++y != height);
+}
+
+static INLINE uint8x8_t calculate_pred(const uint16x8_t weighted_top_bl,
+ const uint16x8_t weighted_left_tr) {
+ // Maximum value of each parameter: 0xFF00
+ const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
+ return vrshrn_n_u16(avg, SMOOTH_WEIGHT_LOG2_SCALE);
+}
+
+static INLINE uint8x8_t calculate_weights_and_pred(
+ const uint8x8_t top, const uint8x8_t left, const uint16x8_t weighted_tr,
+ const uint8x8_t bottom_left, const uint8x8_t weights_x,
+ const uint8x8_t scaled_weights_y, const uint8x8_t weights_y) {
+ const uint16x8_t weighted_top = vmull_u8(weights_y, top);
+ const uint16x8_t weighted_top_bl =
+ vmlal_u8(weighted_top, scaled_weights_y, bottom_left);
+ const uint16x8_t weighted_left_tr = vmlal_u8(weighted_tr, weights_x, left);
+ return calculate_pred(weighted_top_bl, weighted_left_tr);
+}
+
+static void smooth_8xh_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *const top_row,
+ const uint8_t *const left_column,
+ const int height) {
+ const uint8_t top_right = top_row[7];
+ const uint8_t bottom_left = left_column[height - 1];
+ const uint8_t *const weights_y = smooth_weights + height - 4;
+
+ const uint8x8_t top_v = vld1_u8(top_row);
+ const uint8x8_t top_right_v = vdup_n_u8(top_right);
+ const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
+ const uint8x8_t weights_x_v = vld1_u8(smooth_weights + 4);
+ const uint8x8_t scaled_weights_x = negate_s8(weights_x_v);
+ const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);
+
+ assert(height > 0);
+ int y = 0;
+ do {
+ const uint8x8_t left_v = vdup_n_u8(left_column[y]);
+ const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
+ const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);
+ const uint8x8_t result =
+ calculate_weights_and_pred(top_v, left_v, weighted_tr, bottom_left_v,
+ weights_x_v, scaled_weights_y, weights_y_v);
+
+ vst1_u8(dst, result);
+ dst += stride;
+ } while (++y != height);
+}
+
+#define SMOOTH_NXM(W, H) \
+ void aom_smooth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t y_stride, \
+ const uint8_t *above, \
+ const uint8_t *left) { \
+ smooth_##W##xh_neon(dst, y_stride, above, left, H); \
+ }
+
+SMOOTH_NXM(4, 4)
+SMOOTH_NXM(4, 8)
+SMOOTH_NXM(8, 4)
+SMOOTH_NXM(8, 8)
+SMOOTH_NXM(4, 16)
+SMOOTH_NXM(8, 16)
+SMOOTH_NXM(8, 32)
+
+#undef SMOOTH_NXM
+
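+// 16-wide helper for the SMOOTH predictor: the low and high halves of the top
+// and weight vectors are processed separately and recombined into a single
+// uint8x16_t result.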
+static INLINE uint8x16_t calculate_weights_and_predq(
+ const uint8x16_t top, const uint8x8_t left, const uint8x8_t top_right,
+ const uint8x8_t weights_y, const uint8x16_t weights_x,
+ const uint8x16_t scaled_weights_x, const uint16x8_t weighted_bl) {
+ const uint16x8_t weighted_top_bl_low =
+ vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
+ const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
+ const uint16x8_t weighted_left_tr_low =
+ vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right);
+ const uint8x8_t result_low =
+ calculate_pred(weighted_top_bl_low, weighted_left_tr_low);
+
+ const uint16x8_t weighted_top_bl_high =
+ vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
+ const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
+ const uint16x8_t weighted_left_tr_high =
+ vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right);
+ const uint8x8_t result_high =
+ calculate_pred(weighted_top_bl_high, weighted_left_tr_high);
+
+ return vcombine_u8(result_low, result_high);
+}
+
+// 256 - v = vnegq_s8(v): two's-complement negation modulo 256.
+static INLINE uint8x16_t negate_s8q(const uint8x16_t v) {
+ return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(v)));
+}
+
+// For width 16 and above.
+#define SMOOTH_PREDICTOR(W) \
+ static void smooth_##W##xh_neon( \
+ uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \
+ const uint8_t *const left_column, const int height) { \
+ const uint8_t top_right = top_row[(W)-1]; \
+ const uint8_t bottom_left = left_column[height - 1]; \
+ const uint8_t *const weights_y = smooth_weights + height - 4; \
+ \
+ uint8x16_t top_v[4]; \
+ top_v[0] = vld1q_u8(top_row); \
+ if ((W) > 16) { \
+ top_v[1] = vld1q_u8(top_row + 16); \
+ if ((W) == 64) { \
+ top_v[2] = vld1q_u8(top_row + 32); \
+ top_v[3] = vld1q_u8(top_row + 48); \
+ } \
+ } \
+ \
+ const uint8x8_t top_right_v = vdup_n_u8(top_right); \
+ const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); \
+ \
+ uint8x16_t weights_x_v[4]; \
+ weights_x_v[0] = vld1q_u8(smooth_weights + (W)-4); \
+ if ((W) > 16) { \
+ weights_x_v[1] = vld1q_u8(smooth_weights + (W) + 16 - 4); \
+ if ((W) == 64) { \
+ weights_x_v[2] = vld1q_u8(smooth_weights + (W) + 32 - 4); \
+ weights_x_v[3] = vld1q_u8(smooth_weights + (W) + 48 - 4); \
+ } \
+ } \
+ \
+ uint8x16_t scaled_weights_x[4]; \
+ scaled_weights_x[0] = negate_s8q(weights_x_v[0]); \
+ if ((W) > 16) { \
+ scaled_weights_x[1] = negate_s8q(weights_x_v[1]); \
+ if ((W) == 64) { \
+ scaled_weights_x[2] = negate_s8q(weights_x_v[2]); \
+ scaled_weights_x[3] = negate_s8q(weights_x_v[3]); \
+ } \
+ } \
+ \
+ for (int y = 0; y < height; ++y) { \
+ const uint8x8_t left_v = vdup_n_u8(left_column[y]); \
+ const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); \
+ const uint8x8_t scaled_weights_y = negate_s8(weights_y_v); \
+ const uint16x8_t weighted_bl = \
+ vmull_u8(scaled_weights_y, bottom_left_v); \
+ \
+ vst1q_u8(dst, calculate_weights_and_predq( \
+ top_v[0], left_v, top_right_v, weights_y_v, \
+ weights_x_v[0], scaled_weights_x[0], weighted_bl)); \
+ \
+ if ((W) > 16) { \
+ vst1q_u8(dst + 16, \
+ calculate_weights_and_predq( \
+ top_v[1], left_v, top_right_v, weights_y_v, \
+ weights_x_v[1], scaled_weights_x[1], weighted_bl)); \
+ if ((W) == 64) { \
+ vst1q_u8(dst + 32, \
+ calculate_weights_and_predq( \
+ top_v[2], left_v, top_right_v, weights_y_v, \
+ weights_x_v[2], scaled_weights_x[2], weighted_bl)); \
+ vst1q_u8(dst + 48, \
+ calculate_weights_and_predq( \
+ top_v[3], left_v, top_right_v, weights_y_v, \
+ weights_x_v[3], scaled_weights_x[3], weighted_bl)); \
+ } \
+ } \
+ \
+ dst += stride; \
+ } \
+ }
+
+SMOOTH_PREDICTOR(16)
+SMOOTH_PREDICTOR(32)
+SMOOTH_PREDICTOR(64)
+
+#undef SMOOTH_PREDICTOR
+
+#define SMOOTH_NXM_WIDE(W, H) \
+ void aom_smooth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t y_stride, \
+ const uint8_t *above, \
+ const uint8_t *left) { \
+ smooth_##W##xh_neon(dst, y_stride, above, left, H); \
+ }
+
+SMOOTH_NXM_WIDE(16, 4)
+SMOOTH_NXM_WIDE(16, 8)
+SMOOTH_NXM_WIDE(16, 16)
+SMOOTH_NXM_WIDE(16, 32)
+SMOOTH_NXM_WIDE(16, 64)
+SMOOTH_NXM_WIDE(32, 8)
+SMOOTH_NXM_WIDE(32, 16)
+SMOOTH_NXM_WIDE(32, 32)
+SMOOTH_NXM_WIDE(32, 64)
+SMOOTH_NXM_WIDE(64, 16)
+SMOOTH_NXM_WIDE(64, 32)
+SMOOTH_NXM_WIDE(64, 64)
+
+#undef SMOOTH_NXM_WIDE
+
+// -----------------------------------------------------------------------------
+// SMOOTH_V_PRED
+
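+// SMOOTH_V_PRED uses only the vertical weights: each pixel is
+// w_y*top[x] + (256 - w_y)*bottom_left, rounded back to 8 bits.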
+// For widths 4 and 8.
+#define SMOOTH_V_PREDICTOR(W) \
+ static void smooth_v_##W##xh_neon( \
+ uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \
+ const uint8_t *const left_column, const int height) { \
+ const uint8_t bottom_left = left_column[height - 1]; \
+ const uint8_t *const weights_y = smooth_weights + height - 4; \
+ \
+ uint8x8_t top_v; \
+ if ((W) == 4) { \
+ top_v = load_u8_4x1(top_row); \
+ } else { /* width == 8 */ \
+ top_v = vld1_u8(top_row); \
+ } \
+ \
+ const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); \
+ \
+ assert(height > 0); \
+ int y = 0; \
+ do { \
+ const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); \
+ const uint8x8_t scaled_weights_y = negate_s8(weights_y_v); \
+ \
+ const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v); \
+ const uint16x8_t weighted_top_bl = \
+ vmlal_u8(weighted_top, scaled_weights_y, bottom_left_v); \
+ const uint8x8_t pred = \
+ vrshrn_n_u16(weighted_top_bl, SMOOTH_WEIGHT_LOG2_SCALE); \
+ \
+ if ((W) == 4) { \
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(pred), 0); \
+ } else { /* width == 8 */ \
+ vst1_u8(dst, pred); \
+ } \
+ dst += stride; \
+ } while (++y != height); \
+ }
+
+SMOOTH_V_PREDICTOR(4)
+SMOOTH_V_PREDICTOR(8)
+
+#undef SMOOTH_V_PREDICTOR
+
+#define SMOOTH_V_NXM(W, H) \
+ void aom_smooth_v_predictor_##W##x##H##_neon( \
+ uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
+ const uint8_t *left) { \
+ smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \
+ }
+
+SMOOTH_V_NXM(4, 4)
+SMOOTH_V_NXM(4, 8)
+SMOOTH_V_NXM(4, 16)
+SMOOTH_V_NXM(8, 4)
+SMOOTH_V_NXM(8, 8)
+SMOOTH_V_NXM(8, 16)
+SMOOTH_V_NXM(8, 32)
+
+#undef SMOOTH_V_NXM
+
+static INLINE uint8x16_t calculate_vertical_weights_and_pred(
+ const uint8x16_t top, const uint8x8_t weights_y,
+ const uint16x8_t weighted_bl) {
+ const uint16x8_t pred_low =
+ vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
+ const uint16x8_t pred_high =
+ vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
+ const uint8x8_t pred_scaled_low =
+ vrshrn_n_u16(pred_low, SMOOTH_WEIGHT_LOG2_SCALE);
+ const uint8x8_t pred_scaled_high =
+ vrshrn_n_u16(pred_high, SMOOTH_WEIGHT_LOG2_SCALE);
+ return vcombine_u8(pred_scaled_low, pred_scaled_high);
+}
+
+// For width 16 and above.
+#define SMOOTH_V_PREDICTOR(W) \
+ static void smooth_v_##W##xh_neon( \
+ uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \
+ const uint8_t *const left_column, const int height) { \
+ const uint8_t bottom_left = left_column[height - 1]; \
+ const uint8_t *const weights_y = smooth_weights + height - 4; \
+ \
+ uint8x16_t top_v[4]; \
+ top_v[0] = vld1q_u8(top_row); \
+ if ((W) > 16) { \
+ top_v[1] = vld1q_u8(top_row + 16); \
+ if ((W) == 64) { \
+ top_v[2] = vld1q_u8(top_row + 32); \
+ top_v[3] = vld1q_u8(top_row + 48); \
+ } \
+ } \
+ \
+ const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left); \
+ \
+ assert(height > 0); \
+ int y = 0; \
+ do { \
+ const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]); \
+ const uint8x8_t scaled_weights_y = negate_s8(weights_y_v); \
+ const uint16x8_t weighted_bl = \
+ vmull_u8(scaled_weights_y, bottom_left_v); \
+ \
+ const uint8x16_t pred_0 = calculate_vertical_weights_and_pred( \
+ top_v[0], weights_y_v, weighted_bl); \
+ vst1q_u8(dst, pred_0); \
+ \
+ if ((W) > 16) { \
+ const uint8x16_t pred_1 = calculate_vertical_weights_and_pred( \
+ top_v[1], weights_y_v, weighted_bl); \
+ vst1q_u8(dst + 16, pred_1); \
+ \
+ if ((W) == 64) { \
+ const uint8x16_t pred_2 = calculate_vertical_weights_and_pred( \
+ top_v[2], weights_y_v, weighted_bl); \
+ vst1q_u8(dst + 32, pred_2); \
+ \
+ const uint8x16_t pred_3 = calculate_vertical_weights_and_pred( \
+ top_v[3], weights_y_v, weighted_bl); \
+ vst1q_u8(dst + 48, pred_3); \
+ } \
+ } \
+ \
+ dst += stride; \
+ } while (++y != height); \
+ }
+
+SMOOTH_V_PREDICTOR(16)
+SMOOTH_V_PREDICTOR(32)
+SMOOTH_V_PREDICTOR(64)
+
+#undef SMOOTH_V_PREDICTOR
+
+#define SMOOTH_V_NXM_WIDE(W, H) \
+ void aom_smooth_v_predictor_##W##x##H##_neon( \
+ uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
+ const uint8_t *left) { \
+ smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \
+ }
+
+SMOOTH_V_NXM_WIDE(16, 4)
+SMOOTH_V_NXM_WIDE(16, 8)
+SMOOTH_V_NXM_WIDE(16, 16)
+SMOOTH_V_NXM_WIDE(16, 32)
+SMOOTH_V_NXM_WIDE(16, 64)
+SMOOTH_V_NXM_WIDE(32, 8)
+SMOOTH_V_NXM_WIDE(32, 16)
+SMOOTH_V_NXM_WIDE(32, 32)
+SMOOTH_V_NXM_WIDE(32, 64)
+SMOOTH_V_NXM_WIDE(64, 16)
+SMOOTH_V_NXM_WIDE(64, 32)
+SMOOTH_V_NXM_WIDE(64, 64)
+
+#undef SMOOTH_V_NXM_WIDE
+
+// -----------------------------------------------------------------------------
+// SMOOTH_H_PRED
+
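+// SMOOTH_H_PRED uses only the horizontal weights: each pixel is
+// w_x*left[y] + (256 - w_x)*top_right, rounded back to 8 bits.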
+// For widths 4 and 8.
+#define SMOOTH_H_PREDICTOR(W) \
+ static void smooth_h_##W##xh_neon( \
+ uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \
+ const uint8_t *const left_column, const int height) { \
+ const uint8_t top_right = top_row[(W)-1]; \
+ \
+ const uint8x8_t top_right_v = vdup_n_u8(top_right); \
+ /* Over-reads for 4xN but still within the array. */ \
+ const uint8x8_t weights_x = vld1_u8(smooth_weights + (W)-4); \
+ const uint8x8_t scaled_weights_x = negate_s8(weights_x); \
+ const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); \
+ \
+ assert(height > 0); \
+ int y = 0; \
+ do { \
+ const uint8x8_t left_v = vdup_n_u8(left_column[y]); \
+ const uint16x8_t weighted_left_tr = \
+ vmlal_u8(weighted_tr, weights_x, left_v); \
+ const uint8x8_t pred = \
+ vrshrn_n_u16(weighted_left_tr, SMOOTH_WEIGHT_LOG2_SCALE); \
+ \
+ if ((W) == 4) { \
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(pred), 0); \
+ } else { /* width == 8 */ \
+ vst1_u8(dst, pred); \
+ } \
+ dst += stride; \
+ } while (++y != height); \
+ }
+
+SMOOTH_H_PREDICTOR(4)
+SMOOTH_H_PREDICTOR(8)
+
+#undef SMOOTH_H_PREDICTOR
+
+#define SMOOTH_H_NXM(W, H) \
+ void aom_smooth_h_predictor_##W##x##H##_neon( \
+ uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
+ const uint8_t *left) { \
+ smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \
+ }
+
+SMOOTH_H_NXM(4, 4)
+SMOOTH_H_NXM(4, 8)
+SMOOTH_H_NXM(4, 16)
+SMOOTH_H_NXM(8, 4)
+SMOOTH_H_NXM(8, 8)
+SMOOTH_H_NXM(8, 16)
+SMOOTH_H_NXM(8, 32)
+
+#undef SMOOTH_H_NXM
+
+static INLINE uint8x16_t calculate_horizontal_weights_and_pred(
+ const uint8x8_t left, const uint8x8_t top_right, const uint8x16_t weights_x,
+ const uint8x16_t scaled_weights_x) {
+ const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
+ const uint16x8_t weighted_left_tr_low =
+ vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right);
+ const uint8x8_t pred_scaled_low =
+ vrshrn_n_u16(weighted_left_tr_low, SMOOTH_WEIGHT_LOG2_SCALE);
+
+ const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
+ const uint16x8_t weighted_left_tr_high =
+ vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right);
+ const uint8x8_t pred_scaled_high =
+ vrshrn_n_u16(weighted_left_tr_high, SMOOTH_WEIGHT_LOG2_SCALE);
+
+ return vcombine_u8(pred_scaled_low, pred_scaled_high);
+}
+
+// For width 16 and above.
+#define SMOOTH_H_PREDICTOR(W) \
+ static void smooth_h_##W##xh_neon( \
+ uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row, \
+ const uint8_t *const left_column, const int height) { \
+ const uint8_t top_right = top_row[(W)-1]; \
+ \
+ const uint8x8_t top_right_v = vdup_n_u8(top_right); \
+ \
+ uint8x16_t weights_x[4]; \
+ weights_x[0] = vld1q_u8(smooth_weights + (W)-4); \
+ if ((W) > 16) { \
+ weights_x[1] = vld1q_u8(smooth_weights + (W) + 16 - 4); \
+ if ((W) == 64) { \
+ weights_x[2] = vld1q_u8(smooth_weights + (W) + 32 - 4); \
+ weights_x[3] = vld1q_u8(smooth_weights + (W) + 48 - 4); \
+ } \
+ } \
+ \
+ uint8x16_t scaled_weights_x[4]; \
+ scaled_weights_x[0] = negate_s8q(weights_x[0]); \
+ if ((W) > 16) { \
+ scaled_weights_x[1] = negate_s8q(weights_x[1]); \
+ if ((W) == 64) { \
+ scaled_weights_x[2] = negate_s8q(weights_x[2]); \
+ scaled_weights_x[3] = negate_s8q(weights_x[3]); \
+ } \
+ } \
+ \
+ assert(height > 0); \
+ int y = 0; \
+ do { \
+ const uint8x8_t left_v = vdup_n_u8(left_column[y]); \
+ \
+ const uint8x16_t pred_0 = calculate_horizontal_weights_and_pred( \
+ left_v, top_right_v, weights_x[0], scaled_weights_x[0]); \
+ vst1q_u8(dst, pred_0); \
+ \
+ if ((W) > 16) { \
+ const uint8x16_t pred_1 = calculate_horizontal_weights_and_pred( \
+ left_v, top_right_v, weights_x[1], scaled_weights_x[1]); \
+ vst1q_u8(dst + 16, pred_1); \
+ \
+ if ((W) == 64) { \
+ const uint8x16_t pred_2 = calculate_horizontal_weights_and_pred( \
+ left_v, top_right_v, weights_x[2], scaled_weights_x[2]); \
+ vst1q_u8(dst + 32, pred_2); \
+ \
+ const uint8x16_t pred_3 = calculate_horizontal_weights_and_pred( \
+ left_v, top_right_v, weights_x[3], scaled_weights_x[3]); \
+ vst1q_u8(dst + 48, pred_3); \
+ } \
+ } \
+ dst += stride; \
+ } while (++y != height); \
+ }
+
+SMOOTH_H_PREDICTOR(16)
+SMOOTH_H_PREDICTOR(32)
+SMOOTH_H_PREDICTOR(64)
+
+#undef SMOOTH_H_PREDICTOR
+
+#define SMOOTH_H_NXM_WIDE(W, H) \
+ void aom_smooth_h_predictor_##W##x##H##_neon( \
+ uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
+ const uint8_t *left) { \
+ smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \
+ }
+
+SMOOTH_H_NXM_WIDE(16, 4)
+SMOOTH_H_NXM_WIDE(16, 8)
+SMOOTH_H_NXM_WIDE(16, 16)
+SMOOTH_H_NXM_WIDE(16, 32)
+SMOOTH_H_NXM_WIDE(16, 64)
+SMOOTH_H_NXM_WIDE(32, 8)
+SMOOTH_H_NXM_WIDE(32, 16)
+SMOOTH_H_NXM_WIDE(32, 32)
+SMOOTH_H_NXM_WIDE(32, 64)
+SMOOTH_H_NXM_WIDE(64, 16)
+SMOOTH_H_NXM_WIDE(64, 32)
+SMOOTH_H_NXM_WIDE(64, 64)
+
+#undef SMOOTH_H_NXM_WIDE
+
+// -----------------------------------------------------------------------------
+// PAETH
+
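+// PAETH_PRED selects, for each pixel, whichever of left, top and top_left is
+// closest to the gradient estimate top + left - top_left, preferring left,
+// then top, then top_left on ties.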
+static INLINE void paeth_4or8_x_h_neon(uint8_t *dest, ptrdiff_t stride,
+ const uint8_t *const top_row,
+ const uint8_t *const left_column,
+ int width, int height) {
+ const uint8x8_t top_left = vdup_n_u8(top_row[-1]);
+ const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
+ uint8x8_t top;
+ if (width == 4) {
+ top = load_u8_4x1(top_row);
+ } else { // width == 8
+ top = vld1_u8(top_row);
+ }
+
+ assert(height > 0);
+ int y = 0;
+ do {
+ const uint8x8_t left = vdup_n_u8(left_column[y]);
+
+ const uint8x8_t left_dist = vabd_u8(top, top_left);
+ const uint8x8_t top_dist = vabd_u8(left, top_left);
+ const uint16x8_t top_left_dist =
+ vabdq_u16(vaddl_u8(top, left), top_left_x2);
+
+ const uint8x8_t left_le_top = vcle_u8(left_dist, top_dist);
+ const uint8x8_t left_le_top_left =
+ vmovn_u16(vcleq_u16(vmovl_u8(left_dist), top_left_dist));
+ const uint8x8_t top_le_top_left =
+ vmovn_u16(vcleq_u16(vmovl_u8(top_dist), top_left_dist));
+
+ // if (left_dist <= top_dist && left_dist <= top_left_dist)
+ const uint8x8_t left_mask = vand_u8(left_le_top, left_le_top_left);
+ // dest[x] = left_column[y];
+ // Fill all the unused spaces with 'top'. They will be overwritten when
+ // the positions for top_left are known.
+ uint8x8_t result = vbsl_u8(left_mask, left, top);
+ // else if (top_dist <= top_left_dist)
+ // dest[x] = top_row[x];
+ // Add these values to the mask. They were already set.
+ const uint8x8_t left_or_top_mask = vorr_u8(left_mask, top_le_top_left);
+ // else
+ // dest[x] = top_left;
+ result = vbsl_u8(left_or_top_mask, result, top_left);
+
+ if (width == 4) {
+ store_u8_4x1(dest, result);
+ } else { // width == 8
+ vst1_u8(dest, result);
+ }
+ dest += stride;
+ } while (++y != height);
+}
+
+#define PAETH_NXM(W, H) \
+ void aom_paeth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t stride, \
+ const uint8_t *above, \
+ const uint8_t *left) { \
+ paeth_4or8_x_h_neon(dst, stride, above, left, W, H); \
+ }
+
+PAETH_NXM(4, 4)
+PAETH_NXM(4, 8)
+PAETH_NXM(8, 4)
+PAETH_NXM(8, 8)
+PAETH_NXM(8, 16)
+
+PAETH_NXM(4, 16)
+PAETH_NXM(8, 32)
+
+// Calculate X distance <= TopLeft distance and pack the resulting mask into
+// uint8x16_t.
+static INLINE uint8x16_t x_le_top_left(const uint8x16_t x_dist,
+ const uint16x8_t top_left_dist_low,
+ const uint16x8_t top_left_dist_high) {
+ const uint8x16_t top_left_dist = vcombine_u8(vqmovn_u16(top_left_dist_low),
+ vqmovn_u16(top_left_dist_high));
+ return vcleq_u8(x_dist, top_left_dist);
+}
+
+// Select the closest values and collect them.
+static INLINE uint8x16_t select_paeth(const uint8x16_t top,
+ const uint8x16_t left,
+ const uint8x16_t top_left,
+ const uint8x16_t left_le_top,
+ const uint8x16_t left_le_top_left,
+ const uint8x16_t top_le_top_left) {
+ // if (left_dist <= top_dist && left_dist <= top_left_dist)
+ const uint8x16_t left_mask = vandq_u8(left_le_top, left_le_top_left);
+ // dest[x] = left_column[y];
+ // Fill all the unused spaces with 'top'. They will be overwritten when
+ // the positions for top_left are known.
+ uint8x16_t result = vbslq_u8(left_mask, left, top);
+ // else if (top_dist <= top_left_dist)
+ // dest[x] = top_row[x];
+ // Add these values to the mask. They were already set.
+ const uint8x16_t left_or_top_mask = vorrq_u8(left_mask, top_le_top_left);
+ // else
+ // dest[x] = top_left;
+ return vbslq_u8(left_or_top_mask, result, top_left);
+}
+
+// Generate numbered and high/low versions of top_left_dist.
+#define TOP_LEFT_DIST(num) \
+ const uint16x8_t top_left_##num##_dist_low = vabdq_u16( \
+ vaddl_u8(vget_low_u8(top[num]), vget_low_u8(left)), top_left_x2); \
+ const uint16x8_t top_left_##num##_dist_high = vabdq_u16( \
+ vaddl_u8(vget_high_u8(top[num]), vget_low_u8(left)), top_left_x2)
+
+// Generate numbered versions of XLeTopLeft with x = left.
+#define LEFT_LE_TOP_LEFT(num) \
+ const uint8x16_t left_le_top_left_##num = \
+ x_le_top_left(left_##num##_dist, top_left_##num##_dist_low, \
+ top_left_##num##_dist_high)
+
+// Generate numbered versions of XLeTopLeft with x = top.
+#define TOP_LE_TOP_LEFT(num) \
+ const uint8x16_t top_le_top_left_##num = x_le_top_left( \
+ top_dist, top_left_##num##_dist_low, top_left_##num##_dist_high)
+
+static INLINE void paeth16_plus_x_h_neon(uint8_t *dest, ptrdiff_t stride,
+ const uint8_t *const top_row,
+ const uint8_t *const left_column,
+ int width, int height) {
+ const uint8x16_t top_left = vdupq_n_u8(top_row[-1]);
+ const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
+ uint8x16_t top[4];
+ top[0] = vld1q_u8(top_row);
+ if (width > 16) {
+ top[1] = vld1q_u8(top_row + 16);
+ if (width == 64) {
+ top[2] = vld1q_u8(top_row + 32);
+ top[3] = vld1q_u8(top_row + 48);
+ }
+ }
+
+ assert(height > 0);
+ int y = 0;
+ do {
+ const uint8x16_t left = vdupq_n_u8(left_column[y]);
+
+ const uint8x16_t top_dist = vabdq_u8(left, top_left);
+
+ const uint8x16_t left_0_dist = vabdq_u8(top[0], top_left);
+ TOP_LEFT_DIST(0);
+ const uint8x16_t left_0_le_top = vcleq_u8(left_0_dist, top_dist);
+ LEFT_LE_TOP_LEFT(0);
+ TOP_LE_TOP_LEFT(0);
+
+ const uint8x16_t result_0 =
+ select_paeth(top[0], left, top_left, left_0_le_top, left_le_top_left_0,
+ top_le_top_left_0);
+ vst1q_u8(dest, result_0);
+
+ if (width > 16) {
+ const uint8x16_t left_1_dist = vabdq_u8(top[1], top_left);
+ TOP_LEFT_DIST(1);
+ const uint8x16_t left_1_le_top = vcleq_u8(left_1_dist, top_dist);
+ LEFT_LE_TOP_LEFT(1);
+ TOP_LE_TOP_LEFT(1);
+
+ const uint8x16_t result_1 =
+ select_paeth(top[1], left, top_left, left_1_le_top,
+ left_le_top_left_1, top_le_top_left_1);
+ vst1q_u8(dest + 16, result_1);
+
+ if (width == 64) {
+ const uint8x16_t left_2_dist = vabdq_u8(top[2], top_left);
+ TOP_LEFT_DIST(2);
+ const uint8x16_t left_2_le_top = vcleq_u8(left_2_dist, top_dist);
+ LEFT_LE_TOP_LEFT(2);
+ TOP_LE_TOP_LEFT(2);
+
+ const uint8x16_t result_2 =
+ select_paeth(top[2], left, top_left, left_2_le_top,
+ left_le_top_left_2, top_le_top_left_2);
+ vst1q_u8(dest + 32, result_2);
+
+ const uint8x16_t left_3_dist = vabdq_u8(top[3], top_left);
+ TOP_LEFT_DIST(3);
+ const uint8x16_t left_3_le_top = vcleq_u8(left_3_dist, top_dist);
+ LEFT_LE_TOP_LEFT(3);
+ TOP_LE_TOP_LEFT(3);
+
+ const uint8x16_t result_3 =
+ select_paeth(top[3], left, top_left, left_3_le_top,
+ left_le_top_left_3, top_le_top_left_3);
+ vst1q_u8(dest + 48, result_3);
+ }
+ }
+
+ dest += stride;
+ } while (++y != height);
+}
+
+#define PAETH_NXM_WIDE(W, H) \
+ void aom_paeth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t stride, \
+ const uint8_t *above, \
+ const uint8_t *left) { \
+ paeth16_plus_x_h_neon(dst, stride, above, left, W, H); \
+ }
+
+PAETH_NXM_WIDE(16, 8)
+PAETH_NXM_WIDE(16, 16)
+PAETH_NXM_WIDE(16, 32)
+PAETH_NXM_WIDE(32, 16)
+PAETH_NXM_WIDE(32, 32)
+PAETH_NXM_WIDE(32, 64)
+PAETH_NXM_WIDE(64, 32)
+PAETH_NXM_WIDE(64, 64)
+
+PAETH_NXM_WIDE(16, 4)
+PAETH_NXM_WIDE(16, 64)
+PAETH_NXM_WIDE(32, 8)
+PAETH_NXM_WIDE(64, 16)
diff --git a/third_party/aom/aom_dsp/arm/loopfilter_neon.c b/third_party/aom/aom_dsp/arm/loopfilter_neon.c
new file mode 100644
index 0000000000..7c64be1253
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/loopfilter_neon.c
@@ -0,0 +1,1045 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/transpose_neon.h"
+
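+// The loop-filter helpers operate on vectors that hold four p samples in the
+// low half and the corresponding four q samples in the high half (e.g. p0q0);
+// vrev64_u32 is used to swap the halves when values from the other side of
+// the edge are needed.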
+static INLINE uint8x8_t lpf_mask(uint8x8_t p3q3, uint8x8_t p2q2, uint8x8_t p1q1,
+ uint8x8_t p0q0, const uint8_t blimit,
+ const uint8_t limit) {
+ // Calculate mask values for four samples
+ uint32x2x2_t p0q0_p1q1;
+ uint16x8_t temp_16x8;
+ uint16x4_t temp0_16x4, temp1_16x4;
+ uint8x8_t mask_8x8, temp_8x8;
+ const uint8x8_t limit_8x8 = vdup_n_u8(limit);
+ const uint16x4_t blimit_16x4 = vdup_n_u16((uint16_t)blimit);
+
+ mask_8x8 = vabd_u8(p3q3, p2q2);
+ mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p2q2, p1q1));
+ mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p1q1, p0q0));
+ mask_8x8 = vcle_u8(mask_8x8, limit_8x8);
+
+ temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8)));
+ mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+
+ p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1));
+ temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]),
+ vreinterpret_u8_u32(p0q0_p1q1.val[1]));
+ temp_16x8 = vmovl_u8(temp_8x8);
+ temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1);
+ temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1);
+ temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4);
+ temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4);
+ temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4));
+
+ mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+
+ return mask_8x8;
+}
+
+static INLINE uint8x8_t lpf_mask2(uint8x8_t p1q1, uint8x8_t p0q0,
+ const uint8_t blimit, const uint8_t limit) {
+ uint32x2x2_t p0q0_p1q1;
+ uint16x8_t temp_16x8;
+ uint16x4_t temp0_16x4, temp1_16x4;
+ const uint16x4_t blimit_16x4 = vdup_n_u16(blimit);
+ const uint8x8_t limit_8x8 = vdup_n_u8(limit);
+ uint8x8_t mask_8x8, temp_8x8;
+
+ mask_8x8 = vabd_u8(p1q1, p0q0);
+ mask_8x8 = vcle_u8(mask_8x8, limit_8x8);
+
+ temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8)));
+ mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+
+ p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1));
+ temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]),
+ vreinterpret_u8_u32(p0q0_p1q1.val[1]));
+ temp_16x8 = vmovl_u8(temp_8x8);
+ temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1);
+ temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1);
+ temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4);
+ temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4);
+ temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4));
+
+ mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+
+ return mask_8x8;
+}
+
+static INLINE uint8x8_t lpf_flat_mask4(uint8x8_t p3q3, uint8x8_t p2q2,
+ uint8x8_t p1q1, uint8x8_t p0q0) {
+ const uint8x8_t thresh_8x8 = vdup_n_u8(1); // for bd==8 threshold is always 1
+ uint8x8_t flat_8x8, temp_8x8;
+
+ flat_8x8 = vabd_u8(p1q1, p0q0);
+ flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p2q2, p0q0));
+ flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p3q3, p0q0));
+ flat_8x8 = vcle_u8(flat_8x8, thresh_8x8);
+
+ temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(flat_8x8)));
+ flat_8x8 = vand_u8(flat_8x8, temp_8x8);
+
+ return flat_8x8;
+}
+
+static INLINE uint8x8_t lpf_flat_mask3(uint8x8_t p2q2, uint8x8_t p1q1,
+ uint8x8_t p0q0) {
+ const uint8x8_t thresh_8x8 = vdup_n_u8(1); // for bd==8 threshold is always 1
+ uint8x8_t flat_8x8, temp_8x8;
+
+ flat_8x8 = vabd_u8(p1q1, p0q0);
+ flat_8x8 = vmax_u8(flat_8x8, vabd_u8(p2q2, p0q0));
+ flat_8x8 = vcle_u8(flat_8x8, thresh_8x8);
+
+ temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(flat_8x8)));
+ flat_8x8 = vand_u8(flat_8x8, temp_8x8);
+
+ return flat_8x8;
+}
+
+static INLINE uint8x8_t lpf_mask3_chroma(uint8x8_t p2q2, uint8x8_t p1q1,
+ uint8x8_t p0q0, const uint8_t blimit,
+ const uint8_t limit) {
+ // Calculate mask3 values for four samples
+ uint32x2x2_t p0q0_p1q1;
+ uint16x8_t temp_16x8;
+ uint16x4_t temp0_16x4, temp1_16x4;
+ uint8x8_t mask_8x8, temp_8x8;
+ const uint8x8_t limit_8x8 = vdup_n_u8(limit);
+ const uint16x4_t blimit_16x4 = vdup_n_u16((uint16_t)blimit);
+
+ mask_8x8 = vabd_u8(p2q2, p1q1);
+ mask_8x8 = vmax_u8(mask_8x8, vabd_u8(p1q1, p0q0));
+ mask_8x8 = vcle_u8(mask_8x8, limit_8x8);
+
+ temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8)));
+ mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+
+ p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1));
+ temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]),
+ vreinterpret_u8_u32(p0q0_p1q1.val[1]));
+ temp_16x8 = vmovl_u8(temp_8x8);
+ temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1);
+ temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1);
+ temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4);
+ temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4);
+ temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4));
+
+ mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+
+ return mask_8x8;
+}
+
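+// Computes the filter4, filter8 ("f7") and filter14 outputs for every sample
+// and then selects between them per sample using the mask, flat and flat2
+// conditions.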
+static void lpf_14_neon(uint8x8_t *p6q6, uint8x8_t *p5q5, uint8x8_t *p4q4,
+ uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1,
+ uint8x8_t *p0q0, const uint8_t blimit,
+ const uint8_t limit, const uint8_t thresh) {
+ uint16x8_t out;
+ uint8x8_t out_f14_pq0, out_f14_pq1, out_f14_pq2, out_f14_pq3, out_f14_pq4,
+ out_f14_pq5;
+ uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2;
+ uint8x8_t out_f4_pq0, out_f4_pq1;
+ uint8x8_t mask_8x8, flat_8x8, flat2_8x8;
+ uint8x8_t q0p0, q1p1, q2p2;
+
+ // Calculate filter masks
+ mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit);
+ flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0);
+ flat2_8x8 = lpf_flat_mask4(*p6q6, *p5q5, *p4q4, *p0q0);
+ {
+ // filter 4
+ int32x2x2_t ps0_qs0, ps1_qs1;
+ int16x8_t filter_s16;
+ const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
+ uint8x8_t temp0_8x8, temp1_8x8;
+ int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
+ int8x8_t op0, oq0, op1, oq1;
+ int8x8_t pq_s0, pq_s1;
+ int8x8_t filter_s8, filter1_s8, filter2_s8;
+ int8x8_t hev_8x8;
+ const int8x8_t sign_mask = vdup_n_s8(0x80);
+ const int8x8_t val_4 = vdup_n_s8(4);
+ const int8x8_t val_3 = vdup_n_s8(3);
+
+ pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
+ pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
+
+ ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
+ ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
+ ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
+ qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
+ ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
+ qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
+
+ // hev_mask
+ temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
+ temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
+ hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
+
+ // add outer taps if we have high edge variance
+ filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
+ filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+ // inner taps
+ temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
+ filter_s16 = vmovl_s8(filter_s8);
+ filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
+ filter_s8 = vqmovn_s16(filter_s16);
+ filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
+
+ filter1_s8 = vqadd_s8(filter_s8, val_4);
+ filter2_s8 = vqadd_s8(filter_s8, val_3);
+ filter1_s8 = vshr_n_s8(filter1_s8, 3);
+ filter2_s8 = vshr_n_s8(filter2_s8, 3);
+
+ oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
+ op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
+
+ hev_8x8 = vmvn_s8(hev_8x8);
+ filter_s8 = vrshr_n_s8(filter1_s8, 1);
+ filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+ oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
+ op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
+
+ out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
+ out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
+ }
+ // reverse p and q
+ q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
+ q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
+ q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2)));
+ {
+ // filter 8
+ uint16x8_t out_pq0, out_pq1, out_pq2;
+ out = vaddl_u8(*p3q3, *p2q2);
+ out = vaddw_u8(out, *p1q1);
+ out = vaddw_u8(out, *p0q0);
+
+ out = vaddw_u8(out, q0p0);
+ out_pq1 = vaddw_u8(out, *p3q3);
+ out_pq2 = vaddw_u8(out_pq1, *p3q3);
+ out_pq2 = vaddw_u8(out_pq2, *p2q2);
+ out_pq1 = vaddw_u8(out_pq1, *p1q1);
+ out_pq1 = vaddw_u8(out_pq1, q1p1);
+
+ out_pq0 = vaddw_u8(out, *p0q0);
+ out_pq0 = vaddw_u8(out_pq0, q1p1);
+ out_pq0 = vaddw_u8(out_pq0, q2p2);
+
+ out_f7_pq0 = vrshrn_n_u16(out_pq0, 3);
+ out_f7_pq1 = vrshrn_n_u16(out_pq1, 3);
+ out_f7_pq2 = vrshrn_n_u16(out_pq2, 3);
+ }
+ {
+ // filter 14
+ uint16x8_t out_pq0, out_pq1, out_pq2, out_pq3, out_pq4, out_pq5;
+ uint16x8_t p6q6_2, p6q6_temp, qp_sum;
+ uint8x8_t qp_rev;
+
+ out = vaddw_u8(out, *p4q4);
+ out = vaddw_u8(out, *p5q5);
+ out = vaddw_u8(out, *p6q6);
+
+ out_pq5 = vaddw_u8(out, *p4q4);
+ out_pq4 = vaddw_u8(out_pq5, *p3q3);
+ out_pq3 = vaddw_u8(out_pq4, *p2q2);
+
+ out_pq5 = vaddw_u8(out_pq5, *p5q5);
+ out_pq4 = vaddw_u8(out_pq4, *p5q5);
+
+ out_pq0 = vaddw_u8(out, *p1q1);
+ out_pq1 = vaddw_u8(out_pq0, *p2q2);
+ out_pq2 = vaddw_u8(out_pq1, *p3q3);
+
+ out_pq0 = vaddw_u8(out_pq0, *p0q0);
+ out_pq1 = vaddw_u8(out_pq1, *p0q0);
+
+ out_pq1 = vaddw_u8(out_pq1, *p6q6);
+ p6q6_2 = vaddl_u8(*p6q6, *p6q6);
+ out_pq2 = vaddq_u16(out_pq2, p6q6_2);
+ p6q6_temp = vaddw_u8(p6q6_2, *p6q6);
+ out_pq3 = vaddq_u16(out_pq3, p6q6_temp);
+ p6q6_temp = vaddw_u8(p6q6_temp, *p6q6);
+ out_pq4 = vaddq_u16(out_pq4, p6q6_temp);
+ p6q6_temp = vaddq_u16(p6q6_temp, p6q6_2);
+ out_pq5 = vaddq_u16(out_pq5, p6q6_temp);
+
+ out_pq4 = vaddw_u8(out_pq4, q1p1);
+
+ qp_sum = vaddl_u8(q2p2, q1p1);
+ out_pq3 = vaddq_u16(out_pq3, qp_sum);
+
+ qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p3q3)));
+ qp_sum = vaddw_u8(qp_sum, qp_rev);
+ out_pq2 = vaddq_u16(out_pq2, qp_sum);
+
+ qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p4q4)));
+ qp_sum = vaddw_u8(qp_sum, qp_rev);
+ out_pq1 = vaddq_u16(out_pq1, qp_sum);
+
+ qp_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p5q5)));
+ qp_sum = vaddw_u8(qp_sum, qp_rev);
+ out_pq0 = vaddq_u16(out_pq0, qp_sum);
+
+ out_pq0 = vaddw_u8(out_pq0, q0p0);
+
+ out_f14_pq0 = vrshrn_n_u16(out_pq0, 4);
+ out_f14_pq1 = vrshrn_n_u16(out_pq1, 4);
+ out_f14_pq2 = vrshrn_n_u16(out_pq2, 4);
+ out_f14_pq3 = vrshrn_n_u16(out_pq3, 4);
+ out_f14_pq4 = vrshrn_n_u16(out_pq4, 4);
+ out_f14_pq5 = vrshrn_n_u16(out_pq5, 4);
+ }
+ {
+ uint8x8_t filter4_cond, filter8_cond, filter14_cond;
+ filter8_cond = vand_u8(flat_8x8, mask_8x8);
+ filter4_cond = vmvn_u8(filter8_cond);
+ filter14_cond = vand_u8(filter8_cond, flat2_8x8);
+
+ // filter4 outputs
+ *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
+ *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
+
+ // filter8 outputs
+ *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0);
+ *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1);
+ *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2);
+
+ // filter14 outputs
+ *p0q0 = vbsl_u8(filter14_cond, out_f14_pq0, *p0q0);
+ *p1q1 = vbsl_u8(filter14_cond, out_f14_pq1, *p1q1);
+ *p2q2 = vbsl_u8(filter14_cond, out_f14_pq2, *p2q2);
+ *p3q3 = vbsl_u8(filter14_cond, out_f14_pq3, *p3q3);
+ *p4q4 = vbsl_u8(filter14_cond, out_f14_pq4, *p4q4);
+ *p5q5 = vbsl_u8(filter14_cond, out_f14_pq5, *p5q5);
+ }
+}
+
+static void lpf_8_neon(uint8x8_t *p3q3, uint8x8_t *p2q2, uint8x8_t *p1q1,
+ uint8x8_t *p0q0, const uint8_t blimit,
+ const uint8_t limit, const uint8_t thresh) {
+ uint16x8_t out;
+ uint8x8_t out_f7_pq0, out_f7_pq1, out_f7_pq2;
+ uint8x8_t out_f4_pq0, out_f4_pq1;
+ uint8x8_t mask_8x8, flat_8x8;
+
+ // Calculate filter masks
+ mask_8x8 = lpf_mask(*p3q3, *p2q2, *p1q1, *p0q0, blimit, limit);
+ flat_8x8 = lpf_flat_mask4(*p3q3, *p2q2, *p1q1, *p0q0);
+ {
+ // filter 4
+ int32x2x2_t ps0_qs0, ps1_qs1;
+ int16x8_t filter_s16;
+ const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
+ uint8x8_t temp0_8x8, temp1_8x8;
+ int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
+ int8x8_t op0, oq0, op1, oq1;
+ int8x8_t pq_s0, pq_s1;
+ int8x8_t filter_s8, filter1_s8, filter2_s8;
+ int8x8_t hev_8x8;
+ const int8x8_t sign_mask = vdup_n_s8(0x80);
+ const int8x8_t val_4 = vdup_n_s8(4);
+ const int8x8_t val_3 = vdup_n_s8(3);
+
+ pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
+ pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
+
+ ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
+ ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
+ ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
+ qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
+ ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
+ qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
+
+ // hev_mask
+ temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
+ temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
+ hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
+
+ // add outer taps if we have high edge variance
+ filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
+ filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+ // inner taps
+ temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
+ filter_s16 = vmovl_s8(filter_s8);
+ filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
+ filter_s8 = vqmovn_s16(filter_s16);
+ filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
+
+ filter1_s8 = vqadd_s8(filter_s8, val_4);
+ filter2_s8 = vqadd_s8(filter_s8, val_3);
+ filter1_s8 = vshr_n_s8(filter1_s8, 3);
+ filter2_s8 = vshr_n_s8(filter2_s8, 3);
+
+ oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
+ op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
+
+ hev_8x8 = vmvn_s8(hev_8x8);
+ filter_s8 = vrshr_n_s8(filter1_s8, 1);
+ filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+ oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
+ op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
+
+ out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
+ out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
+ }
+ {
+ // filter 8
+ uint16x8_t out_pq0, out_pq1, out_pq2;
+ uint8x8_t q0p0, q1p1, q2p2;
+
+ out = vaddl_u8(*p3q3, *p2q2);
+ out = vaddw_u8(out, *p1q1);
+ out = vaddw_u8(out, *p0q0);
+
+ // reverse p and q
+ q0p0 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
+ q1p1 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
+ q2p2 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p2q2)));
+
+ out = vaddw_u8(out, q0p0);
+ out_pq1 = vaddw_u8(out, *p3q3);
+ out_pq2 = vaddw_u8(out_pq1, *p3q3);
+ out_pq2 = vaddw_u8(out_pq2, *p2q2);
+ out_pq1 = vaddw_u8(out_pq1, *p1q1);
+ out_pq1 = vaddw_u8(out_pq1, q1p1);
+
+ out_pq0 = vaddw_u8(out, *p0q0);
+ out_pq0 = vaddw_u8(out_pq0, q1p1);
+ out_pq0 = vaddw_u8(out_pq0, q2p2);
+
+ out_f7_pq0 = vrshrn_n_u16(out_pq0, 3);
+ out_f7_pq1 = vrshrn_n_u16(out_pq1, 3);
+ out_f7_pq2 = vrshrn_n_u16(out_pq2, 3);
+ }
+ {
+ uint8x8_t filter4_cond, filter8_cond;
+ filter8_cond = vand_u8(flat_8x8, mask_8x8);
+ filter4_cond = vmvn_u8(filter8_cond);
+
+ // filter4 outputs
+ *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
+ *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
+
+ // filter8 outputs
+ *p0q0 = vbsl_u8(filter8_cond, out_f7_pq0, *p0q0);
+ *p1q1 = vbsl_u8(filter8_cond, out_f7_pq1, *p1q1);
+ *p2q2 = vbsl_u8(filter8_cond, out_f7_pq2, *p2q2);
+ }
+}
+
+static void lpf_6_neon(uint8x8_t *p2q2, uint8x8_t *p1q1, uint8x8_t *p0q0,
+ const uint8_t blimit, const uint8_t limit,
+ const uint8_t thresh) {
+ uint16x8_t out;
+ uint8x8_t out_f6_pq0, out_f6_pq1;
+ uint8x8_t out_f4_pq0, out_f4_pq1;
+ uint8x8_t mask_8x8, flat_8x8;
+
+ // Calculate filter masks
+ mask_8x8 = lpf_mask3_chroma(*p2q2, *p1q1, *p0q0, blimit, limit);
+ flat_8x8 = lpf_flat_mask3(*p2q2, *p1q1, *p0q0);
+ {
+ // filter 4
+ int32x2x2_t ps0_qs0, ps1_qs1;
+ int16x8_t filter_s16;
+ const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
+ uint8x8_t temp0_8x8, temp1_8x8;
+ int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
+ int8x8_t op0, oq0, op1, oq1;
+ int8x8_t pq_s0, pq_s1;
+ int8x8_t filter_s8, filter1_s8, filter2_s8;
+ int8x8_t hev_8x8;
+ const int8x8_t sign_mask = vdup_n_s8(0x80);
+ const int8x8_t val_4 = vdup_n_s8(4);
+ const int8x8_t val_3 = vdup_n_s8(3);
+
+ pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
+ pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
+
+ ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
+ ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
+ ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
+ qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
+ ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
+ qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
+
+ // hev_mask
+ temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
+ temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
+ hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
+
+ // add outer taps if we have high edge variance
+ filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
+ filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+ // inner taps
+ temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
+ filter_s16 = vmovl_s8(filter_s8);
+ filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
+ filter_s8 = vqmovn_s16(filter_s16);
+ filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
+
+ filter1_s8 = vqadd_s8(filter_s8, val_4);
+ filter2_s8 = vqadd_s8(filter_s8, val_3);
+ filter1_s8 = vshr_n_s8(filter1_s8, 3);
+ filter2_s8 = vshr_n_s8(filter2_s8, 3);
+
+ oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
+ op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
+
+ filter_s8 = vrshr_n_s8(filter1_s8, 1);
+ filter_s8 = vbic_s8(filter_s8, hev_8x8);
+
+ oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
+ op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
+
+ out_f4_pq0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
+ out_f4_pq1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
+ }
+ {
+ // filter 6
+ uint16x8_t out_pq0, out_pq1;
+ uint8x8_t pq_rev;
+
+ out = vaddl_u8(*p0q0, *p1q1);
+ out = vaddq_u16(out, out);
+ out = vaddw_u8(out, *p2q2);
+
+ pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p0q0)));
+ out = vaddw_u8(out, pq_rev);
+
+ out_pq0 = vaddw_u8(out, pq_rev);
+ pq_rev = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(*p1q1)));
+ out_pq0 = vaddw_u8(out_pq0, pq_rev);
+
+ out_pq1 = vaddw_u8(out, *p2q2);
+ out_pq1 = vaddw_u8(out_pq1, *p2q2);
+
+ out_f6_pq0 = vrshrn_n_u16(out_pq0, 3);
+ out_f6_pq1 = vrshrn_n_u16(out_pq1, 3);
+ }
+ {
+ uint8x8_t filter4_cond, filter6_cond;
+ filter6_cond = vand_u8(flat_8x8, mask_8x8);
+ filter4_cond = vmvn_u8(filter6_cond);
+
+ // filter4 outputs
+ *p0q0 = vbsl_u8(filter4_cond, out_f4_pq0, *p0q0);
+ *p1q1 = vbsl_u8(filter4_cond, out_f4_pq1, *p1q1);
+
+ // filter6 outputs
+ *p0q0 = vbsl_u8(filter6_cond, out_f6_pq0, *p0q0);
+ *p1q1 = vbsl_u8(filter6_cond, out_f6_pq1, *p1q1);
+ }
+}
+
+static void lpf_4_neon(uint8x8_t *p1q1, uint8x8_t *p0q0, const uint8_t blimit,
+ const uint8_t limit, const uint8_t thresh) {
+ int32x2x2_t ps0_qs0, ps1_qs1;
+ int16x8_t filter_s16;
+ const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
+ uint8x8_t mask_8x8, temp0_8x8, temp1_8x8;
+ int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
+ int8x8_t op0, oq0, op1, oq1;
+ int8x8_t pq_s0, pq_s1;
+ int8x8_t filter_s8, filter1_s8, filter2_s8;
+ int8x8_t hev_8x8;
+ const int8x8_t sign_mask = vdup_n_s8(0x80);
+ const int8x8_t val_4 = vdup_n_s8(4);
+ const int8x8_t val_3 = vdup_n_s8(3);
+
+ // Calculate filter mask
+ mask_8x8 = lpf_mask2(*p1q1, *p0q0, blimit, limit);
+
+ pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
+ pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
+
+ ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
+ ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
+ ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
+ qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
+ ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
+ qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
+
+ // hev_mask
+ temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
+ temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
+ hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
+
+ // add outer taps if we have high edge variance
+ filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
+ filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+ // inner taps
+ temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
+ filter_s16 = vmovl_s8(filter_s8);
+ filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
+ filter_s8 = vqmovn_s16(filter_s16);
+ filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
+
+ filter1_s8 = vqadd_s8(filter_s8, val_4);
+ filter2_s8 = vqadd_s8(filter_s8, val_3);
+ filter1_s8 = vshr_n_s8(filter1_s8, 3);
+ filter2_s8 = vshr_n_s8(filter2_s8, 3);
+
+ oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
+ op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
+
+ filter_s8 = vrshr_n_s8(filter1_s8, 1);
+ filter_s8 = vbic_s8(filter_s8, hev_8x8);
+
+ oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
+ op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
+
+ *p0q0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
+ *p1q1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
+}
+
+void aom_lpf_vertical_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8x16_t row0, row1, row2, row3;
+ uint8x8_t pxp3, p6p2, p5p1, p4p0;
+ uint8x8_t q0q4, q1q5, q2q6, q3qy;
+ uint32x2x2_t p6q6_p2q2, p5q5_p1q1, p4q4_p0q0, pxqx_p3q3;
+ uint32x2_t pq_rev;
+ uint8x8_t p0q0, p1q1, p2q2, p3q3, p4q4, p5q5, p6q6;
+
+ // row0: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y
+ // row1: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y
+ // row2: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y
+ // row3: x p6 p5 p4 p3 p2 p1 p0 | q0 q1 q2 q3 q4 q5 q6 y
+ load_u8_16x4(src - 8, stride, &row0, &row1, &row2, &row3);
+
+ pxp3 = vget_low_u8(row0);
+ p6p2 = vget_low_u8(row1);
+ p5p1 = vget_low_u8(row2);
+ p4p0 = vget_low_u8(row3);
+ transpose_elems_inplace_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0);
+
+ q0q4 = vget_high_u8(row0);
+ q1q5 = vget_high_u8(row1);
+ q2q6 = vget_high_u8(row2);
+ q3qy = vget_high_u8(row3);
+ transpose_elems_inplace_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(q3qy));
+ pxqx_p3q3 = vtrn_u32(vreinterpret_u32_u8(pxp3), pq_rev);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(q1q5));
+ p5q5_p1q1 = vtrn_u32(vreinterpret_u32_u8(p5p1), pq_rev);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(q0q4));
+ p4q4_p0q0 = vtrn_u32(vreinterpret_u32_u8(p4p0), pq_rev);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(q2q6));
+ p6q6_p2q2 = vtrn_u32(vreinterpret_u32_u8(p6p2), pq_rev);
+
+ p0q0 = vreinterpret_u8_u32(p4q4_p0q0.val[1]);
+ p1q1 = vreinterpret_u8_u32(p5q5_p1q1.val[1]);
+ p2q2 = vreinterpret_u8_u32(p6q6_p2q2.val[1]);
+ p3q3 = vreinterpret_u8_u32(pxqx_p3q3.val[1]);
+ p4q4 = vreinterpret_u8_u32(p4q4_p0q0.val[0]);
+ p5q5 = vreinterpret_u8_u32(p5q5_p1q1.val[0]);
+ p6q6 = vreinterpret_u8_u32(p6q6_p2q2.val[0]);
+
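+  // Each pNqN vector now packs pN from the four rows being filtered in its
+  // low half and qN from the same rows in its high half, which is the layout
+  // the lpf_14_neon call below operates on.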
+ lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit,
+ *thresh);
+
+ pxqx_p3q3 = vtrn_u32(pxqx_p3q3.val[0], vreinterpret_u32_u8(p3q3));
+ p5q5_p1q1 = vtrn_u32(vreinterpret_u32_u8(p5q5), vreinterpret_u32_u8(p1q1));
+ p4q4_p0q0 = vtrn_u32(vreinterpret_u32_u8(p4q4), vreinterpret_u32_u8(p0q0));
+ p6q6_p2q2 = vtrn_u32(vreinterpret_u32_u8(p6q6), vreinterpret_u32_u8(p2q2));
+
+ pxqx_p3q3.val[1] = vrev64_u32(pxqx_p3q3.val[1]);
+ p5q5_p1q1.val[1] = vrev64_u32(p5q5_p1q1.val[1]);
+ p4q4_p0q0.val[1] = vrev64_u32(p4q4_p0q0.val[1]);
+ p6q6_p2q2.val[1] = vrev64_u32(p6q6_p2q2.val[1]);
+
+ q0q4 = vreinterpret_u8_u32(p4q4_p0q0.val[1]);
+ q1q5 = vreinterpret_u8_u32(p5q5_p1q1.val[1]);
+ q2q6 = vreinterpret_u8_u32(p6q6_p2q2.val[1]);
+ q3qy = vreinterpret_u8_u32(pxqx_p3q3.val[1]);
+ transpose_elems_inplace_u8_8x4(&q0q4, &q1q5, &q2q6, &q3qy);
+
+ pxp3 = vreinterpret_u8_u32(pxqx_p3q3.val[0]);
+ p6p2 = vreinterpret_u8_u32(p6q6_p2q2.val[0]);
+ p5p1 = vreinterpret_u8_u32(p5q5_p1q1.val[0]);
+ p4p0 = vreinterpret_u8_u32(p4q4_p0q0.val[0]);
+ transpose_elems_inplace_u8_8x4(&pxp3, &p6p2, &p5p1, &p4p0);
+
+ row0 = vcombine_u8(pxp3, q0q4);
+ row1 = vcombine_u8(p6p2, q1q5);
+ row2 = vcombine_u8(p5p1, q2q6);
+ row3 = vcombine_u8(p4p0, q3qy);
+
+ store_u8_16x4(src - 8, stride, row0, row1, row2, row3);
+}
+
+void aom_lpf_vertical_14_dual_neon(
+ uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_vertical_14_neon(s, pitch, blimit0, limit0, thresh0);
+ aom_lpf_vertical_14_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_vertical_14_quad_neon(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ aom_lpf_vertical_14_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit,
+ thresh);
+ aom_lpf_vertical_14_dual_neon(s + 2 * MI_SIZE * pitch, pitch, blimit, limit,
+ thresh, blimit, limit, thresh);
+}
+
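+// Editorial note: MI_SIZE is 4 in AV1, so each *_quad_neon wrapper in this
+// file covers a 16-sample edge with two dual (8-sample) calls placed
+// 2 * MI_SIZE == 8 samples further along the edge (8 rows down for vertical
+// edges, 8 pixels across for horizontal edges); the dual wrappers in turn
+// issue two 4-sample calls spaced 4 samples apart in the same way.
+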
+void aom_lpf_vertical_8_neon(uint8_t *src, int stride, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint32x2x2_t p2q2_p1q1, p3q3_p0q0;
+ uint32x2_t pq_rev;
+ uint8x8_t p3q0, p2q1, p1q2, p0q3;
+ uint8x8_t p0q0, p1q1, p2q2, p3q3;
+
+ // row0: p3 p2 p1 p0 | q0 q1 q2 q3
+ // row1: p3 p2 p1 p0 | q0 q1 q2 q3
+ // row2: p3 p2 p1 p0 | q0 q1 q2 q3
+ // row3: p3 p2 p1 p0 | q0 q1 q2 q3
+ load_u8_8x4(src - 4, stride, &p3q0, &p2q1, &p1q2, &p0q3);
+
+ transpose_elems_inplace_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q3));
+ p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q0), pq_rev);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q2));
+ p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q1), pq_rev);
+
+ p0q0 = vreinterpret_u8_u32(vrev64_u32(p3q3_p0q0.val[1]));
+ p1q1 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
+ p2q2 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
+ p3q3 = vreinterpret_u8_u32(p3q3_p0q0.val[0]);
+
+ lpf_8_neon(&p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q0));
+ p3q3_p0q0 = vtrn_u32(vreinterpret_u32_u8(p3q3), pq_rev);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q1));
+ p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q2), pq_rev);
+
+ p0q3 = vreinterpret_u8_u32(vrev64_u32(p3q3_p0q0.val[1]));
+ p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
+ p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
+ p3q0 = vreinterpret_u8_u32(p3q3_p0q0.val[0]);
+ transpose_elems_inplace_u8_8x4(&p3q0, &p2q1, &p1q2, &p0q3);
+
+ store_u8_8x4(src - 4, stride, p3q0, p2q1, p1q2, p0q3);
+}
+
+void aom_lpf_vertical_8_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_vertical_8_neon(s, pitch, blimit0, limit0, thresh0);
+ aom_lpf_vertical_8_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_vertical_8_quad_neon(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ aom_lpf_vertical_8_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit,
+ thresh);
+ aom_lpf_vertical_8_dual_neon(s + 2 * MI_SIZE * pitch, pitch, blimit, limit,
+ thresh, blimit, limit, thresh);
+}
+
+void aom_lpf_vertical_6_neon(uint8_t *src, int stride, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint32x2x2_t p2q2_p1q1, pxqy_p0q0;
+ uint32x2_t pq_rev;
+ uint8x8_t pxq0, p2q1, p1q2, p0qy;
+ uint8x8_t p0q0, p1q1, p2q2, pxqy;
+
+ // row0: px p2 p1 p0 | q0 q1 q2 qy
+ // row1: px p2 p1 p0 | q0 q1 q2 qy
+ // row2: px p2 p1 p0 | q0 q1 q2 qy
+ // row3: px p2 p1 p0 | q0 q1 q2 qy
+ load_u8_8x4(src - 4, stride, &pxq0, &p2q1, &p1q2, &p0qy);
+
+ transpose_elems_inplace_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p0qy));
+ pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxq0), pq_rev);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q2));
+ p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q1), pq_rev);
+
+ p0q0 = vreinterpret_u8_u32(vrev64_u32(pxqy_p0q0.val[1]));
+ p1q1 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
+ p2q2 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
+ pxqy = vreinterpret_u8_u32(pxqy_p0q0.val[0]);
+
+ lpf_6_neon(&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q0));
+ pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxqy), pq_rev);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q1));
+ p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q2), pq_rev);
+
+ p0qy = vreinterpret_u8_u32(vrev64_u32(pxqy_p0q0.val[1]));
+ p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
+ p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
+ pxq0 = vreinterpret_u8_u32(pxqy_p0q0.val[0]);
+ transpose_elems_inplace_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);
+
+ store_u8_8x4(src - 4, stride, pxq0, p2q1, p1q2, p0qy);
+}
+
+void aom_lpf_vertical_6_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_vertical_6_neon(s, pitch, blimit0, limit0, thresh0);
+ aom_lpf_vertical_6_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_vertical_6_quad_neon(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ aom_lpf_vertical_6_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit,
+ thresh);
+ aom_lpf_vertical_6_dual_neon(s + 2 * MI_SIZE * pitch, pitch, blimit, limit,
+ thresh, blimit, limit, thresh);
+}
+
+void aom_lpf_vertical_4_neon(uint8_t *src, int stride, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint32x2x2_t p1q0_p0q1, p1q1_p0q0, p1p0_q1q0;
+ uint32x2_t pq_rev;
+ uint8x8_t p1p0, q0q1;
+ uint8x8_t p0q0, p1q1;
+
+ // row0: p1 p0 | q0 q1
+ // row1: p1 p0 | q0 q1
+ // row2: p1 p0 | q0 q1
+ // row3: p1 p0 | q0 q1
+ load_unaligned_u8_4x4(src - 2, stride, &p1p0, &q0q1);
+
+ transpose_elems_inplace_u8_4x4(&p1p0, &q0q1);
+
+ p1q0_p0q1 = vtrn_u32(vreinterpret_u32_u8(p1p0), vreinterpret_u32_u8(q0q1));
+
+ pq_rev = vrev64_u32(p1q0_p0q1.val[1]);
+ p1q1_p0q0 = vtrn_u32(p1q0_p0q1.val[0], pq_rev);
+
+ p1q1 = vreinterpret_u8_u32(p1q1_p0q0.val[0]);
+ p0q0 = vreinterpret_u8_u32(p1q1_p0q0.val[1]);
+
+ lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh);
+
+ p1p0_q1q0 = vtrn_u32(vreinterpret_u32_u8(p1q1), vreinterpret_u32_u8(p0q0));
+
+ p1p0 = vreinterpret_u8_u32(p1p0_q1q0.val[0]);
+ q0q1 = vreinterpret_u8_u32(vrev64_u32(p1p0_q1q0.val[1]));
+
+ transpose_elems_inplace_u8_4x4(&p1p0, &q0q1);
+
+ store_u8x4_strided_x2(src - 2, 2 * stride, p1p0);
+ store_u8x4_strided_x2(src + stride - 2, 2 * stride, q0q1);
+}
+
+void aom_lpf_vertical_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_vertical_4_neon(s, pitch, blimit0, limit0, thresh0);
+ aom_lpf_vertical_4_neon(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_vertical_4_quad_neon(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ aom_lpf_vertical_4_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit,
+ thresh);
+ aom_lpf_vertical_4_dual_neon(s + 2 * MI_SIZE * pitch, pitch, blimit, limit,
+ thresh, blimit, limit, thresh);
+}
+
+void aom_lpf_horizontal_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8x8_t p6q6 = load_u8_4x2(src - 7 * stride, 13 * stride);
+ uint8x8_t p5q5 = load_u8_4x2(src - 6 * stride, 11 * stride);
+ uint8x8_t p4q4 = load_u8_4x2(src - 5 * stride, 9 * stride);
+ uint8x8_t p3q3 = load_u8_4x2(src - 4 * stride, 7 * stride);
+ uint8x8_t p2q2 = load_u8_4x2(src - 3 * stride, 5 * stride);
+ uint8x8_t p1q1 = load_u8_4x2(src - 2 * stride, 3 * stride);
+ uint8x8_t p0q0 = load_u8_4x2(src - 1 * stride, 1 * stride);
+
+ lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit,
+ *thresh);
+
+ store_u8x4_strided_x2(src - 1 * stride, 1 * stride, p0q0);
+ store_u8x4_strided_x2(src - 2 * stride, 3 * stride, p1q1);
+ store_u8x4_strided_x2(src - 3 * stride, 5 * stride, p2q2);
+ store_u8x4_strided_x2(src - 4 * stride, 7 * stride, p3q3);
+ store_u8x4_strided_x2(src - 5 * stride, 9 * stride, p4q4);
+ store_u8x4_strided_x2(src - 6 * stride, 11 * stride, p5q5);
+}
+
+void aom_lpf_horizontal_14_dual_neon(
+ uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_horizontal_14_neon(s, pitch, blimit0, limit0, thresh0);
+ aom_lpf_horizontal_14_neon(s + 4, pitch, blimit1, limit1, thresh1);
+}
+
+// TODO(any): Rewrite as a dedicated quad NEON kernel (similar to the quad
+// SSE2 functions) for a better speedup.
+void aom_lpf_horizontal_14_quad_neon(uint8_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh) {
+ aom_lpf_horizontal_14_dual_neon(s, pitch, blimit, limit, thresh, blimit,
+ limit, thresh);
+ aom_lpf_horizontal_14_dual_neon(s + 2 * MI_SIZE, pitch, blimit, limit, thresh,
+ blimit, limit, thresh);
+}
+
+void aom_lpf_horizontal_8_neon(uint8_t *src, int stride, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8x8_t p0q0, p1q1, p2q2, p3q3;
+
+ p3q3 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 4 * stride)));
+ p2q2 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 3 * stride)));
+ p1q1 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 2 * stride)));
+ p0q0 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 1 * stride)));
+ p0q0 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 0 * stride),
+ vreinterpret_u32_u8(p0q0), 1));
+ p1q1 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 1 * stride),
+ vreinterpret_u32_u8(p1q1), 1));
+ p2q2 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 2 * stride),
+ vreinterpret_u32_u8(p2q2), 1));
+ p3q3 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 3 * stride),
+ vreinterpret_u32_u8(p3q3), 1));
+
+ lpf_8_neon(&p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);
+
+ vst1_lane_u32((uint32_t *)(src - 4 * stride), vreinterpret_u32_u8(p3q3), 0);
+ vst1_lane_u32((uint32_t *)(src - 3 * stride), vreinterpret_u32_u8(p2q2), 0);
+ vst1_lane_u32((uint32_t *)(src - 2 * stride), vreinterpret_u32_u8(p1q1), 0);
+ vst1_lane_u32((uint32_t *)(src - 1 * stride), vreinterpret_u32_u8(p0q0), 0);
+ vst1_lane_u32((uint32_t *)(src + 0 * stride), vreinterpret_u32_u8(p0q0), 1);
+ vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1);
+ vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1);
+ vst1_lane_u32((uint32_t *)(src + 3 * stride), vreinterpret_u32_u8(p3q3), 1);
+}
+
+void aom_lpf_horizontal_8_dual_neon(
+ uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_horizontal_8_neon(s, pitch, blimit0, limit0, thresh0);
+ aom_lpf_horizontal_8_neon(s + 4, pitch, blimit1, limit1, thresh1);
+}
+
+// TODO(any): Rewrite as a dedicated quad NEON kernel (similar to the quad
+// SSE2 functions) for a better speedup.
+void aom_lpf_horizontal_8_quad_neon(uint8_t *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh) {
+ aom_lpf_horizontal_8_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit,
+ thresh);
+ aom_lpf_horizontal_8_dual_neon(s + 2 * MI_SIZE, pitch, blimit, limit, thresh,
+ blimit, limit, thresh);
+}
+
+void aom_lpf_horizontal_6_neon(uint8_t *src, int stride, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8x8_t p0q0, p1q1, p2q2;
+
+ p2q2 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 3 * stride)));
+ p1q1 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 2 * stride)));
+ p0q0 = vreinterpret_u8_u32(vld1_dup_u32((uint32_t *)(src - 1 * stride)));
+ p0q0 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 0 * stride),
+ vreinterpret_u32_u8(p0q0), 1));
+ p1q1 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 1 * stride),
+ vreinterpret_u32_u8(p1q1), 1));
+ p2q2 = vreinterpret_u8_u32(vld1_lane_u32((uint32_t *)(src + 2 * stride),
+ vreinterpret_u32_u8(p2q2), 1));
+
+ lpf_6_neon(&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);
+
+ vst1_lane_u32((uint32_t *)(src - 3 * stride), vreinterpret_u32_u8(p2q2), 0);
+ vst1_lane_u32((uint32_t *)(src - 2 * stride), vreinterpret_u32_u8(p1q1), 0);
+ vst1_lane_u32((uint32_t *)(src - 1 * stride), vreinterpret_u32_u8(p0q0), 0);
+ vst1_lane_u32((uint32_t *)(src + 0 * stride), vreinterpret_u32_u8(p0q0), 1);
+ vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1);
+ vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1);
+}
+
+void aom_lpf_horizontal_6_dual_neon(
+ uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_horizontal_6_neon(s, pitch, blimit0, limit0, thresh0);
+ aom_lpf_horizontal_6_neon(s + 4, pitch, blimit1, limit1, thresh1);
+}
+
+// TODO(any): Rewrite as a dedicated quad NEON kernel (similar to the quad
+// SSE2 functions) for a better speedup.
+void aom_lpf_horizontal_6_quad_neon(uint8_t *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh) {
+ aom_lpf_horizontal_6_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit,
+ thresh);
+ aom_lpf_horizontal_6_dual_neon(s + 2 * MI_SIZE, pitch, blimit, limit, thresh,
+ blimit, limit, thresh);
+}
+
+void aom_lpf_horizontal_4_neon(uint8_t *src, int stride, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8x8_t p1q1 = load_u8_4x2(src - 2 * stride, 3 * stride);
+ uint8x8_t p0q0 = load_u8_4x2(src - 1 * stride, 1 * stride);
+
+ lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh);
+
+ store_u8x4_strided_x2(src - 1 * stride, 1 * stride, p0q0);
+ store_u8x4_strided_x2(src - 2 * stride, 3 * stride, p1q1);
+}
+
+void aom_lpf_horizontal_4_dual_neon(
+ uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_horizontal_4_neon(s, pitch, blimit0, limit0, thresh0);
+ aom_lpf_horizontal_4_neon(s + 4, pitch, blimit1, limit1, thresh1);
+}
+
+// TODO(any): Rewrite as a dedicated quad NEON kernel (similar to the quad
+// SSE2 functions) for a better speedup.
+void aom_lpf_horizontal_4_quad_neon(uint8_t *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh) {
+ aom_lpf_horizontal_4_dual_neon(s, pitch, blimit, limit, thresh, blimit, limit,
+ thresh);
+ aom_lpf_horizontal_4_dual_neon(s + 2 * MI_SIZE, pitch, blimit, limit, thresh,
+ blimit, limit, thresh);
+}
diff --git a/third_party/aom/aom_dsp/arm/masked_sad4d_neon.c b/third_party/aom/aom_dsp/arm/masked_sad4d_neon.c
new file mode 100644
index 0000000000..8f65b805ec
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/masked_sad4d_neon.c
@@ -0,0 +1,562 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "mem_neon.h"
+#include "sum_neon.h"
+
+static INLINE uint16x8_t masked_sad_16x1_neon(uint16x8_t sad,
+ const uint8x16_t s0,
+ const uint8x16_t a0,
+ const uint8x16_t b0,
+ const uint8x16_t m0) {
+ uint8x16_t m0_inv = vsubq_u8(vdupq_n_u8(AOM_BLEND_A64_MAX_ALPHA), m0);
+ uint16x8_t blend_u16_lo = vmull_u8(vget_low_u8(m0), vget_low_u8(a0));
+ uint16x8_t blend_u16_hi = vmull_u8(vget_high_u8(m0), vget_high_u8(a0));
+ blend_u16_lo = vmlal_u8(blend_u16_lo, vget_low_u8(m0_inv), vget_low_u8(b0));
+ blend_u16_hi = vmlal_u8(blend_u16_hi, vget_high_u8(m0_inv), vget_high_u8(b0));
+
+ uint8x8_t blend_u8_lo = vrshrn_n_u16(blend_u16_lo, AOM_BLEND_A64_ROUND_BITS);
+ uint8x8_t blend_u8_hi = vrshrn_n_u16(blend_u16_hi, AOM_BLEND_A64_ROUND_BITS);
+ uint8x16_t blend_u8 = vcombine_u8(blend_u8_lo, blend_u8_hi);
+ return vpadalq_u8(sad, vabdq_u8(blend_u8, s0));
+}
+
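+// Editorial sketch, not part of the upstream change (helper name is
+// illustrative): the per-pixel arithmetic masked_sad_16x1_neon performs,
+// written out in scalar form. AOM_BLEND_A64_MAX_ALPHA is 64 and
+// AOM_BLEND_A64_ROUND_BITS is 6, so each blended sample is
+// (m * a + (64 - m) * b + 32) >> 6 and the SAD accumulates |blend - src|.
+static unsigned masked_sad_scalar(const uint8_t *src, const uint8_t *a,
+                                  const uint8_t *b, const uint8_t *m, int n) {
+  unsigned sad = 0;
+  for (int i = 0; i < n; i++) {
+    const int blend = (m[i] * a[i] + (AOM_BLEND_A64_MAX_ALPHA - m[i]) * b[i] +
+                       (1 << (AOM_BLEND_A64_ROUND_BITS - 1))) >>
+                      AOM_BLEND_A64_ROUND_BITS;
+    sad += (unsigned)(blend > src[i] ? blend - src[i] : src[i] - blend);
+  }
+  return sad;
+}
+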
+static INLINE void masked_inv_sadwxhx4d_large_neon(
+ const uint8_t *src, int src_stride, const uint8_t *const ref[4],
+ int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, uint32_t res[4], int width, int height, int h_overflow) {
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ int h_limit = height > h_overflow ? h_overflow : height;
+
+ int ref_offset = 0;
+ int i = 0;
+ do {
+ uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src + j);
+ uint8x16_t p0 = vld1q_u8(second_pred + j);
+ uint8x16_t m0 = vld1q_u8(mask + j);
+ sum_lo[0] = masked_sad_16x1_neon(sum_lo[0], s0, p0,
+ vld1q_u8(ref[0] + ref_offset + j), m0);
+ sum_lo[1] = masked_sad_16x1_neon(sum_lo[1], s0, p0,
+ vld1q_u8(ref[1] + ref_offset + j), m0);
+ sum_lo[2] = masked_sad_16x1_neon(sum_lo[2], s0, p0,
+ vld1q_u8(ref[2] + ref_offset + j), m0);
+ sum_lo[3] = masked_sad_16x1_neon(sum_lo[3], s0, p0,
+ vld1q_u8(ref[3] + ref_offset + j), m0);
+
+ uint8x16_t s1 = vld1q_u8(src + j + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + j + 16);
+ uint8x16_t m1 = vld1q_u8(mask + j + 16);
+ sum_hi[0] = masked_sad_16x1_neon(
+ sum_hi[0], s1, p1, vld1q_u8(ref[0] + ref_offset + j + 16), m1);
+ sum_hi[1] = masked_sad_16x1_neon(
+ sum_hi[1], s1, p1, vld1q_u8(ref[1] + ref_offset + j + 16), m1);
+ sum_hi[2] = masked_sad_16x1_neon(
+ sum_hi[2], s1, p1, vld1q_u8(ref[2] + ref_offset + j + 16), m1);
+ sum_hi[3] = masked_sad_16x1_neon(
+ sum_hi[3], s1, p1, vld1q_u8(ref[3] + ref_offset + j + 16), m1);
+
+ j += 32;
+ } while (j < width);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ second_pred += width;
+ mask += mask_stride;
+ } while (++i < h_limit);
+
+ sum[0] = vpadalq_u16(sum[0], sum_lo[0]);
+ sum[0] = vpadalq_u16(sum[0], sum_hi[0]);
+ sum[1] = vpadalq_u16(sum[1], sum_lo[1]);
+ sum[1] = vpadalq_u16(sum[1], sum_hi[1]);
+ sum[2] = vpadalq_u16(sum[2], sum_lo[2]);
+ sum[2] = vpadalq_u16(sum[2], sum_hi[2]);
+ sum[3] = vpadalq_u16(sum[3], sum_lo[3]);
+ sum[3] = vpadalq_u16(sum[3], sum_hi[3]);
+
+ h_limit += h_overflow;
+ } while (i < height);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+
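+// Editorial note: the h_overflow parameter above bounds how many rows the
+// 16-bit accumulators can absorb before being widened into the 32-bit sums.
+// Each vpadalq_u8 adds at most 2 * 255 = 510 per uint16 lane, and a width-w
+// row updates every lane w / 32 times, so roughly 65535 / (510 * w / 32) rows
+// fit between widenings: 32 for the 128-wide case and 64 for the 64-wide
+// case, matching the values passed by the wrappers below.
+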
+static INLINE void masked_inv_sad128xhx4d_neon(
+ const uint8_t *src, int src_stride, const uint8_t *const ref[4],
+ int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, uint32_t res[4], int h) {
+ masked_inv_sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, second_pred,
+ mask, mask_stride, res, 128, h, 32);
+}
+
+static INLINE void masked_inv_sad64xhx4d_neon(
+ const uint8_t *src, int src_stride, const uint8_t *const ref[4],
+ int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, uint32_t res[4], int h) {
+ masked_inv_sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, second_pred,
+ mask, mask_stride, res, 64, h, 64);
+}
+
+static INLINE void masked_sadwxhx4d_large_neon(
+ const uint8_t *src, int src_stride, const uint8_t *const ref[4],
+ int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, uint32_t res[4], int width, int height, int h_overflow) {
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ int h_limit = height > h_overflow ? h_overflow : height;
+
+ int ref_offset = 0;
+ int i = 0;
+ do {
+ uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src + j);
+ uint8x16_t p0 = vld1q_u8(second_pred + j);
+ uint8x16_t m0 = vld1q_u8(mask + j);
+ sum_lo[0] = masked_sad_16x1_neon(
+ sum_lo[0], s0, vld1q_u8(ref[0] + ref_offset + j), p0, m0);
+ sum_lo[1] = masked_sad_16x1_neon(
+ sum_lo[1], s0, vld1q_u8(ref[1] + ref_offset + j), p0, m0);
+ sum_lo[2] = masked_sad_16x1_neon(
+ sum_lo[2], s0, vld1q_u8(ref[2] + ref_offset + j), p0, m0);
+ sum_lo[3] = masked_sad_16x1_neon(
+ sum_lo[3], s0, vld1q_u8(ref[3] + ref_offset + j), p0, m0);
+
+ uint8x16_t s1 = vld1q_u8(src + j + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + j + 16);
+ uint8x16_t m1 = vld1q_u8(mask + j + 16);
+ sum_hi[0] = masked_sad_16x1_neon(
+ sum_hi[0], s1, vld1q_u8(ref[0] + ref_offset + j + 16), p1, m1);
+ sum_hi[1] = masked_sad_16x1_neon(
+ sum_hi[1], s1, vld1q_u8(ref[1] + ref_offset + j + 16), p1, m1);
+ sum_hi[2] = masked_sad_16x1_neon(
+ sum_hi[2], s1, vld1q_u8(ref[2] + ref_offset + j + 16), p1, m1);
+ sum_hi[3] = masked_sad_16x1_neon(
+ sum_hi[3], s1, vld1q_u8(ref[3] + ref_offset + j + 16), p1, m1);
+
+ j += 32;
+ } while (j < width);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ second_pred += width;
+ mask += mask_stride;
+ } while (++i < h_limit);
+
+ sum[0] = vpadalq_u16(sum[0], sum_lo[0]);
+ sum[0] = vpadalq_u16(sum[0], sum_hi[0]);
+ sum[1] = vpadalq_u16(sum[1], sum_lo[1]);
+ sum[1] = vpadalq_u16(sum[1], sum_hi[1]);
+ sum[2] = vpadalq_u16(sum[2], sum_lo[2]);
+ sum[2] = vpadalq_u16(sum[2], sum_hi[2]);
+ sum[3] = vpadalq_u16(sum[3], sum_lo[3]);
+ sum[3] = vpadalq_u16(sum[3], sum_hi[3]);
+
+ h_limit += h_overflow;
+ } while (i < height);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+
+static INLINE void masked_sad128xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride,
+ const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride,
+ uint32_t res[4], int h) {
+ masked_sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, second_pred,
+ mask, mask_stride, res, 128, h, 32);
+}
+
+static INLINE void masked_sad64xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride,
+ const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride,
+ uint32_t res[4], int h) {
+ masked_sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, second_pred,
+ mask, mask_stride, res, 64, h, 64);
+}
+
+static INLINE void masked_inv_sad32xhx4d_neon(
+ const uint8_t *src, int src_stride, const uint8_t *const ref[4],
+ int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, uint32_t res[4], int h) {
+ uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int ref_offset = 0;
+ int i = h;
+ do {
+ uint8x16_t s0 = vld1q_u8(src);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t m0 = vld1q_u8(mask);
+ sum_lo[0] = masked_sad_16x1_neon(sum_lo[0], s0, p0,
+ vld1q_u8(ref[0] + ref_offset), m0);
+ sum_lo[1] = masked_sad_16x1_neon(sum_lo[1], s0, p0,
+ vld1q_u8(ref[1] + ref_offset), m0);
+ sum_lo[2] = masked_sad_16x1_neon(sum_lo[2], s0, p0,
+ vld1q_u8(ref[2] + ref_offset), m0);
+ sum_lo[3] = masked_sad_16x1_neon(sum_lo[3], s0, p0,
+ vld1q_u8(ref[3] + ref_offset), m0);
+
+ uint8x16_t s1 = vld1q_u8(src + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t m1 = vld1q_u8(mask + 16);
+ sum_hi[0] = masked_sad_16x1_neon(sum_hi[0], s1, p1,
+ vld1q_u8(ref[0] + ref_offset + 16), m1);
+ sum_hi[1] = masked_sad_16x1_neon(sum_hi[1], s1, p1,
+ vld1q_u8(ref[1] + ref_offset + 16), m1);
+ sum_hi[2] = masked_sad_16x1_neon(sum_hi[2], s1, p1,
+ vld1q_u8(ref[2] + ref_offset + 16), m1);
+ sum_hi[3] = masked_sad_16x1_neon(sum_hi[3], s1, p1,
+ vld1q_u8(ref[3] + ref_offset + 16), m1);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ second_pred += 32;
+ mask += mask_stride;
+ } while (--i != 0);
+
+ vst1q_u32(res, horizontal_long_add_4d_u16x8(sum_lo, sum_hi));
+}
+
+static INLINE void masked_sad32xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride,
+ const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride,
+ uint32_t res[4], int h) {
+ uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int ref_offset = 0;
+ int i = h;
+ do {
+ uint8x16_t s0 = vld1q_u8(src);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t m0 = vld1q_u8(mask);
+ sum_lo[0] = masked_sad_16x1_neon(sum_lo[0], s0,
+ vld1q_u8(ref[0] + ref_offset), p0, m0);
+ sum_lo[1] = masked_sad_16x1_neon(sum_lo[1], s0,
+ vld1q_u8(ref[1] + ref_offset), p0, m0);
+ sum_lo[2] = masked_sad_16x1_neon(sum_lo[2], s0,
+ vld1q_u8(ref[2] + ref_offset), p0, m0);
+ sum_lo[3] = masked_sad_16x1_neon(sum_lo[3], s0,
+ vld1q_u8(ref[3] + ref_offset), p0, m0);
+
+ uint8x16_t s1 = vld1q_u8(src + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t m1 = vld1q_u8(mask + 16);
+ sum_hi[0] = masked_sad_16x1_neon(
+ sum_hi[0], s1, vld1q_u8(ref[0] + ref_offset + 16), p1, m1);
+ sum_hi[1] = masked_sad_16x1_neon(
+ sum_hi[1], s1, vld1q_u8(ref[1] + ref_offset + 16), p1, m1);
+ sum_hi[2] = masked_sad_16x1_neon(
+ sum_hi[2], s1, vld1q_u8(ref[2] + ref_offset + 16), p1, m1);
+ sum_hi[3] = masked_sad_16x1_neon(
+ sum_hi[3], s1, vld1q_u8(ref[3] + ref_offset + 16), p1, m1);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ second_pred += 32;
+ mask += mask_stride;
+ } while (--i != 0);
+
+ vst1q_u32(res, horizontal_long_add_4d_u16x8(sum_lo, sum_hi));
+}
+
+static INLINE void masked_inv_sad16xhx4d_neon(
+ const uint8_t *src, int src_stride, const uint8_t *const ref[4],
+ int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, uint32_t res[4], int h) {
+ uint16x8_t sum_u16[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint32x4_t sum_u32[4];
+
+ int ref_offset = 0;
+ int i = h;
+ do {
+ uint8x16_t s0 = vld1q_u8(src);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t m0 = vld1q_u8(mask);
+ sum_u16[0] = masked_sad_16x1_neon(sum_u16[0], s0, p0,
+ vld1q_u8(ref[0] + ref_offset), m0);
+ sum_u16[1] = masked_sad_16x1_neon(sum_u16[1], s0, p0,
+ vld1q_u8(ref[1] + ref_offset), m0);
+ sum_u16[2] = masked_sad_16x1_neon(sum_u16[2], s0, p0,
+ vld1q_u8(ref[2] + ref_offset), m0);
+ sum_u16[3] = masked_sad_16x1_neon(sum_u16[3], s0, p0,
+ vld1q_u8(ref[3] + ref_offset), m0);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ second_pred += 16;
+ mask += mask_stride;
+ } while (--i != 0);
+
+ sum_u32[0] = vpaddlq_u16(sum_u16[0]);
+ sum_u32[1] = vpaddlq_u16(sum_u16[1]);
+ sum_u32[2] = vpaddlq_u16(sum_u16[2]);
+ sum_u32[3] = vpaddlq_u16(sum_u16[3]);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32));
+}
+
+static INLINE void masked_sad16xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride,
+ const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride,
+ uint32_t res[4], int h) {
+ uint16x8_t sum_u16[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint32x4_t sum_u32[4];
+
+ int ref_offset = 0;
+ int i = h;
+ do {
+ uint8x16_t s0 = vld1q_u8(src);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t m0 = vld1q_u8(mask);
+ sum_u16[0] = masked_sad_16x1_neon(sum_u16[0], s0,
+ vld1q_u8(ref[0] + ref_offset), p0, m0);
+ sum_u16[1] = masked_sad_16x1_neon(sum_u16[1], s0,
+ vld1q_u8(ref[1] + ref_offset), p0, m0);
+ sum_u16[2] = masked_sad_16x1_neon(sum_u16[2], s0,
+ vld1q_u8(ref[2] + ref_offset), p0, m0);
+ sum_u16[3] = masked_sad_16x1_neon(sum_u16[3], s0,
+ vld1q_u8(ref[3] + ref_offset), p0, m0);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ second_pred += 16;
+ mask += mask_stride;
+ } while (--i != 0);
+
+ sum_u32[0] = vpaddlq_u16(sum_u16[0]);
+ sum_u32[1] = vpaddlq_u16(sum_u16[1]);
+ sum_u32[2] = vpaddlq_u16(sum_u16[2]);
+ sum_u32[3] = vpaddlq_u16(sum_u16[3]);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32));
+}
+
+static INLINE uint16x8_t masked_sad_8x1_neon(uint16x8_t sad, const uint8x8_t s0,
+ const uint8x8_t a0,
+ const uint8x8_t b0,
+ const uint8x8_t m0) {
+ uint8x8_t m0_inv = vsub_u8(vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA), m0);
+ uint16x8_t blend_u16 = vmull_u8(m0, a0);
+ blend_u16 = vmlal_u8(blend_u16, m0_inv, b0);
+
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend_u16, AOM_BLEND_A64_ROUND_BITS);
+ return vabal_u8(sad, blend_u8, s0);
+}
+
+static INLINE void masked_inv_sad8xhx4d_neon(
+ const uint8_t *src, int src_stride, const uint8_t *const ref[4],
+ int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, uint32_t res[4], int h) {
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int ref_offset = 0;
+ int i = h;
+ do {
+ uint8x8_t s0 = vld1_u8(src);
+ uint8x8_t p0 = vld1_u8(second_pred);
+ uint8x8_t m0 = vld1_u8(mask);
+ sum[0] =
+ masked_sad_8x1_neon(sum[0], s0, p0, vld1_u8(ref[0] + ref_offset), m0);
+ sum[1] =
+ masked_sad_8x1_neon(sum[1], s0, p0, vld1_u8(ref[1] + ref_offset), m0);
+ sum[2] =
+ masked_sad_8x1_neon(sum[2], s0, p0, vld1_u8(ref[2] + ref_offset), m0);
+ sum[3] =
+ masked_sad_8x1_neon(sum[3], s0, p0, vld1_u8(ref[3] + ref_offset), m0);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ second_pred += 8;
+ mask += mask_stride;
+ } while (--i != 0);
+
+ vst1q_u32(res, horizontal_add_4d_u16x8(sum));
+}
+
+static INLINE void masked_sad8xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride,
+ const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride,
+ uint32_t res[4], int h) {
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int ref_offset = 0;
+ int i = h;
+ do {
+ uint8x8_t s0 = vld1_u8(src);
+ uint8x8_t p0 = vld1_u8(second_pred);
+ uint8x8_t m0 = vld1_u8(mask);
+
+ sum[0] =
+ masked_sad_8x1_neon(sum[0], s0, vld1_u8(ref[0] + ref_offset), p0, m0);
+ sum[1] =
+ masked_sad_8x1_neon(sum[1], s0, vld1_u8(ref[1] + ref_offset), p0, m0);
+ sum[2] =
+ masked_sad_8x1_neon(sum[2], s0, vld1_u8(ref[2] + ref_offset), p0, m0);
+ sum[3] =
+ masked_sad_8x1_neon(sum[3], s0, vld1_u8(ref[3] + ref_offset), p0, m0);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ second_pred += 8;
+ mask += mask_stride;
+ } while (--i != 0);
+
+ vst1q_u32(res, horizontal_add_4d_u16x8(sum));
+}
+
+static INLINE void masked_inv_sad4xhx4d_neon(
+ const uint8_t *src, int src_stride, const uint8_t *const ref[4],
+ int ref_stride, const uint8_t *second_pred, const uint8_t *mask,
+ int mask_stride, uint32_t res[4], int h) {
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int ref_offset = 0;
+ int i = h / 2;
+ do {
+ uint8x8_t s = load_unaligned_u8(src, src_stride);
+ uint8x8_t r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride);
+ uint8x8_t r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride);
+ uint8x8_t r2 = load_unaligned_u8(ref[2] + ref_offset, ref_stride);
+ uint8x8_t r3 = load_unaligned_u8(ref[3] + ref_offset, ref_stride);
+ uint8x8_t p0 = vld1_u8(second_pred);
+ uint8x8_t m0 = load_unaligned_u8(mask, mask_stride);
+
+ sum[0] = masked_sad_8x1_neon(sum[0], s, p0, r0, m0);
+ sum[1] = masked_sad_8x1_neon(sum[1], s, p0, r1, m0);
+ sum[2] = masked_sad_8x1_neon(sum[2], s, p0, r2, m0);
+ sum[3] = masked_sad_8x1_neon(sum[3], s, p0, r3, m0);
+
+ src += 2 * src_stride;
+ ref_offset += 2 * ref_stride;
+ second_pred += 2 * 4;
+ mask += 2 * mask_stride;
+ } while (--i != 0);
+
+ vst1q_u32(res, horizontal_add_4d_u16x8(sum));
+}
+
+static INLINE void masked_sad4xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride,
+ const uint8_t *second_pred,
+ const uint8_t *mask, int mask_stride,
+ uint32_t res[4], int h) {
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int ref_offset = 0;
+ int i = h / 2;
+ do {
+ uint8x8_t s = load_unaligned_u8(src, src_stride);
+ uint8x8_t r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride);
+ uint8x8_t r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride);
+ uint8x8_t r2 = load_unaligned_u8(ref[2] + ref_offset, ref_stride);
+ uint8x8_t r3 = load_unaligned_u8(ref[3] + ref_offset, ref_stride);
+ uint8x8_t p0 = vld1_u8(second_pred);
+ uint8x8_t m0 = load_unaligned_u8(mask, mask_stride);
+
+ sum[0] = masked_sad_8x1_neon(sum[0], s, r0, p0, m0);
+ sum[1] = masked_sad_8x1_neon(sum[1], s, r1, p0, m0);
+ sum[2] = masked_sad_8x1_neon(sum[2], s, r2, p0, m0);
+ sum[3] = masked_sad_8x1_neon(sum[3], s, r3, p0, m0);
+
+ src += 2 * src_stride;
+ ref_offset += 2 * ref_stride;
+ second_pred += 2 * 4;
+ mask += 2 * mask_stride;
+ } while (--i != 0);
+
+ vst1q_u32(res, horizontal_add_4d_u16x8(sum));
+}
+
+#define MASKED_SAD4D_WXH_NEON(w, h) \
+ void aom_masked_sad##w##x##h##x4d_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref[4], \
+ int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \
+ int msk_stride, int invert_mask, uint32_t res[4]) { \
+ if (invert_mask) { \
+ masked_inv_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, \
+ second_pred, msk, msk_stride, res, h); \
+ } else { \
+ masked_sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, second_pred, \
+ msk, msk_stride, res, h); \
+ } \
+ }
+
+MASKED_SAD4D_WXH_NEON(4, 8)
+MASKED_SAD4D_WXH_NEON(4, 4)
+
+MASKED_SAD4D_WXH_NEON(8, 16)
+MASKED_SAD4D_WXH_NEON(8, 8)
+MASKED_SAD4D_WXH_NEON(8, 4)
+
+MASKED_SAD4D_WXH_NEON(16, 32)
+MASKED_SAD4D_WXH_NEON(16, 16)
+MASKED_SAD4D_WXH_NEON(16, 8)
+
+MASKED_SAD4D_WXH_NEON(32, 64)
+MASKED_SAD4D_WXH_NEON(32, 32)
+MASKED_SAD4D_WXH_NEON(32, 16)
+
+MASKED_SAD4D_WXH_NEON(64, 128)
+MASKED_SAD4D_WXH_NEON(64, 64)
+MASKED_SAD4D_WXH_NEON(64, 32)
+
+MASKED_SAD4D_WXH_NEON(128, 128)
+MASKED_SAD4D_WXH_NEON(128, 64)
+
+#if !CONFIG_REALTIME_ONLY
+MASKED_SAD4D_WXH_NEON(4, 16)
+MASKED_SAD4D_WXH_NEON(16, 4)
+MASKED_SAD4D_WXH_NEON(8, 32)
+MASKED_SAD4D_WXH_NEON(32, 8)
+MASKED_SAD4D_WXH_NEON(16, 64)
+MASKED_SAD4D_WXH_NEON(64, 16)
+#endif
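+
+// Editorial usage sketch, not part of the upstream change (buffer names are
+// illustrative): the macro above expands to entry points such as
+// aom_masked_sad16x16x4d_neon, which measure one source block against four
+// candidate references in a single call:
+//
+//   uint32_t sad[4];
+//   const uint8_t *refs[4] = { ref0, ref1, ref2, ref3 };
+//   aom_masked_sad16x16x4d_neon(src, src_stride, refs, ref_stride,
+//                               second_pred, mask, mask_stride,
+//                               /*invert_mask=*/0, sad);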
diff --git a/third_party/aom/aom_dsp/arm/masked_sad_neon.c b/third_party/aom/aom_dsp/arm/masked_sad_neon.c
new file mode 100644
index 0000000000..9d263105e3
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/masked_sad_neon.c
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/blend_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_dsp/blend.h"
+
+static INLINE uint16x8_t masked_sad_16x1_neon(uint16x8_t sad,
+ const uint8_t *src,
+ const uint8_t *a,
+ const uint8_t *b,
+ const uint8_t *m) {
+ uint8x16_t m0 = vld1q_u8(m);
+ uint8x16_t a0 = vld1q_u8(a);
+ uint8x16_t b0 = vld1q_u8(b);
+ uint8x16_t s0 = vld1q_u8(src);
+
+ uint8x16_t blend_u8 = alpha_blend_a64_u8x16(m0, a0, b0);
+
+ return vpadalq_u8(sad, vabdq_u8(blend_u8, s0));
+}
+
+static INLINE unsigned masked_sad_128xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *m, int m_stride,
+ int height) {
+ // Eight accumulator vectors are required to avoid overflow in the 128x128
+ // case.
+ assert(height <= 128);
+ uint16x8_t sad[] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0), vdupq_n_u16(0) };
+
+ do {
+ sad[0] = masked_sad_16x1_neon(sad[0], &src[0], &a[0], &b[0], &m[0]);
+ sad[1] = masked_sad_16x1_neon(sad[1], &src[16], &a[16], &b[16], &m[16]);
+ sad[2] = masked_sad_16x1_neon(sad[2], &src[32], &a[32], &b[32], &m[32]);
+ sad[3] = masked_sad_16x1_neon(sad[3], &src[48], &a[48], &b[48], &m[48]);
+ sad[4] = masked_sad_16x1_neon(sad[4], &src[64], &a[64], &b[64], &m[64]);
+ sad[5] = masked_sad_16x1_neon(sad[5], &src[80], &a[80], &b[80], &m[80]);
+ sad[6] = masked_sad_16x1_neon(sad[6], &src[96], &a[96], &b[96], &m[96]);
+ sad[7] = masked_sad_16x1_neon(sad[7], &src[112], &a[112], &b[112], &m[112]);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ height--;
+ } while (height != 0);
+
+ return horizontal_long_add_u16x8(sad[0], sad[1]) +
+ horizontal_long_add_u16x8(sad[2], sad[3]) +
+ horizontal_long_add_u16x8(sad[4], sad[5]) +
+ horizontal_long_add_u16x8(sad[6], sad[7]);
+}
+
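+// Editorial note: the bound above is tight. Each uint16 lane of sad[] absorbs
+// at most 2 * 255 = 510 per row (one vpadalq_u8 over 16 pixels), so 128 rows
+// peak at 128 * 510 = 65280, just inside the uint16 range; with fewer than
+// eight accumulators a 128x128 block could overflow.
+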
+static INLINE unsigned masked_sad_64xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *m, int m_stride,
+ int height) {
+ // Four accumulator vectors are required to avoid overflow in the 64x128 case.
+ assert(height <= 128);
+ uint16x8_t sad[] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ do {
+ sad[0] = masked_sad_16x1_neon(sad[0], &src[0], &a[0], &b[0], &m[0]);
+ sad[1] = masked_sad_16x1_neon(sad[1], &src[16], &a[16], &b[16], &m[16]);
+ sad[2] = masked_sad_16x1_neon(sad[2], &src[32], &a[32], &b[32], &m[32]);
+ sad[3] = masked_sad_16x1_neon(sad[3], &src[48], &a[48], &b[48], &m[48]);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ height--;
+ } while (height != 0);
+
+ return horizontal_long_add_u16x8(sad[0], sad[1]) +
+ horizontal_long_add_u16x8(sad[2], sad[3]);
+}
+
+static INLINE unsigned masked_sad_32xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *m, int m_stride,
+ int height) {
+ // We could use a single accumulator up to height=64 without overflow.
+ assert(height <= 64);
+ uint16x8_t sad = vdupq_n_u16(0);
+
+ do {
+ sad = masked_sad_16x1_neon(sad, &src[0], &a[0], &b[0], &m[0]);
+ sad = masked_sad_16x1_neon(sad, &src[16], &a[16], &b[16], &m[16]);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ height--;
+ } while (height != 0);
+
+ return horizontal_add_u16x8(sad);
+}
+
+static INLINE unsigned masked_sad_16xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *m, int m_stride,
+ int height) {
+ // We could use a single accumulator up to height=128 without overflow.
+ assert(height <= 128);
+ uint16x8_t sad = vdupq_n_u16(0);
+
+ do {
+ sad = masked_sad_16x1_neon(sad, src, a, b, m);
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ height--;
+ } while (height != 0);
+
+ return horizontal_add_u16x8(sad);
+}
+
+static INLINE unsigned masked_sad_8xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *m, int m_stride,
+ int height) {
+ // We could use a single accumulator up to height=128 without overflow.
+ assert(height <= 128);
+ uint16x4_t sad = vdup_n_u16(0);
+
+ do {
+ uint8x8_t m0 = vld1_u8(m);
+ uint8x8_t a0 = vld1_u8(a);
+ uint8x8_t b0 = vld1_u8(b);
+ uint8x8_t s0 = vld1_u8(src);
+
+ uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, a0, b0);
+
+ sad = vpadal_u8(sad, vabd_u8(blend_u8, s0));
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ height--;
+ } while (height != 0);
+
+ return horizontal_add_u16x4(sad);
+}
+
+static INLINE unsigned masked_sad_4xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *m, int m_stride,
+ int height) {
+ // Process two rows per loop iteration.
+ assert(height % 2 == 0);
+
+ // We could use a single accumulator up to height=256 without overflow.
+ assert(height <= 256);
+ uint16x4_t sad = vdup_n_u16(0);
+
+ do {
+ uint8x8_t m0 = load_unaligned_u8(m, m_stride);
+ uint8x8_t a0 = load_unaligned_u8(a, a_stride);
+ uint8x8_t b0 = load_unaligned_u8(b, b_stride);
+ uint8x8_t s0 = load_unaligned_u8(src, src_stride);
+
+ uint8x8_t blend_u8 = alpha_blend_a64_u8x8(m0, a0, b0);
+
+ sad = vpadal_u8(sad, vabd_u8(blend_u8, s0));
+
+ src += 2 * src_stride;
+ a += 2 * a_stride;
+ b += 2 * b_stride;
+ m += 2 * m_stride;
+ height -= 2;
+ } while (height != 0);
+
+ return horizontal_add_u16x4(sad);
+}
+
+#define MASKED_SAD_WXH_NEON(width, height) \
+ unsigned aom_masked_sad##width##x##height##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
+ int invert_mask) { \
+ if (!invert_mask) \
+ return masked_sad_##width##xh_neon(src, src_stride, ref, ref_stride, \
+ second_pred, width, msk, msk_stride, \
+ height); \
+ else \
+ return masked_sad_##width##xh_neon(src, src_stride, second_pred, width, \
+ ref, ref_stride, msk, msk_stride, \
+ height); \
+ }
+
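+// Editorial note: invert_mask only changes which operand the mask weights.
+// The blend is m * a + (64 - m) * b, so the non-inverted path weights the
+// reference by the mask, while the inverted path swaps ref and second_pred
+// (second_pred is stored contiguously, hence its `width` stride).
+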
+MASKED_SAD_WXH_NEON(4, 4)
+MASKED_SAD_WXH_NEON(4, 8)
+MASKED_SAD_WXH_NEON(8, 4)
+MASKED_SAD_WXH_NEON(8, 8)
+MASKED_SAD_WXH_NEON(8, 16)
+MASKED_SAD_WXH_NEON(16, 8)
+MASKED_SAD_WXH_NEON(16, 16)
+MASKED_SAD_WXH_NEON(16, 32)
+MASKED_SAD_WXH_NEON(32, 16)
+MASKED_SAD_WXH_NEON(32, 32)
+MASKED_SAD_WXH_NEON(32, 64)
+MASKED_SAD_WXH_NEON(64, 32)
+MASKED_SAD_WXH_NEON(64, 64)
+MASKED_SAD_WXH_NEON(64, 128)
+MASKED_SAD_WXH_NEON(128, 64)
+MASKED_SAD_WXH_NEON(128, 128)
+#if !CONFIG_REALTIME_ONLY
+MASKED_SAD_WXH_NEON(4, 16)
+MASKED_SAD_WXH_NEON(16, 4)
+MASKED_SAD_WXH_NEON(8, 32)
+MASKED_SAD_WXH_NEON(32, 8)
+MASKED_SAD_WXH_NEON(16, 64)
+MASKED_SAD_WXH_NEON(64, 16)
+#endif
diff --git a/third_party/aom/aom_dsp/arm/mem_neon.h b/third_party/aom/aom_dsp/arm/mem_neon.h
new file mode 100644
index 0000000000..52c7a34e3e
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/mem_neon.h
@@ -0,0 +1,1253 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_MEM_NEON_H_
+#define AOM_AOM_DSP_ARM_MEM_NEON_H_
+
+#include <arm_neon.h>
+#include <string.h>
+#include "aom_dsp/aom_dsp_common.h"
+
+// Support for xN Neon intrinsics is lacking in some compilers.
+#if defined(__arm__) || defined(_M_ARM)
+#define ARM_32_BIT
+#endif
+
+// DEFICIENT_CLANG_32_BIT includes clang-cl.
+#if defined(__clang__) && defined(ARM_32_BIT) && \
+ (__clang_major__ <= 6 || (defined(__ANDROID__) && __clang_major__ <= 7))
+#define DEFICIENT_CLANG_32_BIT // This includes clang-cl.
+#endif
+
+#if defined(__GNUC__) && !defined(__clang__) && defined(ARM_32_BIT)
+#define GCC_32_BIT
+#endif
+
+#if defined(DEFICIENT_CLANG_32_BIT) || defined(GCC_32_BIT)
+
+static INLINE uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
+ uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
+ vld1q_u8(ptr + 2 * 16) } };
+ return res;
+}
+
+static INLINE uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
+ uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
+ return res;
+}
+
+static INLINE uint16x8x2_t vld1q_u16_x2(const uint16_t *ptr) {
+ uint16x8x2_t res = { { vld1q_u16(ptr + 0), vld1q_u16(ptr + 8) } };
+ return res;
+}
+
+static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
+ uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
+ vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
+ return res;
+}
+
+#elif defined(__GNUC__) && !defined(__clang__) // GCC 64-bit.
+#if __GNUC__ < 8
+
+static INLINE uint8x16x2_t vld1q_u8_x2(const uint8_t *ptr) {
+ uint8x16x2_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16) } };
+ return res;
+}
+
+static INLINE uint16x8x4_t vld1q_u16_x4(const uint16_t *ptr) {
+ uint16x8x4_t res = { { vld1q_u16(ptr + 0 * 8), vld1q_u16(ptr + 1 * 8),
+ vld1q_u16(ptr + 2 * 8), vld1q_u16(ptr + 3 * 8) } };
+ return res;
+}
+#endif // __GNUC__ < 8
+
+#if __GNUC__ < 9
+static INLINE uint8x16x3_t vld1q_u8_x3(const uint8_t *ptr) {
+ uint8x16x3_t res = { { vld1q_u8(ptr + 0 * 16), vld1q_u8(ptr + 1 * 16),
+ vld1q_u8(ptr + 2 * 16) } };
+ return res;
+}
+#endif // __GNUC__ < 9
+#endif // defined(__GNUC__) && !defined(__clang__)
+
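+// Editorial note: the definitions above only backfill the multi-vector load
+// intrinsics (vld1q_u8_x2/_x3, vld1q_u16_x2/_x4) that some older GCC and
+// 32-bit Clang toolchains fail to provide; other compilers use the arm_neon.h
+// versions directly. Typical usage (buffer name illustrative):
+//
+//   uint8x16x2_t rows = vld1q_u8_x2(buf);  // loads buf[0] .. buf[31]
+//   uint8x16_t lo = rows.val[0];           // bytes 0-15
+//   uint8x16_t hi = rows.val[1];           // bytes 16-31
+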
+static INLINE void store_u8_8x2(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
+ const uint8x8_t s1) {
+ vst1_u8(s, s0);
+ s += p;
+ vst1_u8(s, s1);
+ s += p;
+}
+
+static INLINE uint8x16_t load_u8_8x2(const uint8_t *s, ptrdiff_t p) {
+ return vcombine_u8(vld1_u8(s), vld1_u8(s + p));
+}
+
+// Load four bytes into the low half of a uint8x8_t, zero the upper half.
+static INLINE uint8x8_t load_u8_4x1(const uint8_t *p) {
+ uint8x8_t ret = vdup_n_u8(0);
+ ret = vreinterpret_u8_u32(
+ vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0));
+ return ret;
+}
+
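+// Load two 4-byte groups, `stride` bytes apart, into the low and high halves
+// of a uint8x8_t. Callers choose the stride freely; the loop filters in this
+// patch use, e.g., load_u8_4x2(src - 7 * stride, 13 * stride) to pack p6 and
+// q6 into a single vector.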
+static INLINE uint8x8_t load_u8_4x2(const uint8_t *p, int stride) {
+ uint8x8_t ret = vdup_n_u8(0);
+ ret = vreinterpret_u8_u32(
+ vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 0));
+ p += stride;
+ ret = vreinterpret_u8_u32(
+ vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u8(ret), 1));
+ return ret;
+}
+
+static INLINE uint16x4_t load_u16_2x2(const uint16_t *p, int stride) {
+ uint16x4_t ret = vdup_n_u16(0);
+ ret = vreinterpret_u16_u32(
+ vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 0));
+ p += stride;
+ ret = vreinterpret_u16_u32(
+ vld1_lane_u32((const uint32_t *)p, vreinterpret_u32_u16(ret), 1));
+ return ret;
+}
+
+static INLINE void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
+ uint8x8_t *const s0, uint8x8_t *const s1,
+ uint8x8_t *const s2, uint8x8_t *const s3,
+ uint8x8_t *const s4, uint8x8_t *const s5,
+ uint8x8_t *const s6, uint8x8_t *const s7) {
+ *s0 = vld1_u8(s);
+ s += p;
+ *s1 = vld1_u8(s);
+ s += p;
+ *s2 = vld1_u8(s);
+ s += p;
+ *s3 = vld1_u8(s);
+ s += p;
+ *s4 = vld1_u8(s);
+ s += p;
+ *s5 = vld1_u8(s);
+ s += p;
+ *s6 = vld1_u8(s);
+ s += p;
+ *s7 = vld1_u8(s);
+}
+
+static INLINE void load_u8_8x7(const uint8_t *s, ptrdiff_t p,
+ uint8x8_t *const s0, uint8x8_t *const s1,
+ uint8x8_t *const s2, uint8x8_t *const s3,
+ uint8x8_t *const s4, uint8x8_t *const s5,
+ uint8x8_t *const s6) {
+ *s0 = vld1_u8(s);
+ s += p;
+ *s1 = vld1_u8(s);
+ s += p;
+ *s2 = vld1_u8(s);
+ s += p;
+ *s3 = vld1_u8(s);
+ s += p;
+ *s4 = vld1_u8(s);
+ s += p;
+ *s5 = vld1_u8(s);
+ s += p;
+ *s6 = vld1_u8(s);
+}
+
+static INLINE void load_u8_8x4(const uint8_t *s, const ptrdiff_t p,
+ uint8x8_t *const s0, uint8x8_t *const s1,
+ uint8x8_t *const s2, uint8x8_t *const s3) {
+ *s0 = vld1_u8(s);
+ s += p;
+ *s1 = vld1_u8(s);
+ s += p;
+ *s2 = vld1_u8(s);
+ s += p;
+ *s3 = vld1_u8(s);
+}
+
+static INLINE void load_u16_4x4(const uint16_t *s, const ptrdiff_t p,
+ uint16x4_t *const s0, uint16x4_t *const s1,
+ uint16x4_t *const s2, uint16x4_t *const s3) {
+ *s0 = vld1_u16(s);
+ s += p;
+ *s1 = vld1_u16(s);
+ s += p;
+ *s2 = vld1_u16(s);
+ s += p;
+ *s3 = vld1_u16(s);
+ s += p;
+}
+
+static INLINE void load_u16_4x7(const uint16_t *s, ptrdiff_t p,
+ uint16x4_t *const s0, uint16x4_t *const s1,
+ uint16x4_t *const s2, uint16x4_t *const s3,
+ uint16x4_t *const s4, uint16x4_t *const s5,
+ uint16x4_t *const s6) {
+ *s0 = vld1_u16(s);
+ s += p;
+ *s1 = vld1_u16(s);
+ s += p;
+ *s2 = vld1_u16(s);
+ s += p;
+ *s3 = vld1_u16(s);
+ s += p;
+ *s4 = vld1_u16(s);
+ s += p;
+ *s5 = vld1_u16(s);
+ s += p;
+ *s6 = vld1_u16(s);
+}
+
+static INLINE void load_s16_8x2(const int16_t *s, const ptrdiff_t p,
+ int16x8_t *const s0, int16x8_t *const s1) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+}
+
+static INLINE void load_u16_8x2(const uint16_t *s, const ptrdiff_t p,
+ uint16x8_t *const s0, uint16x8_t *const s1) {
+ *s0 = vld1q_u16(s);
+ s += p;
+ *s1 = vld1q_u16(s);
+}
+
+static INLINE void load_u16_8x4(const uint16_t *s, const ptrdiff_t p,
+ uint16x8_t *const s0, uint16x8_t *const s1,
+ uint16x8_t *const s2, uint16x8_t *const s3) {
+ *s0 = vld1q_u16(s);
+ s += p;
+ *s1 = vld1q_u16(s);
+ s += p;
+ *s2 = vld1q_u16(s);
+ s += p;
+ *s3 = vld1q_u16(s);
+ s += p;
+}
+
+static INLINE void load_s16_4x12(const int16_t *s, ptrdiff_t p,
+ int16x4_t *const s0, int16x4_t *const s1,
+ int16x4_t *const s2, int16x4_t *const s3,
+ int16x4_t *const s4, int16x4_t *const s5,
+ int16x4_t *const s6, int16x4_t *const s7,
+ int16x4_t *const s8, int16x4_t *const s9,
+ int16x4_t *const s10, int16x4_t *const s11) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+ s += p;
+ *s3 = vld1_s16(s);
+ s += p;
+ *s4 = vld1_s16(s);
+ s += p;
+ *s5 = vld1_s16(s);
+ s += p;
+ *s6 = vld1_s16(s);
+ s += p;
+ *s7 = vld1_s16(s);
+ s += p;
+ *s8 = vld1_s16(s);
+ s += p;
+ *s9 = vld1_s16(s);
+ s += p;
+ *s10 = vld1_s16(s);
+ s += p;
+ *s11 = vld1_s16(s);
+}
+
+static INLINE void load_s16_4x11(const int16_t *s, ptrdiff_t p,
+ int16x4_t *const s0, int16x4_t *const s1,
+ int16x4_t *const s2, int16x4_t *const s3,
+ int16x4_t *const s4, int16x4_t *const s5,
+ int16x4_t *const s6, int16x4_t *const s7,
+ int16x4_t *const s8, int16x4_t *const s9,
+ int16x4_t *const s10) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+ s += p;
+ *s3 = vld1_s16(s);
+ s += p;
+ *s4 = vld1_s16(s);
+ s += p;
+ *s5 = vld1_s16(s);
+ s += p;
+ *s6 = vld1_s16(s);
+ s += p;
+ *s7 = vld1_s16(s);
+ s += p;
+ *s8 = vld1_s16(s);
+ s += p;
+ *s9 = vld1_s16(s);
+ s += p;
+ *s10 = vld1_s16(s);
+}
+
+static INLINE void load_u16_4x11(const uint16_t *s, ptrdiff_t p,
+ uint16x4_t *const s0, uint16x4_t *const s1,
+ uint16x4_t *const s2, uint16x4_t *const s3,
+ uint16x4_t *const s4, uint16x4_t *const s5,
+ uint16x4_t *const s6, uint16x4_t *const s7,
+ uint16x4_t *const s8, uint16x4_t *const s9,
+ uint16x4_t *const s10) {
+ *s0 = vld1_u16(s);
+ s += p;
+ *s1 = vld1_u16(s);
+ s += p;
+ *s2 = vld1_u16(s);
+ s += p;
+ *s3 = vld1_u16(s);
+ s += p;
+ *s4 = vld1_u16(s);
+ s += p;
+ *s5 = vld1_u16(s);
+ s += p;
+ *s6 = vld1_u16(s);
+ s += p;
+ *s7 = vld1_u16(s);
+ s += p;
+ *s8 = vld1_u16(s);
+ s += p;
+ *s9 = vld1_u16(s);
+ s += p;
+ *s10 = vld1_u16(s);
+}
+
+static INLINE void load_s16_4x8(const int16_t *s, ptrdiff_t p,
+ int16x4_t *const s0, int16x4_t *const s1,
+ int16x4_t *const s2, int16x4_t *const s3,
+ int16x4_t *const s4, int16x4_t *const s5,
+ int16x4_t *const s6, int16x4_t *const s7) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+ s += p;
+ *s3 = vld1_s16(s);
+ s += p;
+ *s4 = vld1_s16(s);
+ s += p;
+ *s5 = vld1_s16(s);
+ s += p;
+ *s6 = vld1_s16(s);
+ s += p;
+ *s7 = vld1_s16(s);
+}
+
+static INLINE void load_s16_4x7(const int16_t *s, ptrdiff_t p,
+ int16x4_t *const s0, int16x4_t *const s1,
+ int16x4_t *const s2, int16x4_t *const s3,
+ int16x4_t *const s4, int16x4_t *const s5,
+ int16x4_t *const s6) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+ s += p;
+ *s3 = vld1_s16(s);
+ s += p;
+ *s4 = vld1_s16(s);
+ s += p;
+ *s5 = vld1_s16(s);
+ s += p;
+ *s6 = vld1_s16(s);
+}
+
+static INLINE void load_s16_4x6(const int16_t *s, ptrdiff_t p,
+ int16x4_t *const s0, int16x4_t *const s1,
+ int16x4_t *const s2, int16x4_t *const s3,
+ int16x4_t *const s4, int16x4_t *const s5) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+ s += p;
+ *s3 = vld1_s16(s);
+ s += p;
+ *s4 = vld1_s16(s);
+ s += p;
+ *s5 = vld1_s16(s);
+}
+
+static INLINE void load_s16_4x5(const int16_t *s, ptrdiff_t p,
+ int16x4_t *const s0, int16x4_t *const s1,
+ int16x4_t *const s2, int16x4_t *const s3,
+ int16x4_t *const s4) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+ s += p;
+ *s3 = vld1_s16(s);
+ s += p;
+ *s4 = vld1_s16(s);
+}
+
+static INLINE void load_u16_4x5(const uint16_t *s, const ptrdiff_t p,
+ uint16x4_t *const s0, uint16x4_t *const s1,
+ uint16x4_t *const s2, uint16x4_t *const s3,
+ uint16x4_t *const s4) {
+ *s0 = vld1_u16(s);
+ s += p;
+ *s1 = vld1_u16(s);
+ s += p;
+ *s2 = vld1_u16(s);
+ s += p;
+ *s3 = vld1_u16(s);
+ s += p;
+ *s4 = vld1_u16(s);
+ s += p;
+}
+
+static INLINE void load_u8_8x5(const uint8_t *s, ptrdiff_t p,
+ uint8x8_t *const s0, uint8x8_t *const s1,
+ uint8x8_t *const s2, uint8x8_t *const s3,
+ uint8x8_t *const s4) {
+ *s0 = vld1_u8(s);
+ s += p;
+ *s1 = vld1_u8(s);
+ s += p;
+ *s2 = vld1_u8(s);
+ s += p;
+ *s3 = vld1_u8(s);
+ s += p;
+ *s4 = vld1_u8(s);
+}
+
+static INLINE void load_u16_8x5(const uint16_t *s, const ptrdiff_t p,
+ uint16x8_t *const s0, uint16x8_t *const s1,
+ uint16x8_t *const s2, uint16x8_t *const s3,
+ uint16x8_t *const s4) {
+ *s0 = vld1q_u16(s);
+ s += p;
+ *s1 = vld1q_u16(s);
+ s += p;
+ *s2 = vld1q_u16(s);
+ s += p;
+ *s3 = vld1q_u16(s);
+ s += p;
+ *s4 = vld1q_u16(s);
+ s += p;
+}
+
+static INLINE void load_s16_4x4(const int16_t *s, ptrdiff_t p,
+ int16x4_t *const s0, int16x4_t *const s1,
+ int16x4_t *const s2, int16x4_t *const s3) {
+ *s0 = vld1_s16(s);
+ s += p;
+ *s1 = vld1_s16(s);
+ s += p;
+ *s2 = vld1_s16(s);
+ s += p;
+ *s3 = vld1_s16(s);
+}
+
+static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
+ const uint8x8_t s1, const uint8x8_t s2,
+ const uint8x8_t s3, const uint8x8_t s4,
+ const uint8x8_t s5, const uint8x8_t s6,
+ const uint8x8_t s7) {
+ vst1_u8(s, s0);
+ s += p;
+ vst1_u8(s, s1);
+ s += p;
+ vst1_u8(s, s2);
+ s += p;
+ vst1_u8(s, s3);
+ s += p;
+ vst1_u8(s, s4);
+ s += p;
+ vst1_u8(s, s5);
+ s += p;
+ vst1_u8(s, s6);
+ s += p;
+ vst1_u8(s, s7);
+}
+
+static INLINE void store_u8_8x4(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
+ const uint8x8_t s1, const uint8x8_t s2,
+ const uint8x8_t s3) {
+ vst1_u8(s, s0);
+ s += p;
+ vst1_u8(s, s1);
+ s += p;
+ vst1_u8(s, s2);
+ s += p;
+ vst1_u8(s, s3);
+}
+
+static INLINE void store_u8_16x4(uint8_t *s, ptrdiff_t p, const uint8x16_t s0,
+ const uint8x16_t s1, const uint8x16_t s2,
+ const uint8x16_t s3) {
+ vst1q_u8(s, s0);
+ s += p;
+ vst1q_u8(s, s1);
+ s += p;
+ vst1q_u8(s, s2);
+ s += p;
+ vst1q_u8(s, s3);
+}
+
+static INLINE void store_u16_8x8(uint16_t *s, ptrdiff_t dst_stride,
+ const uint16x8_t s0, const uint16x8_t s1,
+ const uint16x8_t s2, const uint16x8_t s3,
+ const uint16x8_t s4, const uint16x8_t s5,
+ const uint16x8_t s6, const uint16x8_t s7) {
+ vst1q_u16(s, s0);
+ s += dst_stride;
+ vst1q_u16(s, s1);
+ s += dst_stride;
+ vst1q_u16(s, s2);
+ s += dst_stride;
+ vst1q_u16(s, s3);
+ s += dst_stride;
+ vst1q_u16(s, s4);
+ s += dst_stride;
+ vst1q_u16(s, s5);
+ s += dst_stride;
+ vst1q_u16(s, s6);
+ s += dst_stride;
+ vst1q_u16(s, s7);
+}
+
+static INLINE void store_u16_4x4(uint16_t *s, ptrdiff_t dst_stride,
+ const uint16x4_t s0, const uint16x4_t s1,
+ const uint16x4_t s2, const uint16x4_t s3) {
+ vst1_u16(s, s0);
+ s += dst_stride;
+ vst1_u16(s, s1);
+ s += dst_stride;
+ vst1_u16(s, s2);
+ s += dst_stride;
+ vst1_u16(s, s3);
+}
+
+static INLINE void store_u16_8x2(uint16_t *s, ptrdiff_t dst_stride,
+ const uint16x8_t s0, const uint16x8_t s1) {
+ vst1q_u16(s, s0);
+ s += dst_stride;
+ vst1q_u16(s, s1);
+}
+
+static INLINE void store_u16_8x4(uint16_t *s, ptrdiff_t dst_stride,
+ const uint16x8_t s0, const uint16x8_t s1,
+ const uint16x8_t s2, const uint16x8_t s3) {
+ vst1q_u16(s, s0);
+ s += dst_stride;
+ vst1q_u16(s, s1);
+ s += dst_stride;
+ vst1q_u16(s, s2);
+ s += dst_stride;
+ vst1q_u16(s, s3);
+}
+
+static INLINE void store_s16_8x8(int16_t *s, ptrdiff_t dst_stride,
+ const int16x8_t s0, const int16x8_t s1,
+ const int16x8_t s2, const int16x8_t s3,
+ const int16x8_t s4, const int16x8_t s5,
+ const int16x8_t s6, const int16x8_t s7) {
+ vst1q_s16(s, s0);
+ s += dst_stride;
+ vst1q_s16(s, s1);
+ s += dst_stride;
+ vst1q_s16(s, s2);
+ s += dst_stride;
+ vst1q_s16(s, s3);
+ s += dst_stride;
+ vst1q_s16(s, s4);
+ s += dst_stride;
+ vst1q_s16(s, s5);
+ s += dst_stride;
+ vst1q_s16(s, s6);
+ s += dst_stride;
+ vst1q_s16(s, s7);
+}
+
+static INLINE void store_s16_4x4(int16_t *s, ptrdiff_t dst_stride,
+ const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t s2, const int16x4_t s3) {
+ vst1_s16(s, s0);
+ s += dst_stride;
+ vst1_s16(s, s1);
+ s += dst_stride;
+ vst1_s16(s, s2);
+ s += dst_stride;
+ vst1_s16(s, s3);
+}
+
+static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
+ const int16x8_t s0, const int16x8_t s1,
+ const int16x8_t s2, const int16x8_t s3) {
+ vst1q_s16(s, s0);
+ s += dst_stride;
+ vst1q_s16(s, s1);
+ s += dst_stride;
+ vst1q_s16(s, s2);
+ s += dst_stride;
+ vst1q_s16(s, s3);
+}
+
+static INLINE void load_u8_8x11(const uint8_t *s, ptrdiff_t p,
+ uint8x8_t *const s0, uint8x8_t *const s1,
+ uint8x8_t *const s2, uint8x8_t *const s3,
+ uint8x8_t *const s4, uint8x8_t *const s5,
+ uint8x8_t *const s6, uint8x8_t *const s7,
+ uint8x8_t *const s8, uint8x8_t *const s9,
+ uint8x8_t *const s10) {
+ *s0 = vld1_u8(s);
+ s += p;
+ *s1 = vld1_u8(s);
+ s += p;
+ *s2 = vld1_u8(s);
+ s += p;
+ *s3 = vld1_u8(s);
+ s += p;
+ *s4 = vld1_u8(s);
+ s += p;
+ *s5 = vld1_u8(s);
+ s += p;
+ *s6 = vld1_u8(s);
+ s += p;
+ *s7 = vld1_u8(s);
+ s += p;
+ *s8 = vld1_u8(s);
+ s += p;
+ *s9 = vld1_u8(s);
+ s += p;
+ *s10 = vld1_u8(s);
+}
+
+static INLINE void load_s16_8x10(const int16_t *s, ptrdiff_t p,
+ int16x8_t *const s0, int16x8_t *const s1,
+ int16x8_t *const s2, int16x8_t *const s3,
+ int16x8_t *const s4, int16x8_t *const s5,
+ int16x8_t *const s6, int16x8_t *const s7,
+ int16x8_t *const s8, int16x8_t *const s9) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+ s += p;
+ *s4 = vld1q_s16(s);
+ s += p;
+ *s5 = vld1q_s16(s);
+ s += p;
+ *s6 = vld1q_s16(s);
+ s += p;
+ *s7 = vld1q_s16(s);
+ s += p;
+ *s8 = vld1q_s16(s);
+ s += p;
+ *s9 = vld1q_s16(s);
+}
+
+static INLINE void load_s16_8x11(const int16_t *s, ptrdiff_t p,
+ int16x8_t *const s0, int16x8_t *const s1,
+ int16x8_t *const s2, int16x8_t *const s3,
+ int16x8_t *const s4, int16x8_t *const s5,
+ int16x8_t *const s6, int16x8_t *const s7,
+ int16x8_t *const s8, int16x8_t *const s9,
+ int16x8_t *const s10) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+ s += p;
+ *s4 = vld1q_s16(s);
+ s += p;
+ *s5 = vld1q_s16(s);
+ s += p;
+ *s6 = vld1q_s16(s);
+ s += p;
+ *s7 = vld1q_s16(s);
+ s += p;
+ *s8 = vld1q_s16(s);
+ s += p;
+ *s9 = vld1q_s16(s);
+ s += p;
+ *s10 = vld1q_s16(s);
+}
+
+static INLINE void load_s16_8x12(const int16_t *s, ptrdiff_t p,
+ int16x8_t *const s0, int16x8_t *const s1,
+ int16x8_t *const s2, int16x8_t *const s3,
+ int16x8_t *const s4, int16x8_t *const s5,
+ int16x8_t *const s6, int16x8_t *const s7,
+ int16x8_t *const s8, int16x8_t *const s9,
+ int16x8_t *const s10, int16x8_t *const s11) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+ s += p;
+ *s4 = vld1q_s16(s);
+ s += p;
+ *s5 = vld1q_s16(s);
+ s += p;
+ *s6 = vld1q_s16(s);
+ s += p;
+ *s7 = vld1q_s16(s);
+ s += p;
+ *s8 = vld1q_s16(s);
+ s += p;
+ *s9 = vld1q_s16(s);
+ s += p;
+ *s10 = vld1q_s16(s);
+ s += p;
+ *s11 = vld1q_s16(s);
+}
+
+static INLINE void load_u16_8x11(const uint16_t *s, ptrdiff_t p,
+ uint16x8_t *const s0, uint16x8_t *const s1,
+ uint16x8_t *const s2, uint16x8_t *const s3,
+ uint16x8_t *const s4, uint16x8_t *const s5,
+ uint16x8_t *const s6, uint16x8_t *const s7,
+ uint16x8_t *const s8, uint16x8_t *const s9,
+ uint16x8_t *const s10) {
+ *s0 = vld1q_u16(s);
+ s += p;
+ *s1 = vld1q_u16(s);
+ s += p;
+ *s2 = vld1q_u16(s);
+ s += p;
+ *s3 = vld1q_u16(s);
+ s += p;
+ *s4 = vld1q_u16(s);
+ s += p;
+ *s5 = vld1q_u16(s);
+ s += p;
+ *s6 = vld1q_u16(s);
+ s += p;
+ *s7 = vld1q_u16(s);
+ s += p;
+ *s8 = vld1q_u16(s);
+ s += p;
+ *s9 = vld1q_u16(s);
+ s += p;
+ *s10 = vld1q_u16(s);
+}
+
+static INLINE void load_s16_8x8(const int16_t *s, ptrdiff_t p,
+ int16x8_t *const s0, int16x8_t *const s1,
+ int16x8_t *const s2, int16x8_t *const s3,
+ int16x8_t *const s4, int16x8_t *const s5,
+ int16x8_t *const s6, int16x8_t *const s7) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+ s += p;
+ *s4 = vld1q_s16(s);
+ s += p;
+ *s5 = vld1q_s16(s);
+ s += p;
+ *s6 = vld1q_s16(s);
+ s += p;
+ *s7 = vld1q_s16(s);
+}
+
+static INLINE void load_u16_8x7(const uint16_t *s, ptrdiff_t p,
+ uint16x8_t *const s0, uint16x8_t *const s1,
+ uint16x8_t *const s2, uint16x8_t *const s3,
+ uint16x8_t *const s4, uint16x8_t *const s5,
+ uint16x8_t *const s6) {
+ *s0 = vld1q_u16(s);
+ s += p;
+ *s1 = vld1q_u16(s);
+ s += p;
+ *s2 = vld1q_u16(s);
+ s += p;
+ *s3 = vld1q_u16(s);
+ s += p;
+ *s4 = vld1q_u16(s);
+ s += p;
+ *s5 = vld1q_u16(s);
+ s += p;
+ *s6 = vld1q_u16(s);
+}
+
+static INLINE void load_s16_8x7(const int16_t *s, ptrdiff_t p,
+ int16x8_t *const s0, int16x8_t *const s1,
+ int16x8_t *const s2, int16x8_t *const s3,
+ int16x8_t *const s4, int16x8_t *const s5,
+ int16x8_t *const s6) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+ s += p;
+ *s4 = vld1q_s16(s);
+ s += p;
+ *s5 = vld1q_s16(s);
+ s += p;
+ *s6 = vld1q_s16(s);
+}
+
+static INLINE void load_s16_8x6(const int16_t *s, ptrdiff_t p,
+ int16x8_t *const s0, int16x8_t *const s1,
+ int16x8_t *const s2, int16x8_t *const s3,
+ int16x8_t *const s4, int16x8_t *const s5) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+ s += p;
+ *s4 = vld1q_s16(s);
+ s += p;
+ *s5 = vld1q_s16(s);
+}
+
+static INLINE void load_s16_8x5(const int16_t *s, ptrdiff_t p,
+ int16x8_t *const s0, int16x8_t *const s1,
+ int16x8_t *const s2, int16x8_t *const s3,
+ int16x8_t *const s4) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+ s += p;
+ *s4 = vld1q_s16(s);
+}
+
+static INLINE void load_s16_8x4(const int16_t *s, ptrdiff_t p,
+ int16x8_t *const s0, int16x8_t *const s1,
+ int16x8_t *const s2, int16x8_t *const s3) {
+ *s0 = vld1q_s16(s);
+ s += p;
+ *s1 = vld1q_s16(s);
+ s += p;
+ *s2 = vld1q_s16(s);
+ s += p;
+ *s3 = vld1q_s16(s);
+}
+
+// Load 2 sets of 4 bytes when alignment is not guaranteed.
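+// memcpy is used for these potentially unaligned accesses to avoid undefined
+// behaviour; compilers typically lower each call to a single scalar load.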
+static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) {
+ uint32_t a;
+ memcpy(&a, buf, 4);
+ buf += stride;
+ uint32x2_t a_u32 = vdup_n_u32(a);
+ memcpy(&a, buf, 4);
+ a_u32 = vset_lane_u32(a, a_u32, 1);
+ return vreinterpret_u8_u32(a_u32);
+}
+
+// Load 4 sets of 4 bytes when alignment is not guaranteed.
+static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
+ uint32_t a;
+ uint32x4_t a_u32;
+ if (stride == 4) return vld1q_u8(buf);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vdupq_n_u32(a);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vsetq_lane_u32(a, a_u32, 1);
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vsetq_lane_u32(a, a_u32, 2);
+ memcpy(&a, buf, 4);
+ a_u32 = vsetq_lane_u32(a, a_u32, 3);
+ return vreinterpretq_u8_u32(a_u32);
+}
+
+static INLINE uint8x8_t load_unaligned_u8_2x2(const uint8_t *buf, int stride) {
+ uint16_t a;
+ uint16x4_t a_u16;
+
+ memcpy(&a, buf, 2);
+ buf += stride;
+ a_u16 = vdup_n_u16(a);
+ memcpy(&a, buf, 2);
+ a_u16 = vset_lane_u16(a, a_u16, 1);
+ return vreinterpret_u8_u16(a_u16);
+}
+
+static INLINE uint8x8_t load_unaligned_u8_4x1(const uint8_t *buf) {
+ uint32_t a;
+ uint32x2_t a_u32;
+
+ memcpy(&a, buf, 4);
+ a_u32 = vdup_n_u32(0);
+ a_u32 = vset_lane_u32(a, a_u32, 0);
+ return vreinterpret_u8_u32(a_u32);
+}
+
+static INLINE uint8x8_t load_unaligned_dup_u8_4x2(const uint8_t *buf) {
+ uint32_t a;
+ uint32x2_t a_u32;
+
+ memcpy(&a, buf, 4);
+ a_u32 = vdup_n_u32(a);
+ return vreinterpret_u8_u32(a_u32);
+}
+
+static INLINE uint8x8_t load_unaligned_dup_u8_2x4(const uint8_t *buf) {
+ uint16_t a;
+  uint16x4_t a_u16;
+
+  memcpy(&a, buf, 2);
+  a_u16 = vdup_n_u16(a);
+  return vreinterpret_u8_u16(a_u16);
+}
+
+static INLINE uint8x8_t load_unaligned_u8_4x2(const uint8_t *buf, int stride) {
+ uint32_t a;
+ uint32x2_t a_u32;
+
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vdup_n_u32(a);
+ memcpy(&a, buf, 4);
+ a_u32 = vset_lane_u32(a, a_u32, 1);
+ return vreinterpret_u8_u32(a_u32);
+}
+
+static INLINE void load_unaligned_u8_4x4(const uint8_t *buf, int stride,
+ uint8x8_t *tu0, uint8x8_t *tu1) {
+ *tu0 = load_unaligned_u8_4x2(buf, stride);
+ buf += 2 * stride;
+ *tu1 = load_unaligned_u8_4x2(buf, stride);
+}
+
+static INLINE void load_unaligned_u8_3x8(const uint8_t *buf, int stride,
+ uint8x8_t *tu0, uint8x8_t *tu1,
+ uint8x8_t *tu2) {
+ load_unaligned_u8_4x4(buf, stride, tu0, tu1);
+ buf += 4 * stride;
+ *tu2 = load_unaligned_u8_4x2(buf, stride);
+}
+
+static INLINE void load_unaligned_u8_4x8(const uint8_t *buf, int stride,
+ uint8x8_t *tu0, uint8x8_t *tu1,
+ uint8x8_t *tu2, uint8x8_t *tu3) {
+ load_unaligned_u8_4x4(buf, stride, tu0, tu1);
+ buf += 4 * stride;
+ load_unaligned_u8_4x4(buf, stride, tu2, tu3);
+}
+
+static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p,
+ uint8x16_t *const s0, uint8x16_t *const s1,
+ uint8x16_t *const s2, uint8x16_t *const s3,
+ uint8x16_t *const s4, uint8x16_t *const s5,
+ uint8x16_t *const s6, uint8x16_t *const s7) {
+ *s0 = vld1q_u8(s);
+ s += p;
+ *s1 = vld1q_u8(s);
+ s += p;
+ *s2 = vld1q_u8(s);
+ s += p;
+ *s3 = vld1q_u8(s);
+ s += p;
+ *s4 = vld1q_u8(s);
+ s += p;
+ *s5 = vld1q_u8(s);
+ s += p;
+ *s6 = vld1q_u8(s);
+ s += p;
+ *s7 = vld1q_u8(s);
+}
+
+static INLINE void load_u8_16x4(const uint8_t *s, ptrdiff_t p,
+ uint8x16_t *const s0, uint8x16_t *const s1,
+ uint8x16_t *const s2, uint8x16_t *const s3) {
+ *s0 = vld1q_u8(s);
+ s += p;
+ *s1 = vld1q_u8(s);
+ s += p;
+ *s2 = vld1q_u8(s);
+ s += p;
+ *s3 = vld1q_u8(s);
+}
+
+static INLINE void load_u16_8x8(const uint16_t *s, const ptrdiff_t p,
+ uint16x8_t *s0, uint16x8_t *s1, uint16x8_t *s2,
+ uint16x8_t *s3, uint16x8_t *s4, uint16x8_t *s5,
+ uint16x8_t *s6, uint16x8_t *s7) {
+ *s0 = vld1q_u16(s);
+ s += p;
+ *s1 = vld1q_u16(s);
+ s += p;
+ *s2 = vld1q_u16(s);
+ s += p;
+ *s3 = vld1q_u16(s);
+ s += p;
+ *s4 = vld1q_u16(s);
+ s += p;
+ *s5 = vld1q_u16(s);
+ s += p;
+ *s6 = vld1q_u16(s);
+ s += p;
+ *s7 = vld1q_u16(s);
+}
+
+static INLINE void load_u16_16x4(const uint16_t *s, ptrdiff_t p,
+ uint16x8_t *const s0, uint16x8_t *const s1,
+ uint16x8_t *const s2, uint16x8_t *const s3,
+ uint16x8_t *const s4, uint16x8_t *const s5,
+ uint16x8_t *const s6, uint16x8_t *const s7) {
+ *s0 = vld1q_u16(s);
+ *s1 = vld1q_u16(s + 8);
+ s += p;
+ *s2 = vld1q_u16(s);
+ *s3 = vld1q_u16(s + 8);
+ s += p;
+ *s4 = vld1q_u16(s);
+ *s5 = vld1q_u16(s + 8);
+ s += p;
+ *s6 = vld1q_u16(s);
+ *s7 = vld1q_u16(s + 8);
+}
+
+static INLINE uint16x4_t load_unaligned_u16_2x2(const uint16_t *buf,
+ int stride) {
+ uint32_t a;
+ uint32x2_t a_u32;
+
+ memcpy(&a, buf, 4);
+ buf += stride;
+ a_u32 = vdup_n_u32(a);
+ memcpy(&a, buf, 4);
+ a_u32 = vset_lane_u32(a, a_u32, 1);
+ return vreinterpret_u16_u32(a_u32);
+}
+
+static INLINE uint16x4_t load_unaligned_u16_4x1(const uint16_t *buf) {
+ uint64_t a;
+ uint64x1_t a_u64 = vdup_n_u64(0);
+ memcpy(&a, buf, 8);
+ a_u64 = vset_lane_u64(a, a_u64, 0);
+ return vreinterpret_u16_u64(a_u64);
+}
+
+static INLINE uint16x8_t load_unaligned_u16_4x2(const uint16_t *buf,
+ uint32_t stride) {
+ uint64_t a;
+ uint64x2_t a_u64;
+
+ memcpy(&a, buf, 8);
+ buf += stride;
+ a_u64 = vdupq_n_u64(0);
+ a_u64 = vsetq_lane_u64(a, a_u64, 0);
+ memcpy(&a, buf, 8);
+ buf += stride;
+ a_u64 = vsetq_lane_u64(a, a_u64, 1);
+ return vreinterpretq_u16_u64(a_u64);
+}
+
+static INLINE void load_unaligned_u16_4x4(const uint16_t *buf, uint32_t stride,
+ uint16x8_t *tu0, uint16x8_t *tu1) {
+ *tu0 = load_unaligned_u16_4x2(buf, stride);
+ buf += 2 * stride;
+ *tu1 = load_unaligned_u16_4x2(buf, stride);
+}
+
+static INLINE void load_s32_4x4(int32_t *s, int32_t p, int32x4_t *s1,
+ int32x4_t *s2, int32x4_t *s3, int32x4_t *s4) {
+ *s1 = vld1q_s32(s);
+ s += p;
+ *s2 = vld1q_s32(s);
+ s += p;
+ *s3 = vld1q_s32(s);
+ s += p;
+ *s4 = vld1q_s32(s);
+}
+
+static INLINE void store_s32_4x4(int32_t *s, int32_t p, int32x4_t s1,
+ int32x4_t s2, int32x4_t s3, int32x4_t s4) {
+ vst1q_s32(s, s1);
+ s += p;
+ vst1q_s32(s, s2);
+ s += p;
+ vst1q_s32(s, s3);
+ s += p;
+ vst1q_s32(s, s4);
+}
+
+static INLINE void load_u32_4x4(uint32_t *s, int32_t p, uint32x4_t *s1,
+ uint32x4_t *s2, uint32x4_t *s3,
+ uint32x4_t *s4) {
+ *s1 = vld1q_u32(s);
+ s += p;
+ *s2 = vld1q_u32(s);
+ s += p;
+ *s3 = vld1q_u32(s);
+ s += p;
+ *s4 = vld1q_u32(s);
+}
+
+static INLINE void store_u32_4x4(uint32_t *s, int32_t p, uint32x4_t s1,
+ uint32x4_t s2, uint32x4_t s3, uint32x4_t s4) {
+ vst1q_u32(s, s1);
+ s += p;
+ vst1q_u32(s, s2);
+ s += p;
+ vst1q_u32(s, s3);
+ s += p;
+ vst1q_u32(s, s4);
+}
+
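+// Narrow eight 32-bit tran_low_t coefficients to a single int16x8_t vector.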
+static INLINE int16x8_t load_tran_low_to_s16q(const tran_low_t *buf) {
+ const int32x4_t v0 = vld1q_s32(buf);
+ const int32x4_t v1 = vld1q_s32(buf + 4);
+ const int16x4_t s0 = vmovn_s32(v0);
+ const int16x4_t s1 = vmovn_s32(v1);
+ return vcombine_s16(s0, s1);
+}
+
+static INLINE void store_s16q_to_tran_low(tran_low_t *buf, const int16x8_t a) {
+ const int32x4_t v0 = vmovl_s16(vget_low_s16(a));
+ const int32x4_t v1 = vmovl_s16(vget_high_s16(a));
+ vst1q_s32(buf, v0);
+ vst1q_s32(buf + 4, v1);
+}
+
+static INLINE void store_s16_to_tran_low(tran_low_t *buf, const int16x4_t a) {
+ const int32x4_t v0 = vmovl_s16(a);
+ vst1q_s32(buf, v0);
+}
+
+static INLINE uint8x8_t load_u8_gather_s16_x8(const uint8_t *src,
+ int16x8_t indices) {
+ // Recent Clang and GCC versions correctly identify that this zero-broadcast
+ // is redundant. Alternatively we could load and broadcast the zeroth element
+ // and then replace the other lanes, however this is slower than loading a
+ // single element without broadcast on some micro-architectures.
+ uint8x8_t ret = vdup_n_u8(0);
+ ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 0), ret, 0);
+ ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 1), ret, 1);
+ ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 2), ret, 2);
+ ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 3), ret, 3);
+ ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 0), ret, 4);
+ ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 1), ret, 5);
+ ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 2), ret, 6);
+ ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 3), ret, 7);
+ return ret;
+}
+
+// The `lane` parameter here must be an immediate.
+#define store_u8_2x1_lane(dst, src, lane) \
+ do { \
+ uint16_t a = vget_lane_u16(vreinterpret_u16_u8(src), lane); \
+ memcpy(dst, &a, 2); \
+ } while (0)
+
+#define store_u8_4x1_lane(dst, src, lane) \
+ do { \
+ uint32_t a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \
+ memcpy(dst, &a, 4); \
+ } while (0)
+
+#define store_u16_2x1_lane(dst, src, lane) \
+ do { \
+ uint32_t a = vget_lane_u32(vreinterpret_u32_u16(src), lane); \
+ memcpy(dst, &a, 4); \
+ } while (0)
+
+#define store_u16_4x1_lane(dst, src, lane) \
+ do { \
+ uint64_t a = vgetq_lane_u64(vreinterpretq_u64_u16(src), lane); \
+ memcpy(dst, &a, 8); \
+ } while (0)
+
+// Store the low 16-bits from a single vector.
+static INLINE void store_u8_2x1(uint8_t *dst, const uint8x8_t src) {
+ store_u8_2x1_lane(dst, src, 0);
+}
+
+// Store the low 32-bits from a single vector.
+static INLINE void store_u8_4x1(uint8_t *dst, const uint8x8_t src) {
+ store_u8_4x1_lane(dst, src, 0);
+}
+
+// Store two blocks of 16-bits from a single vector.
+static INLINE void store_u8x2_strided_x2(uint8_t *dst, uint32_t dst_stride,
+ uint8x8_t src) {
+ store_u8_2x1_lane(dst, src, 0);
+ dst += dst_stride;
+ store_u8_2x1_lane(dst, src, 1);
+}
+
+// Store two blocks of 32-bits from a single vector.
+static INLINE void store_u8x4_strided_x2(uint8_t *dst, ptrdiff_t stride,
+ uint8x8_t src) {
+ store_u8_4x1_lane(dst, src, 0);
+ dst += stride;
+ store_u8_4x1_lane(dst, src, 1);
+}
+
+// Store four blocks of 32-bits from a single vector.
+static INLINE void store_u8x4_strided_x4(uint8_t *dst, ptrdiff_t stride,
+ uint8x16_t src) {
+ store_u8_4x1_lane(dst, vget_low_u8(src), 0);
+ dst += stride;
+ store_u8_4x1_lane(dst, vget_low_u8(src), 1);
+ dst += stride;
+ store_u8_4x1_lane(dst, vget_high_u8(src), 0);
+ dst += stride;
+ store_u8_4x1_lane(dst, vget_high_u8(src), 1);
+}
+
+// Store the low 32-bits from a single vector.
+static INLINE void store_u16_2x1(uint16_t *dst, const uint16x4_t src) {
+ store_u16_2x1_lane(dst, src, 0);
+}
+
+// Store two blocks of 32-bits from a single vector.
+static INLINE void store_u16x2_strided_x2(uint16_t *dst, uint32_t dst_stride,
+ uint16x4_t src) {
+ store_u16_2x1_lane(dst, src, 0);
+ dst += dst_stride;
+ store_u16_2x1_lane(dst, src, 1);
+}
+
+// Store two blocks of 64-bits from a single vector.
+static INLINE void store_u16x4_strided_x2(uint16_t *dst, uint32_t dst_stride,
+ uint16x8_t src) {
+ store_u16_4x1_lane(dst, src, 0);
+ dst += dst_stride;
+ store_u16_4x1_lane(dst, src, 1);
+}
+
+#undef store_u8_2x1_lane
+#undef store_u8_4x1_lane
+#undef store_u16_2x1_lane
+#undef store_u16_4x1_lane
+
+#endif // AOM_AOM_DSP_ARM_MEM_NEON_H_
diff --git a/third_party/aom/aom_dsp/arm/obmc_sad_neon.c b/third_party/aom/aom_dsp/arm/obmc_sad_neon.c
new file mode 100644
index 0000000000..a692cbb388
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/obmc_sad_neon.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "mem_neon.h"
+#include "sum_neon.h"
+
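+// Process one row of 8 elements: compute |wsrc - mask * ref| for each element
+// and add the results to *sum with a rounding shift right by 12
+// (vrsraq_n_u32).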
+static INLINE void obmc_sad_8x1_s16_neon(int16x8_t ref_s16, const int32_t *mask,
+ const int32_t *wsrc, uint32x4_t *sum) {
+ int32x4_t wsrc_lo = vld1q_s32(wsrc);
+ int32x4_t wsrc_hi = vld1q_s32(wsrc + 4);
+
+ int32x4_t mask_lo = vld1q_s32(mask);
+ int32x4_t mask_hi = vld1q_s32(mask + 4);
+
+ int16x8_t mask_s16 =
+ vuzpq_s16(vreinterpretq_s16_s32(mask_lo), vreinterpretq_s16_s32(mask_hi))
+ .val[0];
+
+ int32x4_t pre_lo = vmull_s16(vget_low_s16(ref_s16), vget_low_s16(mask_s16));
+ int32x4_t pre_hi = vmull_s16(vget_high_s16(ref_s16), vget_high_s16(mask_s16));
+
+ uint32x4_t abs_lo = vreinterpretq_u32_s32(vabdq_s32(wsrc_lo, pre_lo));
+ uint32x4_t abs_hi = vreinterpretq_u32_s32(vabdq_s32(wsrc_hi, pre_hi));
+
+ *sum = vrsraq_n_u32(*sum, abs_lo, 12);
+ *sum = vrsraq_n_u32(*sum, abs_hi, 12);
+}
+
+#if AOM_ARCH_AARCH64
+
+// Use tbl for doing a double-width zero extension from 8->32 bits since we can
+// do this in one instruction rather than two (indices out of range (255 here)
+// are set to zero by tbl).
+DECLARE_ALIGNED(16, static const uint8_t, obmc_variance_permute_idx[]) = {
+ 0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255,
+ 4, 255, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255, 7, 255, 255, 255,
+ 8, 255, 255, 255, 9, 255, 255, 255, 10, 255, 255, 255, 11, 255, 255, 255,
+ 12, 255, 255, 255, 13, 255, 255, 255, 14, 255, 255, 255, 15, 255, 255, 255
+};
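+// For example, vqtbl1q_u8 with the first 16 indices above maps input bytes
+// { b0, b1, ..., b15 } to { b0, 0, 0, 0, b1, 0, 0, 0, b2, 0, 0, 0, b3, 0, 0,
+// 0 }, i.e. bytes 0-3 each zero-extended to a 32-bit lane.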
+
+static INLINE void obmc_sad_8x1_s32_neon(uint32x4_t ref_u32_lo,
+ uint32x4_t ref_u32_hi,
+ const int32_t *mask,
+ const int32_t *wsrc,
+ uint32x4_t sum[2]) {
+ int32x4_t wsrc_lo = vld1q_s32(wsrc);
+ int32x4_t wsrc_hi = vld1q_s32(wsrc + 4);
+ int32x4_t mask_lo = vld1q_s32(mask);
+ int32x4_t mask_hi = vld1q_s32(mask + 4);
+
+ int32x4_t pre_lo = vmulq_s32(vreinterpretq_s32_u32(ref_u32_lo), mask_lo);
+ int32x4_t pre_hi = vmulq_s32(vreinterpretq_s32_u32(ref_u32_hi), mask_hi);
+
+ uint32x4_t abs_lo = vreinterpretq_u32_s32(vabdq_s32(wsrc_lo, pre_lo));
+ uint32x4_t abs_hi = vreinterpretq_u32_s32(vabdq_s32(wsrc_hi, pre_hi));
+
+ sum[0] = vrsraq_n_u32(sum[0], abs_lo, 12);
+ sum[1] = vrsraq_n_u32(sum[1], abs_hi, 12);
+}
+
+static INLINE unsigned int obmc_sad_large_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int width,
+ int height) {
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ // Use tbl for doing a double-width zero extension from 8->32 bits since we
+ // can do this in one instruction rather than two.
+ uint8x16_t pre_idx0 = vld1q_u8(&obmc_variance_permute_idx[0]);
+ uint8x16_t pre_idx1 = vld1q_u8(&obmc_variance_permute_idx[16]);
+ uint8x16_t pre_idx2 = vld1q_u8(&obmc_variance_permute_idx[32]);
+ uint8x16_t pre_idx3 = vld1q_u8(&obmc_variance_permute_idx[48]);
+
+ int h = height;
+ do {
+ int w = width;
+ const uint8_t *ref_ptr = ref;
+ do {
+ uint8x16_t r = vld1q_u8(ref_ptr);
+
+ uint32x4_t ref_u32_lo = vreinterpretq_u32_u8(vqtbl1q_u8(r, pre_idx0));
+ uint32x4_t ref_u32_hi = vreinterpretq_u32_u8(vqtbl1q_u8(r, pre_idx1));
+ obmc_sad_8x1_s32_neon(ref_u32_lo, ref_u32_hi, mask, wsrc, sum);
+
+ ref_u32_lo = vreinterpretq_u32_u8(vqtbl1q_u8(r, pre_idx2));
+ ref_u32_hi = vreinterpretq_u32_u8(vqtbl1q_u8(r, pre_idx3));
+ obmc_sad_8x1_s32_neon(ref_u32_lo, ref_u32_hi, mask + 8, wsrc + 8, sum);
+
+ ref_ptr += 16;
+ wsrc += 16;
+ mask += 16;
+ w -= 16;
+ } while (w != 0);
+
+ ref += ref_stride;
+ } while (--h != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+#else // !AOM_ARCH_AARCH64
+
+static INLINE unsigned int obmc_sad_large_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int width,
+ int height) {
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ int h = height;
+ do {
+ int w = width;
+ const uint8_t *ref_ptr = ref;
+ do {
+ uint8x16_t r = vld1q_u8(ref_ptr);
+
+ int16x8_t ref_s16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(r)));
+ obmc_sad_8x1_s16_neon(ref_s16, mask, wsrc, &sum);
+
+ ref_s16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(r)));
+ obmc_sad_8x1_s16_neon(ref_s16, mask + 8, wsrc + 8, &sum);
+
+ ref_ptr += 16;
+ wsrc += 16;
+ mask += 16;
+ w -= 16;
+ } while (w != 0);
+
+ ref += ref_stride;
+ } while (--h != 0);
+
+ return horizontal_add_u32x4(sum);
+}
+
+#endif // AOM_ARCH_AARCH64
+
+static INLINE unsigned int obmc_sad_128xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h) {
+ return obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 128, h);
+}
+
+static INLINE unsigned int obmc_sad_64xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h) {
+ return obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 64, h);
+}
+
+static INLINE unsigned int obmc_sad_32xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h) {
+ return obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 32, h);
+}
+
+static INLINE unsigned int obmc_sad_16xh_neon(const uint8_t *ref,
+ int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h) {
+ return obmc_sad_large_neon(ref, ref_stride, wsrc, mask, 16, h);
+}
+
+static INLINE unsigned int obmc_sad_8xh_neon(const uint8_t *ref, int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int height) {
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ int h = height;
+ do {
+ uint8x8_t r = vld1_u8(ref);
+
+ int16x8_t ref_s16 = vreinterpretq_s16_u16(vmovl_u8(r));
+ obmc_sad_8x1_s16_neon(ref_s16, mask, wsrc, &sum);
+
+ ref += ref_stride;
+ wsrc += 8;
+ mask += 8;
+ } while (--h != 0);
+
+ return horizontal_add_u32x4(sum);
+}
+
+static INLINE unsigned int obmc_sad_4xh_neon(const uint8_t *ref, int ref_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int height) {
+ uint32x4_t sum = vdupq_n_u32(0);
+
+ int h = height / 2;
+ do {
+ uint8x8_t r = load_unaligned_u8(ref, ref_stride);
+
+ int16x8_t ref_s16 = vreinterpretq_s16_u16(vmovl_u8(r));
+ obmc_sad_8x1_s16_neon(ref_s16, mask, wsrc, &sum);
+
+ ref += 2 * ref_stride;
+ wsrc += 8;
+ mask += 8;
+ } while (--h != 0);
+
+ return horizontal_add_u32x4(sum);
+}
+
+#define OBMC_SAD_WXH_NEON(w, h) \
+ unsigned int aom_obmc_sad##w##x##h##_neon( \
+ const uint8_t *ref, int ref_stride, const int32_t *wsrc, \
+ const int32_t *mask) { \
+ return obmc_sad_##w##xh_neon(ref, ref_stride, wsrc, mask, h); \
+ }
+
+OBMC_SAD_WXH_NEON(4, 4)
+OBMC_SAD_WXH_NEON(4, 8)
+OBMC_SAD_WXH_NEON(4, 16)
+
+OBMC_SAD_WXH_NEON(8, 4)
+OBMC_SAD_WXH_NEON(8, 8)
+OBMC_SAD_WXH_NEON(8, 16)
+OBMC_SAD_WXH_NEON(8, 32)
+
+OBMC_SAD_WXH_NEON(16, 4)
+OBMC_SAD_WXH_NEON(16, 8)
+OBMC_SAD_WXH_NEON(16, 16)
+OBMC_SAD_WXH_NEON(16, 32)
+OBMC_SAD_WXH_NEON(16, 64)
+
+OBMC_SAD_WXH_NEON(32, 8)
+OBMC_SAD_WXH_NEON(32, 16)
+OBMC_SAD_WXH_NEON(32, 32)
+OBMC_SAD_WXH_NEON(32, 64)
+
+OBMC_SAD_WXH_NEON(64, 16)
+OBMC_SAD_WXH_NEON(64, 32)
+OBMC_SAD_WXH_NEON(64, 64)
+OBMC_SAD_WXH_NEON(64, 128)
+
+OBMC_SAD_WXH_NEON(128, 64)
+OBMC_SAD_WXH_NEON(128, 128)
diff --git a/third_party/aom/aom_dsp/arm/obmc_variance_neon.c b/third_party/aom/aom_dsp/arm/obmc_variance_neon.c
new file mode 100644
index 0000000000..50cd5f3b6a
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/obmc_variance_neon.c
@@ -0,0 +1,290 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+#include "mem_neon.h"
+#include "sum_neon.h"
+
+static INLINE void obmc_variance_8x1_s16_neon(int16x8_t pre_s16,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ int32x4_t *ssev,
+ int32x4_t *sumv) {
+ // For 4xh and 8xh we observe it is faster to avoid the double-widening of
+ // pre. Instead we do a single widening step and narrow the mask to 16-bits
+ // to allow us to perform a widening multiply. Widening multiply
+ // instructions have better throughput on some micro-architectures but for
+ // the larger block sizes this benefit is outweighed by the additional
+ // instruction needed to first narrow the mask vectors.
+
+ int32x4_t wsrc_s32_lo = vld1q_s32(&wsrc[0]);
+ int32x4_t wsrc_s32_hi = vld1q_s32(&wsrc[4]);
+ int16x8_t mask_s16 = vuzpq_s16(vreinterpretq_s16_s32(vld1q_s32(&mask[0])),
+ vreinterpretq_s16_s32(vld1q_s32(&mask[4])))
+ .val[0];
+
+ int32x4_t diff_s32_lo =
+ vmlsl_s16(wsrc_s32_lo, vget_low_s16(pre_s16), vget_low_s16(mask_s16));
+ int32x4_t diff_s32_hi =
+ vmlsl_s16(wsrc_s32_hi, vget_high_s16(pre_s16), vget_high_s16(mask_s16));
+
+ // ROUND_POWER_OF_TWO_SIGNED(value, 12) rounds to nearest with ties away
+ // from zero, however vrshrq_n_s32 rounds to nearest with ties rounded up.
+ // This difference only affects the bit patterns at the rounding breakpoints
+ // exactly, so we can add -1 to all negative numbers to move the breakpoint
+ // one value across and into the correct rounding region.
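+  // For example, at the tie value -2048, ROUND_POWER_OF_TWO_SIGNED gives -1
+  // (ties away from zero) while vrshrq_n_s32 would give 0; adding -1 first
+  // (-2049) makes vrshrq_n_s32 return the expected -1.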
+ diff_s32_lo = vsraq_n_s32(diff_s32_lo, diff_s32_lo, 31);
+ diff_s32_hi = vsraq_n_s32(diff_s32_hi, diff_s32_hi, 31);
+ int32x4_t round_s32_lo = vrshrq_n_s32(diff_s32_lo, 12);
+ int32x4_t round_s32_hi = vrshrq_n_s32(diff_s32_hi, 12);
+
+ *sumv = vrsraq_n_s32(*sumv, diff_s32_lo, 12);
+ *sumv = vrsraq_n_s32(*sumv, diff_s32_hi, 12);
+ *ssev = vmlaq_s32(*ssev, round_s32_lo, round_s32_lo);
+ *ssev = vmlaq_s32(*ssev, round_s32_hi, round_s32_hi);
+}
+
+#if AOM_ARCH_AARCH64
+
+// Use tbl for doing a double-width zero extension from 8->32 bits since we can
+// do this in one instruction rather than two (indices out of range (255 here)
+// are set to zero by tbl).
+DECLARE_ALIGNED(16, static const uint8_t, obmc_variance_permute_idx[]) = {
+ 0, 255, 255, 255, 1, 255, 255, 255, 2, 255, 255, 255, 3, 255, 255, 255,
+ 4, 255, 255, 255, 5, 255, 255, 255, 6, 255, 255, 255, 7, 255, 255, 255,
+ 8, 255, 255, 255, 9, 255, 255, 255, 10, 255, 255, 255, 11, 255, 255, 255,
+ 12, 255, 255, 255, 13, 255, 255, 255, 14, 255, 255, 255, 15, 255, 255, 255
+};
+
+static INLINE void obmc_variance_8x1_s32_neon(
+ int32x4_t pre_lo, int32x4_t pre_hi, const int32_t *wsrc,
+ const int32_t *mask, int32x4_t *ssev, int32x4_t *sumv) {
+ int32x4_t wsrc_lo = vld1q_s32(&wsrc[0]);
+ int32x4_t wsrc_hi = vld1q_s32(&wsrc[4]);
+ int32x4_t mask_lo = vld1q_s32(&mask[0]);
+ int32x4_t mask_hi = vld1q_s32(&mask[4]);
+
+ int32x4_t diff_lo = vmlsq_s32(wsrc_lo, pre_lo, mask_lo);
+ int32x4_t diff_hi = vmlsq_s32(wsrc_hi, pre_hi, mask_hi);
+
+ // ROUND_POWER_OF_TWO_SIGNED(value, 12) rounds to nearest with ties away from
+ // zero, however vrshrq_n_s32 rounds to nearest with ties rounded up. This
+ // difference only affects the bit patterns at the rounding breakpoints
+ // exactly, so we can add -1 to all negative numbers to move the breakpoint
+ // one value across and into the correct rounding region.
+ diff_lo = vsraq_n_s32(diff_lo, diff_lo, 31);
+ diff_hi = vsraq_n_s32(diff_hi, diff_hi, 31);
+ int32x4_t round_lo = vrshrq_n_s32(diff_lo, 12);
+ int32x4_t round_hi = vrshrq_n_s32(diff_hi, 12);
+
+ *sumv = vrsraq_n_s32(*sumv, diff_lo, 12);
+ *sumv = vrsraq_n_s32(*sumv, diff_hi, 12);
+ *ssev = vmlaq_s32(*ssev, round_lo, round_lo);
+ *ssev = vmlaq_s32(*ssev, round_hi, round_hi);
+}
+
+static INLINE void obmc_variance_large_neon(const uint8_t *pre, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int width,
+ int height, unsigned *sse,
+ int *sum) {
+ assert(width % 16 == 0);
+
+ // Use tbl for doing a double-width zero extension from 8->32 bits since we
+ // can do this in one instruction rather than two.
+ uint8x16_t pre_idx0 = vld1q_u8(&obmc_variance_permute_idx[0]);
+ uint8x16_t pre_idx1 = vld1q_u8(&obmc_variance_permute_idx[16]);
+ uint8x16_t pre_idx2 = vld1q_u8(&obmc_variance_permute_idx[32]);
+ uint8x16_t pre_idx3 = vld1q_u8(&obmc_variance_permute_idx[48]);
+
+ int32x4_t ssev = vdupq_n_s32(0);
+ int32x4_t sumv = vdupq_n_s32(0);
+
+ int h = height;
+ do {
+ int w = width;
+ do {
+ uint8x16_t pre_u8 = vld1q_u8(pre);
+
+ int32x4_t pre_s32_lo = vreinterpretq_s32_u8(vqtbl1q_u8(pre_u8, pre_idx0));
+ int32x4_t pre_s32_hi = vreinterpretq_s32_u8(vqtbl1q_u8(pre_u8, pre_idx1));
+ obmc_variance_8x1_s32_neon(pre_s32_lo, pre_s32_hi, &wsrc[0], &mask[0],
+ &ssev, &sumv);
+
+ pre_s32_lo = vreinterpretq_s32_u8(vqtbl1q_u8(pre_u8, pre_idx2));
+ pre_s32_hi = vreinterpretq_s32_u8(vqtbl1q_u8(pre_u8, pre_idx3));
+ obmc_variance_8x1_s32_neon(pre_s32_lo, pre_s32_hi, &wsrc[8], &mask[8],
+ &ssev, &sumv);
+
+ wsrc += 16;
+ mask += 16;
+ pre += 16;
+ w -= 16;
+ } while (w != 0);
+
+ pre += pre_stride - width;
+ } while (--h != 0);
+
+ *sse = horizontal_add_s32x4(ssev);
+ *sum = horizontal_add_s32x4(sumv);
+}
+
+#else // !AOM_ARCH_AARCH64
+
+static INLINE void obmc_variance_large_neon(const uint8_t *pre, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int width,
+ int height, unsigned *sse,
+ int *sum) {
+ // Non-aarch64 targets do not have a 128-bit tbl instruction, so use the
+ // widening version of the core kernel instead.
+
+ assert(width % 16 == 0);
+
+ int32x4_t ssev = vdupq_n_s32(0);
+ int32x4_t sumv = vdupq_n_s32(0);
+
+ int h = height;
+ do {
+ int w = width;
+ do {
+ uint8x16_t pre_u8 = vld1q_u8(pre);
+
+ int16x8_t pre_s16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pre_u8)));
+ obmc_variance_8x1_s16_neon(pre_s16, &wsrc[0], &mask[0], &ssev, &sumv);
+
+ pre_s16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pre_u8)));
+ obmc_variance_8x1_s16_neon(pre_s16, &wsrc[8], &mask[8], &ssev, &sumv);
+
+ wsrc += 16;
+ mask += 16;
+ pre += 16;
+ w -= 16;
+ } while (w != 0);
+
+ pre += pre_stride - width;
+ } while (--h != 0);
+
+ *sse = horizontal_add_s32x4(ssev);
+ *sum = horizontal_add_s32x4(sumv);
+}
+
+#endif // AOM_ARCH_AARCH64
+
+static INLINE void obmc_variance_neon_128xh(const uint8_t *pre, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ unsigned *sse, int *sum) {
+ obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 128, h, sse, sum);
+}
+
+static INLINE void obmc_variance_neon_64xh(const uint8_t *pre, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ unsigned *sse, int *sum) {
+ obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 64, h, sse, sum);
+}
+
+static INLINE void obmc_variance_neon_32xh(const uint8_t *pre, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ unsigned *sse, int *sum) {
+ obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 32, h, sse, sum);
+}
+
+static INLINE void obmc_variance_neon_16xh(const uint8_t *pre, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ unsigned *sse, int *sum) {
+ obmc_variance_large_neon(pre, pre_stride, wsrc, mask, 16, h, sse, sum);
+}
+
+static INLINE void obmc_variance_neon_8xh(const uint8_t *pre, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ unsigned *sse, int *sum) {
+ int32x4_t ssev = vdupq_n_s32(0);
+ int32x4_t sumv = vdupq_n_s32(0);
+
+ do {
+ uint8x8_t pre_u8 = vld1_u8(pre);
+ int16x8_t pre_s16 = vreinterpretq_s16_u16(vmovl_u8(pre_u8));
+
+ obmc_variance_8x1_s16_neon(pre_s16, wsrc, mask, &ssev, &sumv);
+
+ pre += pre_stride;
+ wsrc += 8;
+ mask += 8;
+ } while (--h != 0);
+
+ *sse = horizontal_add_s32x4(ssev);
+ *sum = horizontal_add_s32x4(sumv);
+}
+
+static INLINE void obmc_variance_neon_4xh(const uint8_t *pre, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int h,
+ unsigned *sse, int *sum) {
+ assert(h % 2 == 0);
+
+ int32x4_t ssev = vdupq_n_s32(0);
+ int32x4_t sumv = vdupq_n_s32(0);
+
+ do {
+ uint8x8_t pre_u8 = load_unaligned_u8(pre, pre_stride);
+ int16x8_t pre_s16 = vreinterpretq_s16_u16(vmovl_u8(pre_u8));
+
+ obmc_variance_8x1_s16_neon(pre_s16, wsrc, mask, &ssev, &sumv);
+
+ pre += 2 * pre_stride;
+ wsrc += 8;
+ mask += 8;
+ h -= 2;
+ } while (h != 0);
+
+ *sse = horizontal_add_s32x4(ssev);
+ *sum = horizontal_add_s32x4(sumv);
+}
+
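+// The wrappers below combine the accumulated sse and sum into the block
+// variance as sse - sum^2 / (W * H).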
+#define OBMC_VARIANCE_WXH_NEON(W, H) \
+ unsigned aom_obmc_variance##W##x##H##_neon( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned *sse) { \
+ int sum; \
+ obmc_variance_neon_##W##xh(pre, pre_stride, wsrc, mask, H, sse, &sum); \
+ return *sse - (unsigned)(((int64_t)sum * sum) / (W * H)); \
+ }
+
+OBMC_VARIANCE_WXH_NEON(4, 4)
+OBMC_VARIANCE_WXH_NEON(4, 8)
+OBMC_VARIANCE_WXH_NEON(8, 4)
+OBMC_VARIANCE_WXH_NEON(8, 8)
+OBMC_VARIANCE_WXH_NEON(8, 16)
+OBMC_VARIANCE_WXH_NEON(16, 8)
+OBMC_VARIANCE_WXH_NEON(16, 16)
+OBMC_VARIANCE_WXH_NEON(16, 32)
+OBMC_VARIANCE_WXH_NEON(32, 16)
+OBMC_VARIANCE_WXH_NEON(32, 32)
+OBMC_VARIANCE_WXH_NEON(32, 64)
+OBMC_VARIANCE_WXH_NEON(64, 32)
+OBMC_VARIANCE_WXH_NEON(64, 64)
+OBMC_VARIANCE_WXH_NEON(64, 128)
+OBMC_VARIANCE_WXH_NEON(128, 64)
+OBMC_VARIANCE_WXH_NEON(128, 128)
+OBMC_VARIANCE_WXH_NEON(4, 16)
+OBMC_VARIANCE_WXH_NEON(16, 4)
+OBMC_VARIANCE_WXH_NEON(8, 32)
+OBMC_VARIANCE_WXH_NEON(32, 8)
+OBMC_VARIANCE_WXH_NEON(16, 64)
+OBMC_VARIANCE_WXH_NEON(64, 16)
diff --git a/third_party/aom/aom_dsp/arm/reinterpret_neon.h b/third_party/aom/aom_dsp/arm/reinterpret_neon.h
new file mode 100644
index 0000000000..f9702513ad
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/reinterpret_neon.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_REINTERPRET_NEON_H_
+#define AOM_AOM_DSP_ARM_REINTERPRET_NEON_H_
+
+#include <arm_neon.h>
+
+#include "aom/aom_integer.h" // For AOM_FORCE_INLINE.
+#include "config/aom_config.h"
+
+#define REINTERPRET_NEON(u, to_sz, to_count, from_sz, from_count, n, q) \
+ static AOM_FORCE_INLINE u##int##to_sz##x##to_count##x##n##_t \
+ aom_reinterpret##q##_##u##to_sz##_##u##from_sz##_x##n( \
+ const u##int##from_sz##x##from_count##x##n##_t src) { \
+ u##int##to_sz##x##to_count##x##n##_t ret; \
+ for (int i = 0; i < (n); ++i) { \
+ ret.val[i] = vreinterpret##q##_##u##to_sz##_##u##from_sz(src.val[i]); \
+ } \
+ return ret; \
+ }
+
+REINTERPRET_NEON(u, 8, 8, 16, 4, 2, ) // uint8x8x2_t from uint16x4x2_t
+REINTERPRET_NEON(u, 8, 16, 16, 8, 2, q) // uint8x16x2_t from uint16x8x2_t
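+// For example, the first instantiation defines aom_reinterpret_u8_u16_x2(),
+// which views a uint16x4x2_t pair as uint8x8x2_t without moving any data.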
+
+#endif // AOM_AOM_DSP_ARM_REINTERPRET_NEON_H_
diff --git a/third_party/aom/aom_dsp/arm/sad_neon.c b/third_party/aom/aom_dsp/arm/sad_neon.c
new file mode 100644
index 0000000000..46a1666331
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sad_neon.c
@@ -0,0 +1,873 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE unsigned int sad128xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ // We use 8 accumulators to prevent overflow for large values of 'h', as well
+ // as enabling optimal UADALP instruction throughput on CPUs that have either
+ // 2 or 4 Neon pipes.
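+  // Each pairwise-accumulated 16-bit lane gains at most 2 * 255 = 510 per
+  // row, so even at the maximum block height of 128 a lane holds at most
+  // 65280, which still fits in a uint16_t.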
+ uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0), vdupq_n_u16(0) };
+
+ int i = h;
+ do {
+ uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7;
+ uint8x16_t r0, r1, r2, r3, r4, r5, r6, r7;
+ uint8x16_t diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+
+ s0 = vld1q_u8(src_ptr);
+ r0 = vld1q_u8(ref_ptr);
+ diff0 = vabdq_u8(s0, r0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
+
+ s1 = vld1q_u8(src_ptr + 16);
+ r1 = vld1q_u8(ref_ptr + 16);
+ diff1 = vabdq_u8(s1, r1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
+
+ s2 = vld1q_u8(src_ptr + 32);
+ r2 = vld1q_u8(ref_ptr + 32);
+ diff2 = vabdq_u8(s2, r2);
+ sum[2] = vpadalq_u8(sum[2], diff2);
+
+ s3 = vld1q_u8(src_ptr + 48);
+ r3 = vld1q_u8(ref_ptr + 48);
+ diff3 = vabdq_u8(s3, r3);
+ sum[3] = vpadalq_u8(sum[3], diff3);
+
+ s4 = vld1q_u8(src_ptr + 64);
+ r4 = vld1q_u8(ref_ptr + 64);
+ diff4 = vabdq_u8(s4, r4);
+ sum[4] = vpadalq_u8(sum[4], diff4);
+
+ s5 = vld1q_u8(src_ptr + 80);
+ r5 = vld1q_u8(ref_ptr + 80);
+ diff5 = vabdq_u8(s5, r5);
+ sum[5] = vpadalq_u8(sum[5], diff5);
+
+ s6 = vld1q_u8(src_ptr + 96);
+ r6 = vld1q_u8(ref_ptr + 96);
+ diff6 = vabdq_u8(s6, r6);
+ sum[6] = vpadalq_u8(sum[6], diff6);
+
+ s7 = vld1q_u8(src_ptr + 112);
+ r7 = vld1q_u8(ref_ptr + 112);
+ diff7 = vabdq_u8(s7, r7);
+ sum[7] = vpadalq_u8(sum[7], diff7);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[2]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[3]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[4]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[5]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[6]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[7]);
+
+ return horizontal_add_u32x4(sum_u32);
+}
+
+static INLINE unsigned int sad64xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int i = h;
+ do {
+ uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3;
+ uint8x16_t diff0, diff1, diff2, diff3;
+
+ s0 = vld1q_u8(src_ptr);
+ r0 = vld1q_u8(ref_ptr);
+ diff0 = vabdq_u8(s0, r0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
+
+ s1 = vld1q_u8(src_ptr + 16);
+ r1 = vld1q_u8(ref_ptr + 16);
+ diff1 = vabdq_u8(s1, r1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
+
+ s2 = vld1q_u8(src_ptr + 32);
+ r2 = vld1q_u8(ref_ptr + 32);
+ diff2 = vabdq_u8(s2, r2);
+ sum[2] = vpadalq_u8(sum[2], diff2);
+
+ s3 = vld1q_u8(src_ptr + 48);
+ r3 = vld1q_u8(ref_ptr + 48);
+ diff3 = vabdq_u8(s3, r3);
+ sum[3] = vpadalq_u8(sum[3], diff3);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[2]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[3]);
+
+ return horizontal_add_u32x4(sum_u32);
+}
+
+static INLINE unsigned int sad32xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
+
+ int i = h;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t diff0 = vabdq_u8(s0, r0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t diff1 = vabdq_u8(s1, r1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u16x8(vaddq_u16(sum[0], sum[1]));
+}
+
+static INLINE unsigned int sad16xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h;
+ do {
+ uint8x16_t s = vld1q_u8(src_ptr);
+ uint8x16_t r = vld1q_u8(ref_ptr);
+
+ uint8x16_t diff = vabdq_u8(s, r);
+ sum = vpadalq_u8(sum, diff);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u16x8(sum);
+}
+
+static INLINE unsigned int sad8xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h;
+ do {
+ uint8x8_t s = vld1_u8(src_ptr);
+ uint8x8_t r = vld1_u8(ref_ptr);
+
+ sum = vabal_u8(sum, s, r);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u16x8(sum);
+}
+
+static INLINE unsigned int sad4xh_neon(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ uint16x8_t sum = vdupq_n_u16(0);
+
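+  // Process two rows per iteration: load_unaligned_u8 packs two 4-pixel rows
+  // into a single 8-byte vector.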
+ int i = h / 2;
+ do {
+ uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
+ uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
+
+ sum = vabal_u8(sum, s, r);
+
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u16x8(sum);
+}
+
+#define SAD_WXH_NEON(w, h) \
+ unsigned int aom_sad##w##x##h##_neon(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return sad##w##xh_neon(src, src_stride, ref, ref_stride, (h)); \
+ }
+
+SAD_WXH_NEON(4, 4)
+SAD_WXH_NEON(4, 8)
+
+SAD_WXH_NEON(8, 4)
+SAD_WXH_NEON(8, 8)
+SAD_WXH_NEON(8, 16)
+
+SAD_WXH_NEON(16, 8)
+SAD_WXH_NEON(16, 16)
+SAD_WXH_NEON(16, 32)
+
+SAD_WXH_NEON(32, 16)
+SAD_WXH_NEON(32, 32)
+SAD_WXH_NEON(32, 64)
+
+SAD_WXH_NEON(64, 32)
+SAD_WXH_NEON(64, 64)
+SAD_WXH_NEON(64, 128)
+
+SAD_WXH_NEON(128, 64)
+SAD_WXH_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_NEON(4, 16)
+SAD_WXH_NEON(8, 32)
+SAD_WXH_NEON(16, 4)
+SAD_WXH_NEON(16, 64)
+SAD_WXH_NEON(32, 8)
+SAD_WXH_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_NEON
+
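+// The "skip" variants compute the SAD over every other row (double stride,
+// half height) and scale the result by 2 to approximate the full-block SAD.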
+#define SAD_SKIP_WXH_NEON(w, h) \
+ unsigned int aom_sad_skip_##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * \
+ sad##w##xh_neon(src, 2 * src_stride, ref, 2 * ref_stride, (h) / 2); \
+ }
+
+SAD_SKIP_WXH_NEON(4, 4)
+SAD_SKIP_WXH_NEON(4, 8)
+
+SAD_SKIP_WXH_NEON(8, 4)
+SAD_SKIP_WXH_NEON(8, 8)
+SAD_SKIP_WXH_NEON(8, 16)
+
+SAD_SKIP_WXH_NEON(16, 8)
+SAD_SKIP_WXH_NEON(16, 16)
+SAD_SKIP_WXH_NEON(16, 32)
+
+SAD_SKIP_WXH_NEON(32, 16)
+SAD_SKIP_WXH_NEON(32, 32)
+SAD_SKIP_WXH_NEON(32, 64)
+
+SAD_SKIP_WXH_NEON(64, 32)
+SAD_SKIP_WXH_NEON(64, 64)
+SAD_SKIP_WXH_NEON(64, 128)
+
+SAD_SKIP_WXH_NEON(128, 64)
+SAD_SKIP_WXH_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_SKIP_WXH_NEON(4, 16)
+SAD_SKIP_WXH_NEON(8, 32)
+SAD_SKIP_WXH_NEON(16, 4)
+SAD_SKIP_WXH_NEON(16, 64)
+SAD_SKIP_WXH_NEON(32, 8)
+SAD_SKIP_WXH_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_SKIP_WXH_NEON
+
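+// The _avg variants measure the SAD against the rounding average
+// (vrhaddq_u8) of the reference block and second_pred.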
+static INLINE unsigned int sad128xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ // We use 8 accumulators to prevent overflow for large values of 'h', as well
+ // as enabling optimal UADALP instruction throughput on CPUs that have either
+ // 2 or 4 Neon pipes.
+ uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0), vdupq_n_u16(0) };
+
+ int i = h;
+ do {
+ uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7;
+ uint8x16_t r0, r1, r2, r3, r4, r5, r6, r7;
+ uint8x16_t p0, p1, p2, p3, p4, p5, p6, p7;
+ uint8x16_t avg0, avg1, avg2, avg3, avg4, avg5, avg6, avg7;
+ uint8x16_t diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
+
+ s0 = vld1q_u8(src_ptr);
+ r0 = vld1q_u8(ref_ptr);
+ p0 = vld1q_u8(second_pred);
+ avg0 = vrhaddq_u8(r0, p0);
+ diff0 = vabdq_u8(s0, avg0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
+
+ s1 = vld1q_u8(src_ptr + 16);
+ r1 = vld1q_u8(ref_ptr + 16);
+ p1 = vld1q_u8(second_pred + 16);
+ avg1 = vrhaddq_u8(r1, p1);
+ diff1 = vabdq_u8(s1, avg1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
+
+ s2 = vld1q_u8(src_ptr + 32);
+ r2 = vld1q_u8(ref_ptr + 32);
+ p2 = vld1q_u8(second_pred + 32);
+ avg2 = vrhaddq_u8(r2, p2);
+ diff2 = vabdq_u8(s2, avg2);
+ sum[2] = vpadalq_u8(sum[2], diff2);
+
+ s3 = vld1q_u8(src_ptr + 48);
+ r3 = vld1q_u8(ref_ptr + 48);
+ p3 = vld1q_u8(second_pred + 48);
+ avg3 = vrhaddq_u8(r3, p3);
+ diff3 = vabdq_u8(s3, avg3);
+ sum[3] = vpadalq_u8(sum[3], diff3);
+
+ s4 = vld1q_u8(src_ptr + 64);
+ r4 = vld1q_u8(ref_ptr + 64);
+ p4 = vld1q_u8(second_pred + 64);
+ avg4 = vrhaddq_u8(r4, p4);
+ diff4 = vabdq_u8(s4, avg4);
+ sum[4] = vpadalq_u8(sum[4], diff4);
+
+ s5 = vld1q_u8(src_ptr + 80);
+ r5 = vld1q_u8(ref_ptr + 80);
+ p5 = vld1q_u8(second_pred + 80);
+ avg5 = vrhaddq_u8(r5, p5);
+ diff5 = vabdq_u8(s5, avg5);
+ sum[5] = vpadalq_u8(sum[5], diff5);
+
+ s6 = vld1q_u8(src_ptr + 96);
+ r6 = vld1q_u8(ref_ptr + 96);
+ p6 = vld1q_u8(second_pred + 96);
+ avg6 = vrhaddq_u8(r6, p6);
+ diff6 = vabdq_u8(s6, avg6);
+ sum[6] = vpadalq_u8(sum[6], diff6);
+
+ s7 = vld1q_u8(src_ptr + 112);
+ r7 = vld1q_u8(ref_ptr + 112);
+ p7 = vld1q_u8(second_pred + 112);
+ avg7 = vrhaddq_u8(r7, p7);
+ diff7 = vabdq_u8(s7, avg7);
+ sum[7] = vpadalq_u8(sum[7], diff7);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 128;
+ } while (--i != 0);
+
+ uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[2]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[3]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[4]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[5]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[6]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[7]);
+
+ return horizontal_add_u32x4(sum_u32);
+}
+
+static INLINE unsigned int sad64xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int i = h;
+ do {
+ uint8x16_t s0, s1, s2, s3, r0, r1, r2, r3, p0, p1, p2, p3;
+ uint8x16_t avg0, avg1, avg2, avg3, diff0, diff1, diff2, diff3;
+
+ s0 = vld1q_u8(src_ptr);
+ r0 = vld1q_u8(ref_ptr);
+ p0 = vld1q_u8(second_pred);
+ avg0 = vrhaddq_u8(r0, p0);
+ diff0 = vabdq_u8(s0, avg0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
+
+ s1 = vld1q_u8(src_ptr + 16);
+ r1 = vld1q_u8(ref_ptr + 16);
+ p1 = vld1q_u8(second_pred + 16);
+ avg1 = vrhaddq_u8(r1, p1);
+ diff1 = vabdq_u8(s1, avg1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
+
+ s2 = vld1q_u8(src_ptr + 32);
+ r2 = vld1q_u8(ref_ptr + 32);
+ p2 = vld1q_u8(second_pred + 32);
+ avg2 = vrhaddq_u8(r2, p2);
+ diff2 = vabdq_u8(s2, avg2);
+ sum[2] = vpadalq_u8(sum[2], diff2);
+
+ s3 = vld1q_u8(src_ptr + 48);
+ r3 = vld1q_u8(ref_ptr + 48);
+ p3 = vld1q_u8(second_pred + 48);
+ avg3 = vrhaddq_u8(r3, p3);
+ diff3 = vabdq_u8(s3, avg3);
+ sum[3] = vpadalq_u8(sum[3], diff3);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 64;
+ } while (--i != 0);
+
+ uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[2]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[3]);
+
+ return horizontal_add_u32x4(sum_u32);
+}
+
+static INLINE unsigned int sad32xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
+
+ int i = h;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t avg0 = vrhaddq_u8(r0, p0);
+ uint8x16_t diff0 = vabdq_u8(s0, avg0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t avg1 = vrhaddq_u8(r1, p1);
+ uint8x16_t diff1 = vabdq_u8(s1, avg1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 32;
+ } while (--i != 0);
+
+ return horizontal_add_u16x8(vaddq_u16(sum[0], sum[1]));
+}
+
+static INLINE unsigned int sad16xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h;
+ do {
+ uint8x16_t s = vld1q_u8(src_ptr);
+ uint8x16_t r = vld1q_u8(ref_ptr);
+ uint8x16_t p = vld1q_u8(second_pred);
+
+ uint8x16_t avg = vrhaddq_u8(r, p);
+ uint8x16_t diff = vabdq_u8(s, avg);
+ sum = vpadalq_u8(sum, diff);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+ } while (--i != 0);
+
+ return horizontal_add_u16x8(sum);
+}
+
+static INLINE unsigned int sad8xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h;
+ do {
+ uint8x8_t s = vld1_u8(src_ptr);
+ uint8x8_t r = vld1_u8(ref_ptr);
+ uint8x8_t p = vld1_u8(second_pred);
+
+ uint8x8_t avg = vrhadd_u8(r, p);
+ sum = vabal_u8(sum, s, avg);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 8;
+ } while (--i != 0);
+
+ return horizontal_add_u16x8(sum);
+}
+
+static INLINE unsigned int sad4xh_avg_neon(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h,
+ const uint8_t *second_pred) {
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ int i = h / 2;
+ do {
+ uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
+ uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
+ uint8x8_t p = vld1_u8(second_pred);
+
+ uint8x8_t avg = vrhadd_u8(r, p);
+ sum = vabal_u8(sum, s, avg);
+
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
+ second_pred += 8;
+ } while (--i != 0);
+
+ return horizontal_add_u16x8(sum);
+}
+
+#define SAD_WXH_AVG_NEON(w, h) \
+ unsigned int aom_sad##w##x##h##_avg_neon(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ return sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \
+ second_pred); \
+ }
+
+SAD_WXH_AVG_NEON(4, 4)
+SAD_WXH_AVG_NEON(4, 8)
+
+SAD_WXH_AVG_NEON(8, 4)
+SAD_WXH_AVG_NEON(8, 8)
+SAD_WXH_AVG_NEON(8, 16)
+
+SAD_WXH_AVG_NEON(16, 8)
+SAD_WXH_AVG_NEON(16, 16)
+SAD_WXH_AVG_NEON(16, 32)
+
+SAD_WXH_AVG_NEON(32, 16)
+SAD_WXH_AVG_NEON(32, 32)
+SAD_WXH_AVG_NEON(32, 64)
+
+SAD_WXH_AVG_NEON(64, 32)
+SAD_WXH_AVG_NEON(64, 64)
+SAD_WXH_AVG_NEON(64, 128)
+
+SAD_WXH_AVG_NEON(128, 64)
+SAD_WXH_AVG_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_AVG_NEON(4, 16)
+SAD_WXH_AVG_NEON(8, 32)
+SAD_WXH_AVG_NEON(16, 4)
+SAD_WXH_AVG_NEON(16, 64)
+SAD_WXH_AVG_NEON(32, 8)
+SAD_WXH_AVG_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_AVG_NEON
+
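+// The dist_wtd variants replace the simple rounding average with a
+// distance-weighted average of ref and second_pred, using the fwd/bck offsets
+// from jcp_param (see dist_wtd_avg_neon.h).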
+static INLINE unsigned int dist_wtd_sad128xh_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+ // We use 8 accumulators to prevent overflow for large values of 'h', as well
+ // as enabling optimal UADALP instruction throughput on CPUs that have either
+ // 2 or 4 Neon pipes.
+ uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0), vdupq_n_u16(0) };
+
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+ uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+ uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
+
+ uint8x16_t s2 = vld1q_u8(src_ptr + 32);
+ uint8x16_t r2 = vld1q_u8(ref_ptr + 32);
+ uint8x16_t p2 = vld1q_u8(second_pred + 32);
+ uint8x16_t wtd_avg2 = dist_wtd_avg_u8x16(p2, r2, bck_offset, fwd_offset);
+ uint8x16_t diff2 = vabdq_u8(s2, wtd_avg2);
+ sum[2] = vpadalq_u8(sum[2], diff2);
+
+ uint8x16_t s3 = vld1q_u8(src_ptr + 48);
+ uint8x16_t r3 = vld1q_u8(ref_ptr + 48);
+ uint8x16_t p3 = vld1q_u8(second_pred + 48);
+ uint8x16_t wtd_avg3 = dist_wtd_avg_u8x16(p3, r3, bck_offset, fwd_offset);
+ uint8x16_t diff3 = vabdq_u8(s3, wtd_avg3);
+ sum[3] = vpadalq_u8(sum[3], diff3);
+
+ uint8x16_t s4 = vld1q_u8(src_ptr + 64);
+ uint8x16_t r4 = vld1q_u8(ref_ptr + 64);
+ uint8x16_t p4 = vld1q_u8(second_pred + 64);
+ uint8x16_t wtd_avg4 = dist_wtd_avg_u8x16(p4, r4, bck_offset, fwd_offset);
+ uint8x16_t diff4 = vabdq_u8(s4, wtd_avg4);
+ sum[4] = vpadalq_u8(sum[4], diff4);
+
+ uint8x16_t s5 = vld1q_u8(src_ptr + 80);
+ uint8x16_t r5 = vld1q_u8(ref_ptr + 80);
+ uint8x16_t p5 = vld1q_u8(second_pred + 80);
+ uint8x16_t wtd_avg5 = dist_wtd_avg_u8x16(p5, r5, bck_offset, fwd_offset);
+ uint8x16_t diff5 = vabdq_u8(s5, wtd_avg5);
+ sum[5] = vpadalq_u8(sum[5], diff5);
+
+ uint8x16_t s6 = vld1q_u8(src_ptr + 96);
+ uint8x16_t r6 = vld1q_u8(ref_ptr + 96);
+ uint8x16_t p6 = vld1q_u8(second_pred + 96);
+ uint8x16_t wtd_avg6 = dist_wtd_avg_u8x16(p6, r6, bck_offset, fwd_offset);
+ uint8x16_t diff6 = vabdq_u8(s6, wtd_avg6);
+ sum[6] = vpadalq_u8(sum[6], diff6);
+
+ uint8x16_t s7 = vld1q_u8(src_ptr + 112);
+ uint8x16_t r7 = vld1q_u8(ref_ptr + 112);
+ uint8x16_t p7 = vld1q_u8(second_pred + 112);
+ uint8x16_t wtd_avg7 = dist_wtd_avg_u8x16(p7, r7, bck_offset, fwd_offset);
+ uint8x16_t diff7 = vabdq_u8(s7, wtd_avg7);
+ sum[7] = vpadalq_u8(sum[7], diff7);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 128;
+ } while (--h != 0);
+
+ uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[2]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[3]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[4]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[5]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[6]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[7]);
+
+ return horizontal_add_u32x4(sum_u32);
+}
+
+static INLINE unsigned int dist_wtd_sad64xh_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+ uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+ uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
+
+ uint8x16_t s2 = vld1q_u8(src_ptr + 32);
+ uint8x16_t r2 = vld1q_u8(ref_ptr + 32);
+ uint8x16_t p2 = vld1q_u8(second_pred + 32);
+ uint8x16_t wtd_avg2 = dist_wtd_avg_u8x16(p2, r2, bck_offset, fwd_offset);
+ uint8x16_t diff2 = vabdq_u8(s2, wtd_avg2);
+ sum[2] = vpadalq_u8(sum[2], diff2);
+
+ uint8x16_t s3 = vld1q_u8(src_ptr + 48);
+ uint8x16_t r3 = vld1q_u8(ref_ptr + 48);
+ uint8x16_t p3 = vld1q_u8(second_pred + 48);
+ uint8x16_t wtd_avg3 = dist_wtd_avg_u8x16(p3, r3, bck_offset, fwd_offset);
+ uint8x16_t diff3 = vabdq_u8(s3, wtd_avg3);
+ sum[3] = vpadalq_u8(sum[3], diff3);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 64;
+ } while (--h != 0);
+
+ uint32x4_t sum_u32 = vpaddlq_u16(sum[0]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[1]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[2]);
+ sum_u32 = vpadalq_u16(sum_u32, sum[3]);
+
+ return horizontal_add_u32x4(sum_u32);
+}
+
+static INLINE unsigned int dist_wtd_sad32xh_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+ uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
+
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+ uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+ sum[0] = vpadalq_u8(sum[0], diff0);
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+ uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+ sum[1] = vpadalq_u8(sum[1], diff1);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 32;
+ } while (--h != 0);
+
+ return horizontal_add_u16x8(vaddq_u16(sum[0], sum[1]));
+}
+
+static INLINE unsigned int dist_wtd_sad16xh_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ do {
+ uint8x16_t s = vld1q_u8(src_ptr);
+ uint8x16_t r = vld1q_u8(ref_ptr);
+ uint8x16_t p = vld1q_u8(second_pred);
+
+ uint8x16_t wtd_avg = dist_wtd_avg_u8x16(p, r, bck_offset, fwd_offset);
+ uint8x16_t diff = vabdq_u8(s, wtd_avg);
+ sum = vpadalq_u8(sum, diff);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+ } while (--h != 0);
+
+ return horizontal_add_u16x8(sum);
+}
+
+static INLINE unsigned int dist_wtd_sad8xh_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x8_t fwd_offset = vdup_n_u8(jcp_param->fwd_offset);
+ const uint8x8_t bck_offset = vdup_n_u8(jcp_param->bck_offset);
+ uint16x8_t sum = vdupq_n_u16(0);
+
+ do {
+ uint8x8_t s = vld1_u8(src_ptr);
+ uint8x8_t r = vld1_u8(ref_ptr);
+ uint8x8_t p = vld1_u8(second_pred);
+
+ uint8x8_t wtd_avg = dist_wtd_avg_u8x8(p, r, bck_offset, fwd_offset);
+ sum = vabal_u8(sum, s, wtd_avg);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 8;
+ } while (--h != 0);
+
+ return horizontal_add_u16x8(sum);
+}
+
+static INLINE unsigned int dist_wtd_sad4xh_avg_neon(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x8_t fwd_offset = vdup_n_u8(jcp_param->fwd_offset);
+ const uint8x8_t bck_offset = vdup_n_u8(jcp_param->bck_offset);
+ uint16x8_t sum = vdupq_n_u16(0);
+
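+ // load_unaligned_u8() packs two 4-pixel rows (at src_ptr and
+ // src_ptr + src_stride) into one 8-byte vector, so each iteration handles a
+ // pair of rows.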
+ int i = h / 2;
+ do {
+ uint8x8_t s = load_unaligned_u8(src_ptr, src_stride);
+ uint8x8_t r = load_unaligned_u8(ref_ptr, ref_stride);
+ uint8x8_t p = vld1_u8(second_pred);
+
+ uint8x8_t wtd_avg = dist_wtd_avg_u8x8(p, r, bck_offset, fwd_offset);
+ sum = vabal_u8(sum, s, wtd_avg);
+
+ src_ptr += 2 * src_stride;
+ ref_ptr += 2 * ref_stride;
+ second_pred += 8;
+ } while (--i != 0);
+
+ return horizontal_add_u16x8(sum);
+}
+
+#define DIST_WTD_SAD_WXH_AVG_NEON(w, h) \
+ unsigned int aom_dist_wtd_sad##w##x##h##_avg_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ return dist_wtd_sad##w##xh_avg_neon(src, src_stride, ref, ref_stride, (h), \
+ second_pred, jcp_param); \
+ }
+
+DIST_WTD_SAD_WXH_AVG_NEON(4, 4)
+DIST_WTD_SAD_WXH_AVG_NEON(4, 8)
+
+DIST_WTD_SAD_WXH_AVG_NEON(8, 4)
+DIST_WTD_SAD_WXH_AVG_NEON(8, 8)
+DIST_WTD_SAD_WXH_AVG_NEON(8, 16)
+
+DIST_WTD_SAD_WXH_AVG_NEON(16, 8)
+DIST_WTD_SAD_WXH_AVG_NEON(16, 16)
+DIST_WTD_SAD_WXH_AVG_NEON(16, 32)
+
+DIST_WTD_SAD_WXH_AVG_NEON(32, 16)
+DIST_WTD_SAD_WXH_AVG_NEON(32, 32)
+DIST_WTD_SAD_WXH_AVG_NEON(32, 64)
+
+DIST_WTD_SAD_WXH_AVG_NEON(64, 32)
+DIST_WTD_SAD_WXH_AVG_NEON(64, 64)
+DIST_WTD_SAD_WXH_AVG_NEON(64, 128)
+
+DIST_WTD_SAD_WXH_AVG_NEON(128, 64)
+DIST_WTD_SAD_WXH_AVG_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+DIST_WTD_SAD_WXH_AVG_NEON(4, 16)
+DIST_WTD_SAD_WXH_AVG_NEON(8, 32)
+DIST_WTD_SAD_WXH_AVG_NEON(16, 4)
+DIST_WTD_SAD_WXH_AVG_NEON(16, 64)
+DIST_WTD_SAD_WXH_AVG_NEON(32, 8)
+DIST_WTD_SAD_WXH_AVG_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef DIST_WTD_SAD_WXH_AVG_NEON
diff --git a/third_party/aom/aom_dsp/arm/sad_neon_dotprod.c b/third_party/aom/aom_dsp/arm/sad_neon_dotprod.c
new file mode 100644
index 0000000000..5504c6838e
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sad_neon_dotprod.c
@@ -0,0 +1,530 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE unsigned int sadwxh_neon_dotprod(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int w, int h) {
+ // Only two accumulators are required for optimal instruction throughput of
+ // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
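+ // Using vdupq_n_u8(1) as the second UDOT operand sums each group of four
+ // absolute differences straight into a 32-bit lane, so no intermediate
+ // 16-bit widening is required.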
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0, s1, r0, r1, diff0, diff1;
+
+ s0 = vld1q_u8(src_ptr + j);
+ r0 = vld1q_u8(ref_ptr + j);
+ diff0 = vabdq_u8(s0, r0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ s1 = vld1q_u8(src_ptr + j + 16);
+ r1 = vld1q_u8(ref_ptr + j + 16);
+ diff1 = vabdq_u8(s1, r1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ j += 32;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+static INLINE unsigned int sad128xh_neon_dotprod(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 128, h);
+}
+
+static INLINE unsigned int sad64xh_neon_dotprod(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64, h);
+}
+
+static INLINE unsigned int sad32xh_neon_dotprod(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ return sadwxh_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32, h);
+}
+
+static INLINE unsigned int sad16xh_neon_dotprod(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int h) {
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h / 2;
+ do {
+ uint8x16_t s0, s1, r0, r1, diff0, diff1;
+
+ s0 = vld1q_u8(src_ptr);
+ r0 = vld1q_u8(ref_ptr);
+ diff0 = vabdq_u8(s0, r0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+
+ s1 = vld1q_u8(src_ptr);
+ r1 = vld1q_u8(ref_ptr);
+ diff1 = vabdq_u8(s1, r1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+#define SAD_WXH_NEON_DOTPROD(w, h) \
+ unsigned int aom_sad##w##x##h##_neon_dotprod( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return sad##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, (h)); \
+ }
+
+SAD_WXH_NEON_DOTPROD(16, 8)
+SAD_WXH_NEON_DOTPROD(16, 16)
+SAD_WXH_NEON_DOTPROD(16, 32)
+
+SAD_WXH_NEON_DOTPROD(32, 16)
+SAD_WXH_NEON_DOTPROD(32, 32)
+SAD_WXH_NEON_DOTPROD(32, 64)
+
+SAD_WXH_NEON_DOTPROD(64, 32)
+SAD_WXH_NEON_DOTPROD(64, 64)
+SAD_WXH_NEON_DOTPROD(64, 128)
+
+SAD_WXH_NEON_DOTPROD(128, 64)
+SAD_WXH_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_NEON_DOTPROD(16, 4)
+SAD_WXH_NEON_DOTPROD(16, 64)
+SAD_WXH_NEON_DOTPROD(32, 8)
+SAD_WXH_NEON_DOTPROD(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_NEON_DOTPROD
+
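+// The "skip" kernels estimate the full SAD from every other row: both strides
+// are doubled, 'h' is halved and the result is scaled back up by 2.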
+#define SAD_SKIP_WXH_NEON_DOTPROD(w, h) \
+ unsigned int aom_sad_skip_##w##x##h##_neon_dotprod( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * sad##w##xh_neon_dotprod(src, 2 * src_stride, ref, \
+ 2 * ref_stride, (h) / 2); \
+ }
+
+SAD_SKIP_WXH_NEON_DOTPROD(16, 8)
+SAD_SKIP_WXH_NEON_DOTPROD(16, 16)
+SAD_SKIP_WXH_NEON_DOTPROD(16, 32)
+
+SAD_SKIP_WXH_NEON_DOTPROD(32, 16)
+SAD_SKIP_WXH_NEON_DOTPROD(32, 32)
+SAD_SKIP_WXH_NEON_DOTPROD(32, 64)
+
+SAD_SKIP_WXH_NEON_DOTPROD(64, 32)
+SAD_SKIP_WXH_NEON_DOTPROD(64, 64)
+SAD_SKIP_WXH_NEON_DOTPROD(64, 128)
+
+SAD_SKIP_WXH_NEON_DOTPROD(128, 64)
+SAD_SKIP_WXH_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_SKIP_WXH_NEON_DOTPROD(16, 4)
+SAD_SKIP_WXH_NEON_DOTPROD(16, 64)
+SAD_SKIP_WXH_NEON_DOTPROD(32, 8)
+SAD_SKIP_WXH_NEON_DOTPROD(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_SKIP_WXH_NEON_DOTPROD
+
+static INLINE unsigned int sadwxh_avg_neon_dotprod(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *ref_ptr,
+ int ref_stride, int w, int h,
+ const uint8_t *second_pred) {
+ // Only two accumulators are required for optimal instruction throughput of
+ // the ABD, UDOT sequence on CPUs with either 2 or 4 Neon pipes.
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
+
+ s0 = vld1q_u8(src_ptr + j);
+ r0 = vld1q_u8(ref_ptr + j);
+ p0 = vld1q_u8(second_pred);
+ avg0 = vrhaddq_u8(r0, p0);
+ diff0 = vabdq_u8(s0, avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ s1 = vld1q_u8(src_ptr + j + 16);
+ r1 = vld1q_u8(ref_ptr + j + 16);
+ p1 = vld1q_u8(second_pred + 16);
+ avg1 = vrhaddq_u8(r1, p1);
+ diff1 = vabdq_u8(s1, avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ j += 32;
+ second_pred += 32;
+ } while (j < w);
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+static INLINE unsigned int sad128xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred) {
+ return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 128,
+ h, second_pred);
+}
+
+static INLINE unsigned int sad64xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred) {
+ return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 64,
+ h, second_pred);
+}
+
+static INLINE unsigned int sad32xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred) {
+ return sadwxh_avg_neon_dotprod(src_ptr, src_stride, ref_ptr, ref_stride, 32,
+ h, second_pred);
+}
+
+static INLINE unsigned int sad16xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred) {
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h / 2;
+ do {
+ uint8x16_t s0, s1, r0, r1, p0, p1, avg0, avg1, diff0, diff1;
+
+ s0 = vld1q_u8(src_ptr);
+ r0 = vld1q_u8(ref_ptr);
+ p0 = vld1q_u8(second_pred);
+ avg0 = vrhaddq_u8(r0, p0);
+ diff0 = vabdq_u8(s0, avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+
+ s1 = vld1q_u8(src_ptr);
+ r1 = vld1q_u8(ref_ptr);
+ p1 = vld1q_u8(second_pred);
+ avg1 = vrhaddq_u8(r1, p1);
+ diff1 = vabdq_u8(s1, avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sum[0], sum[1]));
+}
+
+#define SAD_WXH_AVG_NEON_DOTPROD(w, h) \
+ unsigned int aom_sad##w##x##h##_avg_neon_dotprod( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ return sad##w##xh_avg_neon_dotprod(src, src_stride, ref, ref_stride, (h), \
+ second_pred); \
+ }
+
+SAD_WXH_AVG_NEON_DOTPROD(16, 8)
+SAD_WXH_AVG_NEON_DOTPROD(16, 16)
+SAD_WXH_AVG_NEON_DOTPROD(16, 32)
+
+SAD_WXH_AVG_NEON_DOTPROD(32, 16)
+SAD_WXH_AVG_NEON_DOTPROD(32, 32)
+SAD_WXH_AVG_NEON_DOTPROD(32, 64)
+
+SAD_WXH_AVG_NEON_DOTPROD(64, 32)
+SAD_WXH_AVG_NEON_DOTPROD(64, 64)
+SAD_WXH_AVG_NEON_DOTPROD(64, 128)
+
+SAD_WXH_AVG_NEON_DOTPROD(128, 64)
+SAD_WXH_AVG_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_AVG_NEON_DOTPROD(16, 4)
+SAD_WXH_AVG_NEON_DOTPROD(16, 64)
+SAD_WXH_AVG_NEON_DOTPROD(32, 8)
+SAD_WXH_AVG_NEON_DOTPROD(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_AVG_NEON_DOTPROD
+
+static INLINE unsigned int dist_wtd_sad128xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+ // We use 8 accumulators to minimize the accumulation and loop-carried
+ // dependencies for better instruction throughput.
+ uint32x4_t sum[8] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+ uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+ uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ uint8x16_t s2 = vld1q_u8(src_ptr + 32);
+ uint8x16_t r2 = vld1q_u8(ref_ptr + 32);
+ uint8x16_t p2 = vld1q_u8(second_pred + 32);
+ uint8x16_t wtd_avg2 = dist_wtd_avg_u8x16(p2, r2, bck_offset, fwd_offset);
+ uint8x16_t diff2 = vabdq_u8(s2, wtd_avg2);
+ sum[2] = vdotq_u32(sum[2], diff2, vdupq_n_u8(1));
+
+ uint8x16_t s3 = vld1q_u8(src_ptr + 48);
+ uint8x16_t r3 = vld1q_u8(ref_ptr + 48);
+ uint8x16_t p3 = vld1q_u8(second_pred + 48);
+ uint8x16_t wtd_avg3 = dist_wtd_avg_u8x16(p3, r3, bck_offset, fwd_offset);
+ uint8x16_t diff3 = vabdq_u8(s3, wtd_avg3);
+ sum[3] = vdotq_u32(sum[3], diff3, vdupq_n_u8(1));
+
+ uint8x16_t s4 = vld1q_u8(src_ptr + 64);
+ uint8x16_t r4 = vld1q_u8(ref_ptr + 64);
+ uint8x16_t p4 = vld1q_u8(second_pred + 64);
+ uint8x16_t wtd_avg4 = dist_wtd_avg_u8x16(p4, r4, bck_offset, fwd_offset);
+ uint8x16_t diff4 = vabdq_u8(s4, wtd_avg4);
+ sum[4] = vdotq_u32(sum[4], diff4, vdupq_n_u8(1));
+
+ uint8x16_t s5 = vld1q_u8(src_ptr + 80);
+ uint8x16_t r5 = vld1q_u8(ref_ptr + 80);
+ uint8x16_t p5 = vld1q_u8(second_pred + 80);
+ uint8x16_t wtd_avg5 = dist_wtd_avg_u8x16(p5, r5, bck_offset, fwd_offset);
+ uint8x16_t diff5 = vabdq_u8(s5, wtd_avg5);
+ sum[5] = vdotq_u32(sum[5], diff5, vdupq_n_u8(1));
+
+ uint8x16_t s6 = vld1q_u8(src_ptr + 96);
+ uint8x16_t r6 = vld1q_u8(ref_ptr + 96);
+ uint8x16_t p6 = vld1q_u8(second_pred + 96);
+ uint8x16_t wtd_avg6 = dist_wtd_avg_u8x16(p6, r6, bck_offset, fwd_offset);
+ uint8x16_t diff6 = vabdq_u8(s6, wtd_avg6);
+ sum[6] = vdotq_u32(sum[6], diff6, vdupq_n_u8(1));
+
+ uint8x16_t s7 = vld1q_u8(src_ptr + 112);
+ uint8x16_t r7 = vld1q_u8(ref_ptr + 112);
+ uint8x16_t p7 = vld1q_u8(second_pred + 112);
+ uint8x16_t wtd_avg7 = dist_wtd_avg_u8x16(p7, r7, bck_offset, fwd_offset);
+ uint8x16_t diff7 = vabdq_u8(s7, wtd_avg7);
+ sum[7] = vdotq_u32(sum[7], diff7, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 128;
+ } while (--h != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ sum[2] = vaddq_u32(sum[2], sum[3]);
+ sum[4] = vaddq_u32(sum[4], sum[5]);
+ sum[6] = vaddq_u32(sum[6], sum[7]);
+ sum[0] = vaddq_u32(sum[0], sum[2]);
+ sum[4] = vaddq_u32(sum[4], sum[6]);
+ sum[0] = vaddq_u32(sum[0], sum[4]);
+ return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE unsigned int dist_wtd_sad64xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+ uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+ uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ uint8x16_t s2 = vld1q_u8(src_ptr + 32);
+ uint8x16_t r2 = vld1q_u8(ref_ptr + 32);
+ uint8x16_t p2 = vld1q_u8(second_pred + 32);
+ uint8x16_t wtd_avg2 = dist_wtd_avg_u8x16(p2, r2, bck_offset, fwd_offset);
+ uint8x16_t diff2 = vabdq_u8(s2, wtd_avg2);
+ sum[2] = vdotq_u32(sum[2], diff2, vdupq_n_u8(1));
+
+ uint8x16_t s3 = vld1q_u8(src_ptr + 48);
+ uint8x16_t r3 = vld1q_u8(ref_ptr + 48);
+ uint8x16_t p3 = vld1q_u8(second_pred + 48);
+ uint8x16_t wtd_avg3 = dist_wtd_avg_u8x16(p3, r3, bck_offset, fwd_offset);
+ uint8x16_t diff3 = vabdq_u8(s3, wtd_avg3);
+ sum[3] = vdotq_u32(sum[3], diff3, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 64;
+ } while (--h != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ sum[2] = vaddq_u32(sum[2], sum[3]);
+ sum[0] = vaddq_u32(sum[0], sum[2]);
+ return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE unsigned int dist_wtd_sad32xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+ uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ uint8x16_t s1 = vld1q_u8(src_ptr + 16);
+ uint8x16_t r1 = vld1q_u8(ref_ptr + 16);
+ uint8x16_t p1 = vld1q_u8(second_pred + 16);
+ uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+ uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 32;
+ } while (--h != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ return horizontal_add_u32x4(sum[0]);
+}
+
+static INLINE unsigned int dist_wtd_sad16xh_avg_neon_dotprod(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+ int ref_stride, int h, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+ uint32x4_t sum[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h / 2;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+ uint8x16_t r0 = vld1q_u8(ref_ptr);
+ uint8x16_t p0 = vld1q_u8(second_pred);
+ uint8x16_t wtd_avg0 = dist_wtd_avg_u8x16(p0, r0, bck_offset, fwd_offset);
+ uint8x16_t diff0 = vabdq_u8(s0, wtd_avg0);
+ sum[0] = vdotq_u32(sum[0], diff0, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+
+ uint8x16_t s1 = vld1q_u8(src_ptr);
+ uint8x16_t r1 = vld1q_u8(ref_ptr);
+ uint8x16_t p1 = vld1q_u8(second_pred);
+ uint8x16_t wtd_avg1 = dist_wtd_avg_u8x16(p1, r1, bck_offset, fwd_offset);
+ uint8x16_t diff1 = vabdq_u8(s1, wtd_avg1);
+ sum[1] = vdotq_u32(sum[1], diff1, vdupq_n_u8(1));
+
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ second_pred += 16;
+ } while (--i != 0);
+
+ sum[0] = vaddq_u32(sum[0], sum[1]);
+ return horizontal_add_u32x4(sum[0]);
+}
+
+#define DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(w, h) \
+ unsigned int aom_dist_wtd_sad##w##x##h##_avg_neon_dotprod( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ return dist_wtd_sad##w##xh_avg_neon_dotprod( \
+ src, src_stride, ref, ref_stride, (h), second_pred, jcp_param); \
+ }
+
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(16, 8)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(16, 16)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(16, 32)
+
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(32, 16)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(32, 32)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(32, 64)
+
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(64, 32)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(64, 64)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(64, 128)
+
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(128, 64)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(16, 4)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(16, 64)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(32, 8)
+DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef DIST_WTD_SAD_WXH_AVG_NEON_DOTPROD
diff --git a/third_party/aom/aom_dsp/arm/sadxd_neon.c b/third_party/aom/aom_dsp/arm/sadxd_neon.c
new file mode 100644
index 0000000000..e89e1c5a73
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sadxd_neon.c
@@ -0,0 +1,514 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
+ uint16x8_t *const sad_sum) {
+ uint8x16_t abs_diff = vabdq_u8(src, ref);
+ *sad_sum = vpadalq_u8(*sad_sum, abs_diff);
+}
+
+static INLINE void sadwxhx3d_large_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[3],
+ int ref_stride, uint32_t res[3], int w,
+ int h, int h_overflow) {
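+ // Process the rows in blocks of at most 'h_overflow' rows so that the
+ // 16-bit intermediate sums cannot wrap; each block is widened into the
+ // 32-bit accumulators before the next block starts.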
+ uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+ int h_limit = h > h_overflow ? h_overflow : h;
+
+ int ref_offset = 0;
+ int i = 0;
+ do {
+ uint16x8_t sum_lo[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
+ uint16x8_t sum_hi[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
+
+ do {
+ int j = 0;
+ do {
+ const uint8x16_t s0 = vld1q_u8(src + j);
+ sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]);
+ sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]);
+ sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]);
+
+ const uint8x16_t s1 = vld1q_u8(src + j + 16);
+ sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]);
+ sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]);
+ sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]);
+
+ j += 32;
+ } while (j < w);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (++i < h_limit);
+
+ sum[0] = vpadalq_u16(sum[0], sum_lo[0]);
+ sum[0] = vpadalq_u16(sum[0], sum_hi[0]);
+ sum[1] = vpadalq_u16(sum[1], sum_lo[1]);
+ sum[1] = vpadalq_u16(sum[1], sum_hi[1]);
+ sum[2] = vpadalq_u16(sum[2], sum_lo[2]);
+ sum[2] = vpadalq_u16(sum[2], sum_hi[2]);
+
+ h_limit += h_overflow;
+ } while (i < h);
+
+ res[0] = horizontal_add_u32x4(sum[0]);
+ res[1] = horizontal_add_u32x4(sum[1]);
+ res[2] = horizontal_add_u32x4(sum[2]);
+}
+
+static INLINE void sad128xhx3d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[3], int ref_stride,
+ uint32_t res[3], int h) {
+ sadwxhx3d_large_neon(src, src_stride, ref, ref_stride, res, 128, h, 32);
+}
+
+static INLINE void sad64xhx3d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[3], int ref_stride,
+ uint32_t res[3], int h) {
+ sadwxhx3d_large_neon(src, src_stride, ref, ref_stride, res, 64, h, 64);
+}
+
+static INLINE void sad32xhx3d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[3], int ref_stride,
+ uint32_t res[3], int h) {
+ uint16x8_t sum_lo[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
+ uint16x8_t sum_hi[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
+
+ int ref_offset = 0;
+ int i = h;
+ do {
+ const uint8x16_t s0 = vld1q_u8(src);
+ sad16_neon(s0, vld1q_u8(ref[0] + ref_offset), &sum_lo[0]);
+ sad16_neon(s0, vld1q_u8(ref[1] + ref_offset), &sum_lo[1]);
+ sad16_neon(s0, vld1q_u8(ref[2] + ref_offset), &sum_lo[2]);
+
+ const uint8x16_t s1 = vld1q_u8(src + 16);
+ sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + 16), &sum_hi[0]);
+ sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + 16), &sum_hi[1]);
+ sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + 16), &sum_hi[2]);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (--i != 0);
+
+ res[0] = horizontal_long_add_u16x8(sum_lo[0], sum_hi[0]);
+ res[1] = horizontal_long_add_u16x8(sum_lo[1], sum_hi[1]);
+ res[2] = horizontal_long_add_u16x8(sum_lo[2], sum_hi[2]);
+}
+
+static INLINE void sad16xhx3d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[3], int ref_stride,
+ uint32_t res[3], int h) {
+ uint16x8_t sum[3] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) };
+
+ int ref_offset = 0;
+ int i = h;
+ do {
+ const uint8x16_t s = vld1q_u8(src);
+ sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum[0]);
+ sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum[1]);
+ sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum[2]);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (--i != 0);
+
+ res[0] = horizontal_add_u16x8(sum[0]);
+ res[1] = horizontal_add_u16x8(sum[1]);
+ res[2] = horizontal_add_u16x8(sum[2]);
+}
+
+static INLINE void sad8xhx3d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[3], int ref_stride,
+ uint32_t res[3], int h) {
+ uint16x8_t sum[3];
+
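+ // The first row initializes the accumulators with vabdl_u8; the remaining
+ // h - 1 rows are accumulated with vabal_u8.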
+ uint8x8_t s = vld1_u8(src);
+ sum[0] = vabdl_u8(s, vld1_u8(ref[0]));
+ sum[1] = vabdl_u8(s, vld1_u8(ref[1]));
+ sum[2] = vabdl_u8(s, vld1_u8(ref[2]));
+
+ src += src_stride;
+ int ref_offset = ref_stride;
+ int i = h - 1;
+ do {
+ s = vld1_u8(src);
+ sum[0] = vabal_u8(sum[0], s, vld1_u8(ref[0] + ref_offset));
+ sum[1] = vabal_u8(sum[1], s, vld1_u8(ref[1] + ref_offset));
+ sum[2] = vabal_u8(sum[2], s, vld1_u8(ref[2] + ref_offset));
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (--i != 0);
+
+ res[0] = horizontal_add_u16x8(sum[0]);
+ res[1] = horizontal_add_u16x8(sum[1]);
+ res[2] = horizontal_add_u16x8(sum[2]);
+}
+
+static INLINE void sad4xhx3d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[3], int ref_stride,
+ uint32_t res[3], int h) {
+ assert(h % 2 == 0);
+ uint16x8_t sum[3];
+
+ uint8x8_t s = load_unaligned_u8(src, src_stride);
+ uint8x8_t r0 = load_unaligned_u8(ref[0], ref_stride);
+ uint8x8_t r1 = load_unaligned_u8(ref[1], ref_stride);
+ uint8x8_t r2 = load_unaligned_u8(ref[2], ref_stride);
+
+ sum[0] = vabdl_u8(s, r0);
+ sum[1] = vabdl_u8(s, r1);
+ sum[2] = vabdl_u8(s, r2);
+
+ src += 2 * src_stride;
+ int ref_offset = 2 * ref_stride;
+ int i = (h / 2) - 1;
+ do {
+ s = load_unaligned_u8(src, src_stride);
+ r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride);
+ r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride);
+ r2 = load_unaligned_u8(ref[2] + ref_offset, ref_stride);
+
+ sum[0] = vabal_u8(sum[0], s, r0);
+ sum[1] = vabal_u8(sum[1], s, r1);
+ sum[2] = vabal_u8(sum[2], s, r2);
+
+ src += 2 * src_stride;
+ ref_offset += 2 * ref_stride;
+ } while (--i != 0);
+
+ res[0] = horizontal_add_u16x8(sum[0]);
+ res[1] = horizontal_add_u16x8(sum[1]);
+ res[2] = horizontal_add_u16x8(sum[2]);
+}
+
+#define SAD_WXH_3D_NEON(w, h) \
+ void aom_sad##w##x##h##x3d_neon(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], int ref_stride, \
+ uint32_t res[4]) { \
+ sad##w##xhx3d_neon(src, src_stride, ref, ref_stride, res, (h)); \
+ }
+
+SAD_WXH_3D_NEON(4, 4)
+SAD_WXH_3D_NEON(4, 8)
+
+SAD_WXH_3D_NEON(8, 4)
+SAD_WXH_3D_NEON(8, 8)
+SAD_WXH_3D_NEON(8, 16)
+
+SAD_WXH_3D_NEON(16, 8)
+SAD_WXH_3D_NEON(16, 16)
+SAD_WXH_3D_NEON(16, 32)
+
+SAD_WXH_3D_NEON(32, 16)
+SAD_WXH_3D_NEON(32, 32)
+SAD_WXH_3D_NEON(32, 64)
+
+SAD_WXH_3D_NEON(64, 32)
+SAD_WXH_3D_NEON(64, 64)
+SAD_WXH_3D_NEON(64, 128)
+
+SAD_WXH_3D_NEON(128, 64)
+SAD_WXH_3D_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_3D_NEON(4, 16)
+SAD_WXH_3D_NEON(8, 32)
+SAD_WXH_3D_NEON(16, 4)
+SAD_WXH_3D_NEON(16, 64)
+SAD_WXH_3D_NEON(32, 8)
+SAD_WXH_3D_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_3D_NEON
+
+static INLINE void sadwxhx4d_large_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4], int w,
+ int h, int h_overflow) {
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ int h_limit = h > h_overflow ? h_overflow : h;
+
+ int ref_offset = 0;
+ int i = 0;
+ do {
+ uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ do {
+ int j = 0;
+ do {
+ const uint8x16_t s0 = vld1q_u8(src + j);
+ sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]);
+ sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]);
+ sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]);
+ sad16_neon(s0, vld1q_u8(ref[3] + ref_offset + j), &sum_lo[3]);
+
+ const uint8x16_t s1 = vld1q_u8(src + j + 16);
+ sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]);
+ sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]);
+ sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]);
+ sad16_neon(s1, vld1q_u8(ref[3] + ref_offset + j + 16), &sum_hi[3]);
+
+ j += 32;
+ } while (j < w);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (++i < h_limit);
+
+ sum[0] = vpadalq_u16(sum[0], sum_lo[0]);
+ sum[0] = vpadalq_u16(sum[0], sum_hi[0]);
+ sum[1] = vpadalq_u16(sum[1], sum_lo[1]);
+ sum[1] = vpadalq_u16(sum[1], sum_hi[1]);
+ sum[2] = vpadalq_u16(sum[2], sum_lo[2]);
+ sum[2] = vpadalq_u16(sum[2], sum_hi[2]);
+ sum[3] = vpadalq_u16(sum[3], sum_lo[3]);
+ sum[3] = vpadalq_u16(sum[3], sum_hi[3]);
+
+ h_limit += h_overflow;
+ } while (i < h);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+
+static INLINE void sad128xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, 128, h, 32);
+}
+
+static INLINE void sad64xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ sadwxhx4d_large_neon(src, src_stride, ref, ref_stride, res, 64, h, 64);
+}
+
+static INLINE void sad32xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ uint16x8_t sum_lo[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint16x8_t sum_hi[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+
+ int ref_offset = 0;
+ int i = h;
+ do {
+ const uint8x16_t s0 = vld1q_u8(src);
+ sad16_neon(s0, vld1q_u8(ref[0] + ref_offset), &sum_lo[0]);
+ sad16_neon(s0, vld1q_u8(ref[1] + ref_offset), &sum_lo[1]);
+ sad16_neon(s0, vld1q_u8(ref[2] + ref_offset), &sum_lo[2]);
+ sad16_neon(s0, vld1q_u8(ref[3] + ref_offset), &sum_lo[3]);
+
+ const uint8x16_t s1 = vld1q_u8(src + 16);
+ sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + 16), &sum_hi[0]);
+ sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + 16), &sum_hi[1]);
+ sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + 16), &sum_hi[2]);
+ sad16_neon(s1, vld1q_u8(ref[3] + ref_offset + 16), &sum_hi[3]);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (--i != 0);
+
+ vst1q_u32(res, horizontal_long_add_4d_u16x8(sum_lo, sum_hi));
+}
+
+static INLINE void sad16xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ uint16x8_t sum_u16[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
+ uint32x4_t sum_u32[4];
+
+ int ref_offset = 0;
+ int i = h;
+ do {
+ const uint8x16_t s = vld1q_u8(src);
+ sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum_u16[0]);
+ sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum_u16[1]);
+ sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum_u16[2]);
+ sad16_neon(s, vld1q_u8(ref[3] + ref_offset), &sum_u16[3]);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (--i != 0);
+
+ sum_u32[0] = vpaddlq_u16(sum_u16[0]);
+ sum_u32[1] = vpaddlq_u16(sum_u16[1]);
+ sum_u32[2] = vpaddlq_u16(sum_u16[2]);
+ sum_u32[3] = vpaddlq_u16(sum_u16[3]);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum_u32));
+}
+
+static INLINE void sad8xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ uint16x8_t sum[4];
+
+ uint8x8_t s = vld1_u8(src);
+ sum[0] = vabdl_u8(s, vld1_u8(ref[0]));
+ sum[1] = vabdl_u8(s, vld1_u8(ref[1]));
+ sum[2] = vabdl_u8(s, vld1_u8(ref[2]));
+ sum[3] = vabdl_u8(s, vld1_u8(ref[3]));
+
+ src += src_stride;
+ int ref_offset = ref_stride;
+ int i = h - 1;
+ do {
+ s = vld1_u8(src);
+ sum[0] = vabal_u8(sum[0], s, vld1_u8(ref[0] + ref_offset));
+ sum[1] = vabal_u8(sum[1], s, vld1_u8(ref[1] + ref_offset));
+ sum[2] = vabal_u8(sum[2], s, vld1_u8(ref[2] + ref_offset));
+ sum[3] = vabal_u8(sum[3], s, vld1_u8(ref[3] + ref_offset));
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (--i != 0);
+
+ vst1q_u32(res, horizontal_add_4d_u16x8(sum));
+}
+
+static INLINE void sad4xhx4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t res[4], int h) {
+ uint16x8_t sum[4];
+
+ uint8x8_t s = load_unaligned_u8(src, src_stride);
+ uint8x8_t r0 = load_unaligned_u8(ref[0], ref_stride);
+ uint8x8_t r1 = load_unaligned_u8(ref[1], ref_stride);
+ uint8x8_t r2 = load_unaligned_u8(ref[2], ref_stride);
+ uint8x8_t r3 = load_unaligned_u8(ref[3], ref_stride);
+
+ sum[0] = vabdl_u8(s, r0);
+ sum[1] = vabdl_u8(s, r1);
+ sum[2] = vabdl_u8(s, r2);
+ sum[3] = vabdl_u8(s, r3);
+
+ src += 2 * src_stride;
+ int ref_offset = 2 * ref_stride;
+ int i = h / 2;
+ while (--i != 0) {
+ s = load_unaligned_u8(src, src_stride);
+ r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride);
+ r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride);
+ r2 = load_unaligned_u8(ref[2] + ref_offset, ref_stride);
+ r3 = load_unaligned_u8(ref[3] + ref_offset, ref_stride);
+
+ sum[0] = vabal_u8(sum[0], s, r0);
+ sum[1] = vabal_u8(sum[1], s, r1);
+ sum[2] = vabal_u8(sum[2], s, r2);
+ sum[3] = vabal_u8(sum[3], s, r3);
+
+ src += 2 * src_stride;
+ ref_offset += 2 * ref_stride;
+ }
+
+ vst1q_u32(res, horizontal_add_4d_u16x8(sum));
+}
+
+#define SAD_WXH_4D_NEON(w, h) \
+ void aom_sad##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], int ref_stride, \
+ uint32_t res[4]) { \
+ sad##w##xhx4d_neon(src, src_stride, ref, ref_stride, res, (h)); \
+ }
+
+SAD_WXH_4D_NEON(4, 4)
+SAD_WXH_4D_NEON(4, 8)
+
+SAD_WXH_4D_NEON(8, 4)
+SAD_WXH_4D_NEON(8, 8)
+SAD_WXH_4D_NEON(8, 16)
+
+SAD_WXH_4D_NEON(16, 8)
+SAD_WXH_4D_NEON(16, 16)
+SAD_WXH_4D_NEON(16, 32)
+
+SAD_WXH_4D_NEON(32, 16)
+SAD_WXH_4D_NEON(32, 32)
+SAD_WXH_4D_NEON(32, 64)
+
+SAD_WXH_4D_NEON(64, 32)
+SAD_WXH_4D_NEON(64, 64)
+SAD_WXH_4D_NEON(64, 128)
+
+SAD_WXH_4D_NEON(128, 64)
+SAD_WXH_4D_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_4D_NEON(4, 16)
+SAD_WXH_4D_NEON(8, 32)
+SAD_WXH_4D_NEON(16, 4)
+SAD_WXH_4D_NEON(16, 64)
+SAD_WXH_4D_NEON(32, 8)
+SAD_WXH_4D_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_4D_NEON
+
+#define SAD_SKIP_WXH_4D_NEON(w, h) \
+ void aom_sad_skip_##w##x##h##x4d_neon(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], \
+ int ref_stride, uint32_t res[4]) { \
+ sad##w##xhx4d_neon(src, 2 * src_stride, ref, 2 * ref_stride, res, \
+ ((h) >> 1)); \
+ res[0] <<= 1; \
+ res[1] <<= 1; \
+ res[2] <<= 1; \
+ res[3] <<= 1; \
+ }
+
+SAD_SKIP_WXH_4D_NEON(4, 4)
+SAD_SKIP_WXH_4D_NEON(4, 8)
+
+SAD_SKIP_WXH_4D_NEON(8, 4)
+SAD_SKIP_WXH_4D_NEON(8, 8)
+SAD_SKIP_WXH_4D_NEON(8, 16)
+
+SAD_SKIP_WXH_4D_NEON(16, 8)
+SAD_SKIP_WXH_4D_NEON(16, 16)
+SAD_SKIP_WXH_4D_NEON(16, 32)
+
+SAD_SKIP_WXH_4D_NEON(32, 16)
+SAD_SKIP_WXH_4D_NEON(32, 32)
+SAD_SKIP_WXH_4D_NEON(32, 64)
+
+SAD_SKIP_WXH_4D_NEON(64, 32)
+SAD_SKIP_WXH_4D_NEON(64, 64)
+SAD_SKIP_WXH_4D_NEON(64, 128)
+
+SAD_SKIP_WXH_4D_NEON(128, 64)
+SAD_SKIP_WXH_4D_NEON(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_SKIP_WXH_4D_NEON(4, 16)
+SAD_SKIP_WXH_4D_NEON(8, 32)
+SAD_SKIP_WXH_4D_NEON(16, 4)
+SAD_SKIP_WXH_4D_NEON(16, 64)
+SAD_SKIP_WXH_4D_NEON(32, 8)
+SAD_SKIP_WXH_4D_NEON(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_SKIP_WXH_4D_NEON
diff --git a/third_party/aom/aom_dsp/arm/sadxd_neon_dotprod.c b/third_party/aom/aom_dsp/arm/sadxd_neon_dotprod.c
new file mode 100644
index 0000000000..3d11d1cb96
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sadxd_neon_dotprod.c
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE void sad16_neon(uint8x16_t src, uint8x16_t ref,
+ uint32x4_t *const sad_sum) {
+ uint8x16_t abs_diff = vabdq_u8(src, ref);
+ *sad_sum = vdotq_u32(*sad_sum, abs_diff, vdupq_n_u8(1));
+}
+
+static INLINE void sadwxhx3d_large_neon_dotprod(const uint8_t *src,
+ int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4],
+ int w, int h) {
+ uint32x4_t sum_lo[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+ uint32x4_t sum_hi[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int ref_offset = 0;
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ const uint8x16_t s0 = vld1q_u8(src + j);
+ sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]);
+ sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]);
+ sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]);
+
+ const uint8x16_t s1 = vld1q_u8(src + j + 16);
+ sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]);
+ sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]);
+ sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]);
+
+ j += 32;
+ } while (j < w);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (--i != 0);
+
+ res[0] = horizontal_add_u32x4(vaddq_u32(sum_lo[0], sum_hi[0]));
+ res[1] = horizontal_add_u32x4(vaddq_u32(sum_lo[1], sum_hi[1]));
+ res[2] = horizontal_add_u32x4(vaddq_u32(sum_lo[2], sum_hi[2]));
+}
+
+static INLINE void sad128xhx3d_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ sadwxhx3d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 128, h);
+}
+
+static INLINE void sad64xhx3d_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ sadwxhx3d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 64, h);
+}
+
+static INLINE void sad32xhx3d_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ sadwxhx3d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 32, h);
+}
+
+static INLINE void sad16xhx3d_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ uint32x4_t sum[3] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int ref_offset = 0;
+ int i = h;
+ do {
+ const uint8x16_t s = vld1q_u8(src);
+ sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum[0]);
+ sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum[1]);
+ sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum[2]);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (--i != 0);
+
+ res[0] = horizontal_add_u32x4(sum[0]);
+ res[1] = horizontal_add_u32x4(sum[1]);
+ res[2] = horizontal_add_u32x4(sum[2]);
+}
+
+#define SAD_WXH_3D_NEON_DOTPROD(w, h) \
+ void aom_sad##w##x##h##x3d_neon_dotprod(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], \
+ int ref_stride, uint32_t res[4]) { \
+ sad##w##xhx3d_neon_dotprod(src, src_stride, ref, ref_stride, res, (h)); \
+ }
+
+SAD_WXH_3D_NEON_DOTPROD(16, 8)
+SAD_WXH_3D_NEON_DOTPROD(16, 16)
+SAD_WXH_3D_NEON_DOTPROD(16, 32)
+
+SAD_WXH_3D_NEON_DOTPROD(32, 16)
+SAD_WXH_3D_NEON_DOTPROD(32, 32)
+SAD_WXH_3D_NEON_DOTPROD(32, 64)
+
+SAD_WXH_3D_NEON_DOTPROD(64, 32)
+SAD_WXH_3D_NEON_DOTPROD(64, 64)
+SAD_WXH_3D_NEON_DOTPROD(64, 128)
+
+SAD_WXH_3D_NEON_DOTPROD(128, 64)
+SAD_WXH_3D_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_3D_NEON_DOTPROD(16, 4)
+SAD_WXH_3D_NEON_DOTPROD(16, 64)
+SAD_WXH_3D_NEON_DOTPROD(32, 8)
+SAD_WXH_3D_NEON_DOTPROD(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_3D_NEON_DOTPROD
+
+static INLINE void sadwxhx4d_large_neon_dotprod(const uint8_t *src,
+ int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4],
+ int w, int h) {
+ uint32x4_t sum_lo[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum_hi[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint32x4_t sum[4];
+
+ int ref_offset = 0;
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ const uint8x16_t s0 = vld1q_u8(src + j);
+ sad16_neon(s0, vld1q_u8(ref[0] + ref_offset + j), &sum_lo[0]);
+ sad16_neon(s0, vld1q_u8(ref[1] + ref_offset + j), &sum_lo[1]);
+ sad16_neon(s0, vld1q_u8(ref[2] + ref_offset + j), &sum_lo[2]);
+ sad16_neon(s0, vld1q_u8(ref[3] + ref_offset + j), &sum_lo[3]);
+
+ const uint8x16_t s1 = vld1q_u8(src + j + 16);
+ sad16_neon(s1, vld1q_u8(ref[0] + ref_offset + j + 16), &sum_hi[0]);
+ sad16_neon(s1, vld1q_u8(ref[1] + ref_offset + j + 16), &sum_hi[1]);
+ sad16_neon(s1, vld1q_u8(ref[2] + ref_offset + j + 16), &sum_hi[2]);
+ sad16_neon(s1, vld1q_u8(ref[3] + ref_offset + j + 16), &sum_hi[3]);
+
+ j += 32;
+ } while (j < w);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (--i != 0);
+
+ sum[0] = vaddq_u32(sum_lo[0], sum_hi[0]);
+ sum[1] = vaddq_u32(sum_lo[1], sum_hi[1]);
+ sum[2] = vaddq_u32(sum_lo[2], sum_hi[2]);
+ sum[3] = vaddq_u32(sum_lo[3], sum_hi[3]);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+
+static INLINE void sad128xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ sadwxhx4d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 128, h);
+}
+
+static INLINE void sad64xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ sadwxhx4d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 64, h);
+}
+
+static INLINE void sad32xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ sadwxhx4d_large_neon_dotprod(src, src_stride, ref, ref_stride, res, 32, h);
+}
+
+static INLINE void sad16xhx4d_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride, uint32_t res[4],
+ int h) {
+ uint32x4_t sum[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+
+ int ref_offset = 0;
+ int i = h;
+ do {
+ const uint8x16_t s = vld1q_u8(src);
+ sad16_neon(s, vld1q_u8(ref[0] + ref_offset), &sum[0]);
+ sad16_neon(s, vld1q_u8(ref[1] + ref_offset), &sum[1]);
+ sad16_neon(s, vld1q_u8(ref[2] + ref_offset), &sum[2]);
+ sad16_neon(s, vld1q_u8(ref[3] + ref_offset), &sum[3]);
+
+ src += src_stride;
+ ref_offset += ref_stride;
+ } while (--i != 0);
+
+ vst1q_u32(res, horizontal_add_4d_u32x4(sum));
+}
+
+#define SAD_WXH_4D_NEON_DOTPROD(w, h) \
+ void aom_sad##w##x##h##x4d_neon_dotprod(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], \
+ int ref_stride, uint32_t res[4]) { \
+ sad##w##xhx4d_neon_dotprod(src, src_stride, ref, ref_stride, res, (h)); \
+ }
+
+SAD_WXH_4D_NEON_DOTPROD(16, 8)
+SAD_WXH_4D_NEON_DOTPROD(16, 16)
+SAD_WXH_4D_NEON_DOTPROD(16, 32)
+
+SAD_WXH_4D_NEON_DOTPROD(32, 16)
+SAD_WXH_4D_NEON_DOTPROD(32, 32)
+SAD_WXH_4D_NEON_DOTPROD(32, 64)
+
+SAD_WXH_4D_NEON_DOTPROD(64, 32)
+SAD_WXH_4D_NEON_DOTPROD(64, 64)
+SAD_WXH_4D_NEON_DOTPROD(64, 128)
+
+SAD_WXH_4D_NEON_DOTPROD(128, 64)
+SAD_WXH_4D_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_WXH_4D_NEON_DOTPROD(16, 4)
+SAD_WXH_4D_NEON_DOTPROD(16, 64)
+SAD_WXH_4D_NEON_DOTPROD(32, 8)
+SAD_WXH_4D_NEON_DOTPROD(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_WXH_4D_NEON_DOTPROD
+
+#define SAD_SKIP_WXH_4D_NEON_DOTPROD(w, h) \
+ void aom_sad_skip_##w##x##h##x4d_neon_dotprod( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref[4], \
+ int ref_stride, uint32_t res[4]) { \
+ sad##w##xhx4d_neon_dotprod(src, 2 * src_stride, ref, 2 * ref_stride, res, \
+ ((h) >> 1)); \
+ res[0] <<= 1; \
+ res[1] <<= 1; \
+ res[2] <<= 1; \
+ res[3] <<= 1; \
+ }
+
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 8)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 16)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 32)
+
+SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 16)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 32)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 64)
+
+SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 32)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 64)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 128)
+
+SAD_SKIP_WXH_4D_NEON_DOTPROD(128, 64)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(128, 128)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 4)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(16, 64)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(32, 8)
+SAD_SKIP_WXH_4D_NEON_DOTPROD(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SAD_SKIP_WXH_4D_NEON_DOTPROD
diff --git a/third_party/aom/aom_dsp/arm/sse_neon.c b/third_party/aom/aom_dsp/arm/sse_neon.c
new file mode 100644
index 0000000000..ec8f0ee183
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sse_neon.c
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
+static INLINE void sse_16x1_neon(const uint8_t *src, const uint8_t *ref,
+ uint32x4_t *sse) {
+ uint8x16_t s = vld1q_u8(src);
+ uint8x16_t r = vld1q_u8(ref);
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+ uint8x8_t abs_diff_lo = vget_low_u8(abs_diff);
+ uint8x8_t abs_diff_hi = vget_high_u8(abs_diff);
+
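+ // (s - r)^2 == |s - r| * |s - r|, so the 8-bit absolute differences can be
+ // squared with an unsigned widening multiply and then accumulated.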
+ *sse = vpadalq_u16(*sse, vmull_u8(abs_diff_lo, abs_diff_lo));
+ *sse = vpadalq_u16(*sse, vmull_u8(abs_diff_hi, abs_diff_hi));
+}
+
+static INLINE void sse_8x1_neon(const uint8_t *src, const uint8_t *ref,
+ uint32x4_t *sse) {
+ uint8x8_t s = vld1_u8(src);
+ uint8x8_t r = vld1_u8(ref);
+
+ uint8x8_t abs_diff = vabd_u8(s, r);
+
+ *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff));
+}
+
+static INLINE void sse_4x2_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ uint32x4_t *sse) {
+ uint8x8_t s = load_unaligned_u8(src, src_stride);
+ uint8x8_t r = load_unaligned_u8(ref, ref_stride);
+
+ uint8x8_t abs_diff = vabd_u8(s, r);
+
+ *sse = vpadalq_u16(*sse, vmull_u8(abs_diff, abs_diff));
+}
+
+static INLINE uint32_t sse_wxh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int width, int height) {
+ uint32x4_t sse = vdupq_n_u32(0);
+
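+ // Widths that are not a multiple of 8 (in practice a remainder of 4) are
+ // handled two rows at a time: full 8-pixel chunks first, then a single 4x2
+ // block covering the leftover columns of both rows.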
+ if ((width & 0x07) && ((width & 0x07) < 5)) {
+ int i = height;
+ do {
+ int j = 0;
+ do {
+ sse_8x1_neon(src + j, ref + j, &sse);
+ sse_8x1_neon(src + j + src_stride, ref + j + ref_stride, &sse);
+ j += 8;
+ } while (j + 4 < width);
+
+ sse_4x2_neon(src + j, src_stride, ref + j, ref_stride, &sse);
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+ } else {
+ int i = height;
+ do {
+ int j = 0;
+ do {
+ sse_8x1_neon(src + j, ref + j, &sse);
+ j += 8;
+ } while (j < width);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+ }
+ return horizontal_add_u32x4(sse);
+}
+
+static INLINE uint32_t sse_128xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = height;
+ do {
+ sse_16x1_neon(src, ref, &sse[0]);
+ sse_16x1_neon(src + 16, ref + 16, &sse[1]);
+ sse_16x1_neon(src + 32, ref + 32, &sse[0]);
+ sse_16x1_neon(src + 48, ref + 48, &sse[1]);
+ sse_16x1_neon(src + 64, ref + 64, &sse[0]);
+ sse_16x1_neon(src + 80, ref + 80, &sse[1]);
+ sse_16x1_neon(src + 96, ref + 96, &sse[0]);
+ sse_16x1_neon(src + 112, ref + 112, &sse[1]);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_64xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = height;
+ do {
+ sse_16x1_neon(src, ref, &sse[0]);
+ sse_16x1_neon(src + 16, ref + 16, &sse[1]);
+ sse_16x1_neon(src + 32, ref + 32, &sse[0]);
+ sse_16x1_neon(src + 48, ref + 48, &sse[1]);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_32xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = height;
+ do {
+ sse_16x1_neon(src, ref, &sse[0]);
+ sse_16x1_neon(src + 16, ref + 16, &sse[1]);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_16xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = height;
+ do {
+ sse_16x1_neon(src, ref, &sse[0]);
+ src += src_stride;
+ ref += ref_stride;
+ sse_16x1_neon(src, ref, &sse[1]);
+ src += src_stride;
+ ref += ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_8xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse = vdupq_n_u32(0);
+
+ int i = height;
+ do {
+ sse_8x1_neon(src, ref, &sse);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(sse);
+}
+
+static INLINE uint32_t sse_4xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse = vdupq_n_u32(0);
+
+ int i = height;
+ do {
+ sse_4x2_neon(src, src_stride, ref, ref_stride, &sse);
+
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ return horizontal_add_u32x4(sse);
+}
+
+int64_t aom_sse_neon(const uint8_t *src, int src_stride, const uint8_t *ref,
+ int ref_stride, int width, int height) {
+ switch (width) {
+ case 4: return sse_4xh_neon(src, src_stride, ref, ref_stride, height);
+ case 8: return sse_8xh_neon(src, src_stride, ref, ref_stride, height);
+ case 16: return sse_16xh_neon(src, src_stride, ref, ref_stride, height);
+ case 32: return sse_32xh_neon(src, src_stride, ref, ref_stride, height);
+ case 64: return sse_64xh_neon(src, src_stride, ref, ref_stride, height);
+ case 128: return sse_128xh_neon(src, src_stride, ref, ref_stride, height);
+ default:
+ return sse_wxh_neon(src, src_stride, ref, ref_stride, width, height);
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/sse_neon_dotprod.c b/third_party/aom/aom_dsp/arm/sse_neon_dotprod.c
new file mode 100644
index 0000000000..979049780b
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sse_neon_dotprod.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+
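+// These kernels square the absolute differences with the Armv8 dot-product
+// extension (UDOT): each instruction accumulates four 8-bit products straight
+// into a 32-bit lane, so no intermediate widening is needed.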
+static INLINE void sse_16x1_neon_dotprod(const uint8_t *src, const uint8_t *ref,
+ uint32x4_t *sse) {
+ uint8x16_t s = vld1q_u8(src);
+ uint8x16_t r = vld1q_u8(ref);
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+
+ *sse = vdotq_u32(*sse, abs_diff, abs_diff);
+}
+
+static INLINE void sse_8x1_neon_dotprod(const uint8_t *src, const uint8_t *ref,
+ uint32x2_t *sse) {
+ uint8x8_t s = vld1_u8(src);
+ uint8x8_t r = vld1_u8(ref);
+
+ uint8x8_t abs_diff = vabd_u8(s, r);
+
+ *sse = vdot_u32(*sse, abs_diff, abs_diff);
+}
+
+static INLINE void sse_4x2_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ uint32x2_t *sse) {
+ uint8x8_t s = load_unaligned_u8(src, src_stride);
+ uint8x8_t r = load_unaligned_u8(ref, ref_stride);
+
+ uint8x8_t abs_diff = vabd_u8(s, r);
+
+ *sse = vdot_u32(*sse, abs_diff, abs_diff);
+}
+
+static INLINE uint32_t sse_wxh_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int width, int height) {
+ uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) };
+
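+  // Widths with a remainder of 1-4 modulo 8 cover the 4-wide tail with the
+  // 4x2 kernel over two rows; all other widths are processed 8 pixels at a
+  // time.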
+ if ((width & 0x07) && ((width & 0x07) < 5)) {
+ int i = height;
+ do {
+ int j = 0;
+ do {
+ sse_8x1_neon_dotprod(src + j, ref + j, &sse[0]);
+ sse_8x1_neon_dotprod(src + j + src_stride, ref + j + ref_stride,
+ &sse[1]);
+ j += 8;
+ } while (j + 4 < width);
+
+ sse_4x2_neon_dotprod(src + j, src_stride, ref + j, ref_stride, &sse[0]);
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+ } else {
+ int i = height;
+ do {
+ int j = 0;
+ do {
+ sse_8x1_neon_dotprod(src + j, ref + j, &sse[0]);
+ sse_8x1_neon_dotprod(src + j + src_stride, ref + j + ref_stride,
+ &sse[1]);
+ j += 8;
+ } while (j < width);
+
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+ }
+ return horizontal_add_u32x4(vcombine_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_128xh_neon_dotprod(const uint8_t *src,
+ int src_stride,
+ const uint8_t *ref,
+ int ref_stride, int height) {
+ uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = height;
+ do {
+ sse_16x1_neon_dotprod(src, ref, &sse[0]);
+ sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]);
+ sse_16x1_neon_dotprod(src + 32, ref + 32, &sse[0]);
+ sse_16x1_neon_dotprod(src + 48, ref + 48, &sse[1]);
+ sse_16x1_neon_dotprod(src + 64, ref + 64, &sse[0]);
+ sse_16x1_neon_dotprod(src + 80, ref + 80, &sse[1]);
+ sse_16x1_neon_dotprod(src + 96, ref + 96, &sse[0]);
+ sse_16x1_neon_dotprod(src + 112, ref + 112, &sse[1]);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_64xh_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = height;
+ do {
+ sse_16x1_neon_dotprod(src, ref, &sse[0]);
+ sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]);
+ sse_16x1_neon_dotprod(src + 32, ref + 32, &sse[0]);
+ sse_16x1_neon_dotprod(src + 48, ref + 48, &sse[1]);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_32xh_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = height;
+ do {
+ sse_16x1_neon_dotprod(src, ref, &sse[0]);
+ sse_16x1_neon_dotprod(src + 16, ref + 16, &sse[1]);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_16xh_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x4_t sse[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = height;
+ do {
+ sse_16x1_neon_dotprod(src, ref, &sse[0]);
+ src += src_stride;
+ ref += ref_stride;
+ sse_16x1_neon_dotprod(src, ref, &sse[1]);
+ src += src_stride;
+ ref += ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ return horizontal_add_u32x4(vaddq_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_8xh_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x2_t sse[2] = { vdup_n_u32(0), vdup_n_u32(0) };
+
+ int i = height;
+ do {
+ sse_8x1_neon_dotprod(src, ref, &sse[0]);
+ src += src_stride;
+ ref += ref_stride;
+ sse_8x1_neon_dotprod(src, ref, &sse[1]);
+ src += src_stride;
+ ref += ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ return horizontal_add_u32x4(vcombine_u32(sse[0], sse[1]));
+}
+
+static INLINE uint32_t sse_4xh_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int height) {
+ uint32x2_t sse = vdup_n_u32(0);
+
+ int i = height;
+ do {
+ sse_4x2_neon_dotprod(src, src_stride, ref, ref_stride, &sse);
+
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ return horizontal_add_u32x2(sse);
+}
+
+int64_t aom_sse_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int width,
+ int height) {
+ switch (width) {
+ case 4:
+ return sse_4xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+ case 8:
+ return sse_8xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+ case 16:
+ return sse_16xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+ case 32:
+ return sse_32xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+ case 64:
+ return sse_64xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+ case 128:
+ return sse_128xh_neon_dotprod(src, src_stride, ref, ref_stride, height);
+ default:
+ return sse_wxh_neon_dotprod(src, src_stride, ref, ref_stride, width,
+ height);
+ }
+}
diff --git a/third_party/aom/aom_dsp/arm/subpel_variance_neon.c b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c
new file mode 100644
index 0000000000..2e6e738853
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/subpel_variance_neon.c
@@ -0,0 +1,1103 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/variance.h"
+#include "aom_dsp/arm/dist_wtd_avg_neon.h"
+#include "aom_dsp/arm/mem_neon.h"
+
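+// Bilinear filter helpers: each output pixel is
+// (s0 * (8 - offset) + s1 * offset + 4) >> 3, where s1 lies pixel_step bytes
+// from s0 (1 for horizontal filtering, the row stride for vertical).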
+static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+ uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
+ uint16x8_t blend = vmull_u8(s0, f0);
+ blend = vmlal_u8(blend, s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+ vst1_u8(dst_ptr, blend_u8);
+
+ src_ptr += 2 * src_stride;
+ dst_ptr += 2 * 4;
+ i -= 2;
+ } while (i != 0);
+}
+
+static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = vld1_u8(src_ptr);
+ uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
+ uint16x8_t blend = vmull_u8(s0, f0);
+ blend = vmlal_u8(blend, s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+ vst1_u8(dst_ptr, blend_u8);
+
+ src_ptr += src_stride;
+ dst_ptr += 8;
+ } while (--i != 0);
+}
+
+static void var_filter_block2d_bil_large(const uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_width,
+ int dst_height, int filter_offset) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint16x8_t blend_l = vmull_u8(vget_low_u8(s0), f0);
+ blend_l = vmlal_u8(blend_l, vget_low_u8(s1), f1);
+ uint16x8_t blend_h = vmull_u8(vget_high_u8(s0), f0);
+ blend_h = vmlal_u8(blend_h, vget_high_u8(s1), f1);
+ uint8x16_t blend_u8 =
+ vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));
+ vst1q_u8(dst_ptr + j, blend_u8);
+
+ j += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 16,
+ dst_height, filter_offset);
+}
+
+static void var_filter_block2d_bil_w32(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 32,
+ dst_height, filter_offset);
+}
+
+static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_height, int filter_offset) {
+ var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 64,
+ dst_height, filter_offset);
+}
+
+static void var_filter_block2d_bil_w128(const uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset) {
+ var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 128,
+ dst_height, filter_offset);
+}
+
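+// A filter offset of 4 (half-pel) makes both taps equal, so the bilinear
+// filter reduces to a rounding average of each pixel with its neighbour
+// pixel_step bytes away.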
+static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int pixel_step,
+ int dst_width, int dst_height) {
+  // We only specialise on the filter values for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint8x16_t avg = vrhaddq_u8(s0, s1);
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
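+// Sub-pixel variance: a horizontal bilinear pass writes (h + padding) rows
+// into tmp0, a vertical pass filters tmp0 into tmp1, and the full-pixel
+// variance kernel is then run on tmp1 against the reference block.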
+#define SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int aom_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ }
+
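+// Specialised form used for the larger block sizes: an offset of 0 needs no
+// filtering, an offset of 4 (half-pel) uses the rounding-average path, and
+// only the remaining offsets fall back to the general bilinear kernel.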
+#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int aom_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, unsigned int *sse) { \
+ if (xoffset == 0) { \
+ if (yoffset == 0) { \
+ return aom_variance##w##x##h(src, src_stride, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp[w * h]; \
+ var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h); \
+ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h, \
+ yoffset); \
+ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \
+ return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \
+ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \
+ return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
+
+SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
+SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)
+
+SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
+SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
+SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)
+
+SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)
+
+// Realtime mode doesn't use 4x rectangular blocks.
+#if !CONFIG_REALTIME_ONLY
+
+SUBPEL_VARIANCE_WXH_NEON(4, 16, 2)
+
+SUBPEL_VARIANCE_WXH_NEON(8, 32, 1)
+
+SUBPEL_VARIANCE_WXH_NEON(16, 4, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1)
+
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1)
+
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SUBPEL_VARIANCE_WXH_NEON
+#undef SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON
+
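+// The avg_pred variants below fuse the final filtering pass with averaging
+// against second_pred, so the compound prediction does not need a separate
+// pass over the block.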
+// Combine bilinear filter with aom_comp_avg_pred for blocks having width 4.
+static void avg_pred_var_filter_block2d_bil_w4(const uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset,
+ const uint8_t *second_pred) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+ uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
+ uint16x8_t blend = vmull_u8(s0, f0);
+ blend = vmlal_u8(blend, s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+
+ uint8x8_t p = vld1_u8(second_pred);
+ uint8x8_t avg = vrhadd_u8(blend_u8, p);
+
+ vst1_u8(dst_ptr, avg);
+
+ src_ptr += 2 * src_stride;
+ dst_ptr += 2 * 4;
+ second_pred += 2 * 4;
+ i -= 2;
+ } while (i != 0);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 4.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_w4(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x8_t fwd_offset = vdup_n_u8(jcp_param->fwd_offset);
+ const uint8x8_t bck_offset = vdup_n_u8(jcp_param->bck_offset);
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+ uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
+ uint8x8_t p = vld1_u8(second_pred);
+ uint16x8_t blend = vmull_u8(s0, f0);
+ blend = vmlal_u8(blend, s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+ uint8x8_t avg = dist_wtd_avg_u8x8(blend_u8, p, fwd_offset, bck_offset);
+
+ vst1_u8(dst_ptr, avg);
+
+ src_ptr += 2 * src_stride;
+ dst_ptr += 2 * 4;
+ second_pred += 2 * 4;
+ i -= 2;
+ } while (i != 0);
+}
+
+// Combine bilinear filter with aom_comp_avg_pred for blocks having width 8.
+static void avg_pred_var_filter_block2d_bil_w8(const uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_height,
+ int filter_offset,
+ const uint8_t *second_pred) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = vld1_u8(src_ptr);
+ uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
+ uint16x8_t blend = vmull_u8(s0, f0);
+ blend = vmlal_u8(blend, s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+
+ uint8x8_t p = vld1_u8(second_pred);
+ uint8x8_t avg = vrhadd_u8(blend_u8, p);
+
+ vst1_u8(dst_ptr, avg);
+
+ src_ptr += src_stride;
+ dst_ptr += 8;
+ second_pred += 8;
+ } while (--i > 0);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 8.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_w8(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x8_t fwd_offset = vdup_n_u8(jcp_param->fwd_offset);
+ const uint8x8_t bck_offset = vdup_n_u8(jcp_param->bck_offset);
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ uint8x8_t s0 = vld1_u8(src_ptr);
+ uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
+ uint8x8_t p = vld1_u8(second_pred);
+ uint16x8_t blend = vmull_u8(s0, f0);
+ blend = vmlal_u8(blend, s1, f1);
+ uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
+ uint8x8_t avg = dist_wtd_avg_u8x8(blend_u8, p, fwd_offset, bck_offset);
+
+ vst1_u8(dst_ptr, avg);
+
+ src_ptr += src_stride;
+ dst_ptr += 8;
+ second_pred += 8;
+ } while (--i > 0);
+}
+
+// Combine bilinear filter with aom_comp_avg_pred for large blocks.
+static void avg_pred_var_filter_block2d_bil_large(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, int filter_offset,
+ const uint8_t *second_pred) {
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint16x8_t blend_l = vmull_u8(vget_low_u8(s0), f0);
+ blend_l = vmlal_u8(blend_l, vget_low_u8(s1), f1);
+ uint16x8_t blend_h = vmull_u8(vget_high_u8(s0), f0);
+ blend_h = vmlal_u8(blend_h, vget_high_u8(s1), f1);
+ uint8x16_t blend_u8 =
+ vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));
+
+ uint8x16_t p = vld1q_u8(second_pred);
+ uint8x16_t avg = vrhaddq_u8(blend_u8, p);
+
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ second_pred += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for large blocks.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, int filter_offset,
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) {
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+ const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
+ const uint8x8_t f1 = vdup_n_u8(filter_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint16x8_t blend_l = vmull_u8(vget_low_u8(s0), f0);
+ blend_l = vmlal_u8(blend_l, vget_low_u8(s1), f1);
+ uint16x8_t blend_h = vmull_u8(vget_high_u8(s0), f0);
+ blend_h = vmlal_u8(blend_h, vget_high_u8(s1), f1);
+ uint8x16_t blend_u8 =
+ vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));
+
+ uint8x16_t p = vld1q_u8(second_pred);
+ uint8x16_t avg = dist_wtd_avg_u8x16(blend_u8, p, fwd_offset, bck_offset);
+
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ second_pred += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+// Combine bilinear filter with aom_comp_avg_pred for blocks having width 16.
+static void avg_pred_var_filter_block2d_bil_w16(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred) {
+ avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 16, dst_height,
+ filter_offset, second_pred);
+}
+
+// Combine bilinear filter with aom_comp_avg_pred for blocks having width 32.
+static void avg_pred_var_filter_block2d_bil_w32(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred) {
+ avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 32, dst_height,
+ filter_offset, second_pred);
+}
+
+// Combine bilinear filter with aom_comp_avg_pred for blocks having width 64.
+static void avg_pred_var_filter_block2d_bil_w64(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred) {
+ avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 64, dst_height,
+ filter_offset, second_pred);
+}
+
+// Combine bilinear filter with aom_comp_avg_pred for blocks having width 128.
+static void avg_pred_var_filter_block2d_bil_w128(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred) {
+ avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
+ pixel_step, 128, dst_height,
+ filter_offset, second_pred);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 16.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_w16(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 16, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 32.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_w32(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 32, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 64.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_w64(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 64, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+// Combine bilinear filter with aom_dist_wtd_comp_avg_pred for blocks having
+// width 128.
+static void dist_wtd_avg_pred_var_filter_block2d_bil_w128(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_height, int filter_offset, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ dist_wtd_avg_pred_var_filter_block2d_bil_large(
+ src_ptr, dst_ptr, src_stride, pixel_step, 128, dst_height, filter_offset,
+ second_pred, jcp_param);
+}
+
+// Combine averaging subpel filter with aom_comp_avg_pred.
+static void avg_pred_var_filter_block2d_avg(const uint8_t *src_ptr,
+ uint8_t *dst_ptr, int src_stride,
+ int pixel_step, int dst_width,
+ int dst_height,
+ const uint8_t *second_pred) {
+  // We only specialise on the filter values for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint8x16_t avg = vrhaddq_u8(s0, s1);
+
+ uint8x16_t p = vld1q_u8(second_pred);
+ avg = vrhaddq_u8(avg, p);
+
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ second_pred += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+// Combine averaging subpel filter with aom_dist_wtd_comp_avg_pred.
+static void dist_wtd_avg_pred_var_filter_block2d_avg(
+ const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
+ int dst_width, int dst_height, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+  // We only specialise on the filter values for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr + j);
+ uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
+ uint8x16_t p = vld1q_u8(second_pred);
+ uint8x16_t avg = vrhaddq_u8(s0, s1);
+ avg = dist_wtd_avg_u8x16(avg, p, fwd_offset, bck_offset);
+
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ second_pred += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+// Implementation of aom_comp_avg_pred for blocks having width >= 16.
+static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride,
+ int dst_width, int dst_height,
+ const uint8_t *second_pred) {
+  // We only specialise for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s = vld1q_u8(src_ptr + j);
+ uint8x16_t p = vld1q_u8(second_pred);
+
+ uint8x16_t avg = vrhaddq_u8(s, p);
+
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ second_pred += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
+// Implementation of aom_dist_wtd_comp_avg_pred for blocks having width >= 16.
+static void dist_wtd_avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr,
+ int src_stride, int dst_width, int dst_height,
+ const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+  // We only specialise for large block sizes (>= 16x16).
+ assert(dst_width >= 16 && dst_width % 16 == 0);
+ const uint8x16_t fwd_offset = vdupq_n_u8(jcp_param->fwd_offset);
+ const uint8x16_t bck_offset = vdupq_n_u8(jcp_param->bck_offset);
+
+ int i = dst_height;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s = vld1q_u8(src_ptr + j);
+ uint8x16_t p = vld1q_u8(second_pred);
+
+ uint8x16_t avg = dist_wtd_avg_u8x16(s, p, fwd_offset, bck_offset);
+
+ vst1q_u8(dst_ptr + j, avg);
+
+ j += 16;
+ second_pred += 16;
+ } while (j < dst_width);
+
+ src_ptr += src_stride;
+ dst_ptr += dst_width;
+ } while (--i != 0);
+}
+
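+// aom_sub_pixel_avg_variance: same two-pass structure as above, but the
+// second pass also averages the filtered block with second_pred before the
+// variance is computed.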
+#define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int aom_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, (h + padding), \
+ xoffset); \
+ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \
+ second_pred); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ }
+
+#define SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int aom_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, unsigned int *sse, \
+ const uint8_t *second_pred) { \
+ if (xoffset == 0) { \
+ uint8_t tmp[w * h]; \
+ if (yoffset == 0) { \
+ avg_pred(src, tmp, source_stride, w, h, second_pred); \
+ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ avg_pred_var_filter_block2d_avg(src, tmp, source_stride, \
+ source_stride, w, h, second_pred); \
+ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
+ } else { \
+ avg_pred_var_filter_block2d_bil_w##w( \
+ src, tmp, source_stride, source_stride, h, yoffset, second_pred); \
+ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ avg_pred_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h, \
+ second_pred); \
+ return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
+ avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
+ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \
+ second_pred); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ avg_pred_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h, \
+ xoffset, second_pred); \
+ return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \
+ (h + padding), xoffset); \
+ avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \
+ (h + padding), xoffset); \
+ avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset, \
+ second_pred); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
+
+SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2)
+SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2)
+
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1)
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1)
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1)
+
+SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 128, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 64, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 128, 1)
+
+#if !CONFIG_REALTIME_ONLY
+
+SUBPEL_AVG_VARIANCE_WXH_NEON(4, 16, 2)
+
+SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 1)
+
+SUBPEL_AVG_VARIANCE_WXH_NEON(16, 4, 1)
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 64, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 8, 1)
+
+SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 16, 1)
+
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef SUBPEL_AVG_VARIANCE_WXH_NEON
+#undef SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON
+
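+// Distance-weighted compound variants: the filtered block is combined with
+// second_pred using the fwd_offset/bck_offset weights from
+// DIST_WTD_COMP_PARAMS rather than a plain rounding average.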
+#define DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int aom_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, (h + padding), \
+ xoffset); \
+ dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, second_pred, jcp_param); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ }
+
+#define SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int aom_dist_wtd_sub_pixel_avg_variance##w##x##h##_neon( \
+ const uint8_t *src, int source_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, unsigned int *sse, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ if (xoffset == 0) { \
+ uint8_t tmp[w * h]; \
+ if (yoffset == 0) { \
+ dist_wtd_avg_pred(src, tmp, source_stride, w, h, second_pred, \
+ jcp_param); \
+ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ dist_wtd_avg_pred_var_filter_block2d_avg(src, tmp, source_stride, \
+ source_stride, w, h, \
+ second_pred, jcp_param); \
+ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
+ } else { \
+ dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ src, tmp, source_stride, source_stride, h, yoffset, second_pred, \
+ jcp_param); \
+ return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ dist_wtd_avg_pred_var_filter_block2d_avg( \
+ src, tmp0, source_stride, 1, w, h, second_pred, jcp_param); \
+ return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
+ dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, \
+ second_pred, jcp_param); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(src, tmp0, source_stride, 1, w, (h + padding)); \
+ dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, second_pred, jcp_param); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ src, tmp0, source_stride, 1, h, xoffset, second_pred, jcp_param); \
+ return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \
+ (h + padding), xoffset); \
+ dist_wtd_avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, \
+ second_pred, jcp_param); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, \
+ (h + padding), xoffset); \
+ dist_wtd_avg_pred_var_filter_block2d_bil_w##w( \
+ tmp0, tmp1, w, w, h, yoffset, second_pred, jcp_param); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
+
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2)
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2)
+
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1)
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1)
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1)
+
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1)
+
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1)
+
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 128, 1)
+
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 64, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 128, 1)
+
+#if !CONFIG_REALTIME_ONLY
+
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(4, 16, 2)
+
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 1)
+
+DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 4, 1)
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 64, 1)
+
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 8, 1)
+
+SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 16, 1)
+
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON
+#undef SPECIALIZED_DIST_WTD_SUBPEL_AVG_VARIANCE_WXH_NEON
+
+#if !CONFIG_REALTIME_ONLY
+
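+// OBMC sub-pixel variance: the same filtering pipeline produces the
+// prediction, which aom_obmc_variance then compares against the weighted
+// source (wsrc) under the per-pixel mask.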
+#define OBMC_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int aom_obmc_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(pre, tmp0, pre_stride, 1, h + padding, \
+ xoffset); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_obmc_variance##w##x##h(tmp1, w, wsrc, mask, sse); \
+ }
+
+#define SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int aom_obmc_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ if (xoffset == 0) { \
+ if (yoffset == 0) { \
+ return aom_obmc_variance##w##x##h##_neon(pre, pre_stride, wsrc, mask, \
+ sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp[w * h]; \
+ var_filter_block2d_avg(pre, tmp, pre_stride, pre_stride, w, h); \
+ return aom_obmc_variance##w##x##h##_neon(tmp, w, wsrc, mask, sse); \
+ } else { \
+ uint8_t tmp[w * h]; \
+ var_filter_block2d_bil_w##w(pre, tmp, pre_stride, pre_stride, h, \
+ yoffset); \
+ return aom_obmc_variance##w##x##h##_neon(tmp, w, wsrc, mask, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ var_filter_block2d_avg(pre, tmp0, pre_stride, 1, w, h); \
+ return aom_obmc_variance##w##x##h##_neon(tmp0, w, wsrc, mask, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(pre, tmp0, pre_stride, 1, w, h + padding); \
+ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_obmc_variance##w##x##h##_neon(tmp1, w, wsrc, mask, sse); \
+ } else { \
+ uint8_t tmp1[w * (h + padding)]; \
+ var_filter_block2d_avg(pre, tmp0, pre_stride, 1, w, h + padding); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_obmc_variance##w##x##h##_neon(tmp1, w, wsrc, mask, sse); \
+ } \
+ } else { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ var_filter_block2d_bil_w##w(pre, tmp0, pre_stride, 1, h, xoffset); \
+ return aom_obmc_variance##w##x##h##_neon(tmp0, w, wsrc, mask, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(pre, tmp0, pre_stride, 1, h + padding, \
+ xoffset); \
+ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ return aom_obmc_variance##w##x##h##_neon(tmp1, w, wsrc, mask, sse); \
+ } else { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(pre, tmp0, pre_stride, 1, h + padding, \
+ xoffset); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ return aom_obmc_variance##w##x##h##_neon(tmp1, w, wsrc, mask, sse); \
+ } \
+ } \
+ }
+
+OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 16, 2)
+
+OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 1)
+
+OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 4, 1)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1)
+
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1)
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)
+
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1)
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1)
+
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1)
+SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)
+
+#undef OBMC_SUBPEL_VARIANCE_WXH_NEON
+#undef SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON
+#endif // !CONFIG_REALTIME_ONLY
+
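+// Masked sub-pixel variance: the filtered block is blended with second_pred
+// via aom_comp_mask_pred_neon (per-pixel mask, optionally inverted) before
+// the variance against the reference is taken.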
+#define MASKED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int aom_masked_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ uint8_t tmp1[w * h]; \
+ uint8_t tmp2[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, msk_stride, \
+ invert_mask); \
+ return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse); \
+ }
+
+#define SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding) \
+ unsigned int aom_masked_sub_pixel_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ if (xoffset == 0) { \
+ uint8_t tmp0[w * h]; \
+ if (yoffset == 0) { \
+ aom_comp_mask_pred_neon(tmp0, second_pred, w, h, src, src_stride, msk, \
+ msk_stride, invert_mask); \
+ return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_avg(src, tmp0, src_stride, src_stride, w, h); \
+ aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk, \
+ msk_stride, invert_mask); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, src_stride, h, \
+ yoffset); \
+ aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk, \
+ msk_stride, invert_mask); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } \
+ } else if (xoffset == 4) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ if (yoffset == 0) { \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h); \
+ aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk, \
+ msk_stride, invert_mask); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp1[w * h]; \
+ uint8_t tmp2[w * h]; \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \
+ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, \
+ msk_stride, invert_mask); \
+ return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp1[w * h]; \
+ uint8_t tmp2[w * h]; \
+ var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding)); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, \
+ msk_stride, invert_mask); \
+ return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse); \
+ } \
+ } else { \
+ if (yoffset == 0) { \
+ uint8_t tmp0[w * h]; \
+ uint8_t tmp1[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset); \
+ aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk, \
+ msk_stride, invert_mask); \
+ return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse); \
+ } else if (yoffset == 4) { \
+ uint8_t tmp0[w * (h + padding)]; \
+ uint8_t tmp1[w * h]; \
+ uint8_t tmp2[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \
+ aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, \
+ msk_stride, invert_mask); \
+ return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse); \
+ } else { \
+ uint8_t tmp0[w * (h + padding)]; \
+ uint8_t tmp1[w * (h + padding)]; \
+ uint8_t tmp2[w * h]; \
+ var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
+ xoffset); \
+ var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \
+ aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, \
+ msk_stride, invert_mask); \
+ return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse); \
+ } \
+ } \
+ }
+
+MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
+MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)
+
+MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
+MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
+MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)
+
+MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
+
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)
+
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1)
+
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)
+
+// Realtime mode doesn't use 4x rectangular blocks.
+#if !CONFIG_REALTIME_ONLY
+MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 16, 2)
+MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 1)
+MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 4, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef MASKED_SUBPEL_VARIANCE_WXH_NEON
+#undef SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON
diff --git a/third_party/aom/aom_dsp/arm/subtract_neon.c b/third_party/aom/aom_dsp/arm/subtract_neon.c
new file mode 100644
index 0000000000..a195c40d19
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/subtract_neon.c
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
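+// Computes the 16-bit residual diff = src - pred for a whole block,
+// processing rows 32, 16 or 8 pixels at a time and falling back to scalar
+// code for 4-wide blocks.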
+void aom_subtract_block_neon(int rows, int cols, int16_t *diff,
+ ptrdiff_t diff_stride, const uint8_t *src,
+ ptrdiff_t src_stride, const uint8_t *pred,
+ ptrdiff_t pred_stride) {
+ if (cols > 16) {
+ int r = rows;
+ do {
+ int c = 0;
+ do {
+ const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]);
+ const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]);
+ const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]);
+ const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]);
+ const uint16x8_t v_diff_lo_00 =
+ vsubl_u8(vget_low_u8(v_src_00), vget_low_u8(v_pred_00));
+ const uint16x8_t v_diff_hi_00 =
+ vsubl_u8(vget_high_u8(v_src_00), vget_high_u8(v_pred_00));
+ const uint16x8_t v_diff_lo_16 =
+ vsubl_u8(vget_low_u8(v_src_16), vget_low_u8(v_pred_16));
+ const uint16x8_t v_diff_hi_16 =
+ vsubl_u8(vget_high_u8(v_src_16), vget_high_u8(v_pred_16));
+ vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00));
+ vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00));
+ vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16));
+ vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16));
+ c += 32;
+ } while (c < cols);
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r != 0);
+ } else if (cols > 8) {
+ int r = rows;
+ do {
+ const uint8x16_t v_src = vld1q_u8(&src[0]);
+ const uint8x16_t v_pred = vld1q_u8(&pred[0]);
+ const uint16x8_t v_diff_lo =
+ vsubl_u8(vget_low_u8(v_src), vget_low_u8(v_pred));
+ const uint16x8_t v_diff_hi =
+ vsubl_u8(vget_high_u8(v_src), vget_high_u8(v_pred));
+ vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo));
+ vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi));
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r != 0);
+ } else if (cols > 4) {
+ int r = rows;
+ do {
+ const uint8x8_t v_src = vld1_u8(&src[0]);
+ const uint8x8_t v_pred = vld1_u8(&pred[0]);
+ const uint16x8_t v_diff = vsubl_u8(v_src, v_pred);
+ vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff));
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r != 0);
+ } else {
+ int r = rows;
+ do {
+ int c = 0;
+ do {
+ diff[c] = src[c] - pred[c];
+ } while (++c < cols);
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r != 0);
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_subtract_block_neon(int rows, int cols, int16_t *diff,
+ ptrdiff_t diff_stride, const uint8_t *src8,
+ ptrdiff_t src_stride, const uint8_t *pred8,
+ ptrdiff_t pred_stride) {
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+
+ if (cols > 16) {
+ int r = rows;
+ do {
+ int c = 0;
+ do {
+ const uint16x8_t v_src_00 = vld1q_u16(&src[c + 0]);
+ const uint16x8_t v_pred_00 = vld1q_u16(&pred[c + 0]);
+ const uint16x8_t v_diff_00 = vsubq_u16(v_src_00, v_pred_00);
+ const uint16x8_t v_src_08 = vld1q_u16(&src[c + 8]);
+ const uint16x8_t v_pred_08 = vld1q_u16(&pred[c + 8]);
+ const uint16x8_t v_diff_08 = vsubq_u16(v_src_08, v_pred_08);
+ vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_00));
+ vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_08));
+ c += 16;
+ } while (c < cols);
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r != 0);
+ } else if (cols > 8) {
+ int r = rows;
+ do {
+ const uint16x8_t v_src_00 = vld1q_u16(&src[0]);
+ const uint16x8_t v_pred_00 = vld1q_u16(&pred[0]);
+ const uint16x8_t v_diff_00 = vsubq_u16(v_src_00, v_pred_00);
+ const uint16x8_t v_src_08 = vld1q_u16(&src[8]);
+ const uint16x8_t v_pred_08 = vld1q_u16(&pred[8]);
+ const uint16x8_t v_diff_08 = vsubq_u16(v_src_08, v_pred_08);
+ vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_00));
+ vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_08));
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ } while (--r != 0);
+ } else if (cols > 4) {
+ int r = rows;
+ do {
+ const uint16x8_t v_src_r0 = vld1q_u16(&src[0]);
+ const uint16x8_t v_src_r1 = vld1q_u16(&src[src_stride]);
+ const uint16x8_t v_pred_r0 = vld1q_u16(&pred[0]);
+ const uint16x8_t v_pred_r1 = vld1q_u16(&pred[pred_stride]);
+ const uint16x8_t v_diff_r0 = vsubq_u16(v_src_r0, v_pred_r0);
+ const uint16x8_t v_diff_r1 = vsubq_u16(v_src_r1, v_pred_r1);
+ vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_r0));
+ vst1q_s16(&diff[diff_stride], vreinterpretq_s16_u16(v_diff_r1));
+ diff += diff_stride << 1;
+ pred += pred_stride << 1;
+ src += src_stride << 1;
+ r -= 2;
+ } while (r != 0);
+ } else {
+ int r = rows;
+ do {
+ const uint16x4_t v_src_r0 = vld1_u16(&src[0]);
+ const uint16x4_t v_src_r1 = vld1_u16(&src[src_stride]);
+ const uint16x4_t v_pred_r0 = vld1_u16(&pred[0]);
+ const uint16x4_t v_pred_r1 = vld1_u16(&pred[pred_stride]);
+ const uint16x4_t v_diff_r0 = vsub_u16(v_src_r0, v_pred_r0);
+ const uint16x4_t v_diff_r1 = vsub_u16(v_src_r1, v_pred_r1);
+ vst1_s16(&diff[0], vreinterpret_s16_u16(v_diff_r0));
+ vst1_s16(&diff[diff_stride], vreinterpret_s16_u16(v_diff_r1));
+ diff += diff_stride << 1;
+ pred += pred_stride << 1;
+ src += src_stride << 1;
+ r -= 2;
+ } while (r != 0);
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/arm/sum_neon.h b/third_party/aom/aom_dsp/arm/sum_neon.h
new file mode 100644
index 0000000000..30a108e70a
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sum_neon.h
@@ -0,0 +1,311 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_SUM_NEON_H_
+#define AOM_AOM_DSP_ARM_SUM_NEON_H_
+
+#include "config/aom_dsp_rtcd.h"
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
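+// Horizontal (across-vector) reduction helpers. On AArch64 these map onto
+// single ADDV/ADDLV-class instructions; the Armv7-A fallbacks build the same
+// results from pairwise additions.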
+static INLINE int horizontal_add_u8x8(const uint8x8_t a) {
+#if AOM_ARCH_AARCH64
+ return vaddlv_u8(a);
+#else
+ uint16x4_t b = vpaddl_u8(a);
+ uint32x2_t c = vpaddl_u16(b);
+ return vget_lane_u32(c, 0) + vget_lane_u32(c, 1);
+#endif
+}
+
+static INLINE int horizontal_add_s16x8(const int16x8_t a) {
+#if AOM_ARCH_AARCH64
+ return vaddlvq_s16(a);
+#else
+ const int32x4_t b = vpaddlq_s16(a);
+ const int64x2_t c = vpaddlq_s32(b);
+ const int32x2_t d = vadd_s32(vreinterpret_s32_s64(vget_low_s64(c)),
+ vreinterpret_s32_s64(vget_high_s64(c)));
+ return vget_lane_s32(d, 0);
+#endif
+}
+
+static INLINE int horizontal_add_s32x4(const int32x4_t a) {
+#if AOM_ARCH_AARCH64
+ return vaddvq_s32(a);
+#else
+ const int64x2_t b = vpaddlq_s32(a);
+ const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+ vreinterpret_s32_s64(vget_high_s64(b)));
+ return vget_lane_s32(c, 0);
+#endif
+}
+
+static INLINE int64_t horizontal_add_s64x2(const int64x2_t a) {
+#if AOM_ARCH_AARCH64
+ return vaddvq_s64(a);
+#else
+ return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1);
+#endif
+}
+
+static INLINE uint64_t horizontal_add_u64x2(const uint64x2_t a) {
+#if AOM_ARCH_AARCH64
+ return vaddvq_u64(a);
+#else
+ return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1);
+#endif
+}
+
+static INLINE uint64_t horizontal_long_add_u32x4(const uint32x4_t a) {
+#if AOM_ARCH_AARCH64
+ return vaddlvq_u32(a);
+#else
+ const uint64x2_t b = vpaddlq_u32(a);
+ return vgetq_lane_u64(b, 0) + vgetq_lane_u64(b, 1);
+#endif
+}
+
+static INLINE int64_t horizontal_long_add_s32x4(const int32x4_t a) {
+#if AOM_ARCH_AARCH64
+ return vaddlvq_s32(a);
+#else
+ const int64x2_t b = vpaddlq_s32(a);
+ return vgetq_lane_s64(b, 0) + vgetq_lane_s64(b, 1);
+#endif
+}
+
+static INLINE uint32_t horizontal_add_u32x4(const uint32x4_t a) {
+#if AOM_ARCH_AARCH64
+ return vaddvq_u32(a);
+#else
+ const uint64x2_t b = vpaddlq_u32(a);
+ const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+ vreinterpret_u32_u64(vget_high_u64(b)));
+ return vget_lane_u32(c, 0);
+#endif
+}
+
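+// 4-way reductions: lane i of the result holds the horizontal sum of sum[i],
+// so four independent accumulators can be reduced in a single call.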
+static INLINE uint32x4_t horizontal_add_4d_u32x4(const uint32x4_t sum[4]) {
+#if AOM_ARCH_AARCH64
+ uint32x4_t res01 = vpaddq_u32(sum[0], sum[1]);
+ uint32x4_t res23 = vpaddq_u32(sum[2], sum[3]);
+ return vpaddq_u32(res01, res23);
+#else
+ uint32x4_t res = vdupq_n_u32(0);
+ res = vsetq_lane_u32(horizontal_add_u32x4(sum[0]), res, 0);
+ res = vsetq_lane_u32(horizontal_add_u32x4(sum[1]), res, 1);
+ res = vsetq_lane_u32(horizontal_add_u32x4(sum[2]), res, 2);
+ res = vsetq_lane_u32(horizontal_add_u32x4(sum[3]), res, 3);
+ return res;
+#endif
+}
+
+static INLINE int32x4_t horizontal_add_4d_s32x4(const int32x4_t sum[4]) {
+#if AOM_ARCH_AARCH64
+ int32x4_t res01 = vpaddq_s32(sum[0], sum[1]);
+ int32x4_t res23 = vpaddq_s32(sum[2], sum[3]);
+ return vpaddq_s32(res01, res23);
+#else
+ int32x4_t res = vdupq_n_s32(0);
+ res = vsetq_lane_s32(horizontal_add_s32x4(sum[0]), res, 0);
+ res = vsetq_lane_s32(horizontal_add_s32x4(sum[1]), res, 1);
+ res = vsetq_lane_s32(horizontal_add_s32x4(sum[2]), res, 2);
+ res = vsetq_lane_s32(horizontal_add_s32x4(sum[3]), res, 3);
+ return res;
+#endif
+}
+
+static INLINE uint32_t horizontal_long_add_u16x8(const uint16x8_t vec_lo,
+ const uint16x8_t vec_hi) {
+#if AOM_ARCH_AARCH64
+ return vaddlvq_u16(vec_lo) + vaddlvq_u16(vec_hi);
+#else
+ const uint32x4_t vec_l_lo =
+ vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
+ const uint32x4_t vec_l_hi =
+ vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
+ const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
+ const uint64x2_t b = vpaddlq_u32(a);
+ const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
+ vreinterpret_u32_u64(vget_high_u64(b)));
+ return vget_lane_u32(c, 0);
+#endif
+}
+
+static INLINE uint32x4_t horizontal_long_add_4d_u16x8(
+ const uint16x8_t sum_lo[4], const uint16x8_t sum_hi[4]) {
+ const uint32x4_t a0 = vpaddlq_u16(sum_lo[0]);
+ const uint32x4_t a1 = vpaddlq_u16(sum_lo[1]);
+ const uint32x4_t a2 = vpaddlq_u16(sum_lo[2]);
+ const uint32x4_t a3 = vpaddlq_u16(sum_lo[3]);
+ const uint32x4_t b0 = vpadalq_u16(a0, sum_hi[0]);
+ const uint32x4_t b1 = vpadalq_u16(a1, sum_hi[1]);
+ const uint32x4_t b2 = vpadalq_u16(a2, sum_hi[2]);
+ const uint32x4_t b3 = vpadalq_u16(a3, sum_hi[3]);
+#if AOM_ARCH_AARCH64
+ const uint32x4_t c0 = vpaddq_u32(b0, b1);
+ const uint32x4_t c1 = vpaddq_u32(b2, b3);
+ return vpaddq_u32(c0, c1);
+#else
+ const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0));
+ const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1));
+ const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2));
+ const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3));
+ const uint32x2_t d0 = vpadd_u32(c0, c1);
+ const uint32x2_t d1 = vpadd_u32(c2, c3);
+ return vcombine_u32(d0, d1);
+#endif
+}
+
+static INLINE uint32_t horizontal_add_u16x8(const uint16x8_t a) {
+#if AOM_ARCH_AARCH64
+ return vaddlvq_u16(a);
+#else
+ const uint32x4_t b = vpaddlq_u16(a);
+ const uint64x2_t c = vpaddlq_u32(b);
+ const uint32x2_t d = vadd_u32(vreinterpret_u32_u64(vget_low_u64(c)),
+ vreinterpret_u32_u64(vget_high_u64(c)));
+ return vget_lane_u32(d, 0);
+#endif
+}
+
+static INLINE uint32x4_t horizontal_add_4d_u16x8(const uint16x8_t sum[4]) {
+#if AOM_ARCH_AARCH64
+ const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
+ const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
+ const uint16x8_t b0 = vpaddq_u16(a0, a1);
+ return vpaddlq_u16(b0);
+#else
+ const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
+ const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
+ const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
+ const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3]));
+ const uint16x4_t b0 = vpadd_u16(a0, a1);
+ const uint16x4_t b1 = vpadd_u16(a2, a3);
+ return vpaddlq_u16(vcombine_u16(b0, b1));
+#endif
+}
+
+static INLINE int32x4_t horizontal_add_4d_s16x8(const int16x8_t sum[4]) {
+#if AOM_ARCH_AARCH64
+ const int16x8_t a0 = vpaddq_s16(sum[0], sum[1]);
+ const int16x8_t a1 = vpaddq_s16(sum[2], sum[3]);
+ const int16x8_t b0 = vpaddq_s16(a0, a1);
+ return vpaddlq_s16(b0);
+#else
+ const int16x4_t a0 = vadd_s16(vget_low_s16(sum[0]), vget_high_s16(sum[0]));
+ const int16x4_t a1 = vadd_s16(vget_low_s16(sum[1]), vget_high_s16(sum[1]));
+ const int16x4_t a2 = vadd_s16(vget_low_s16(sum[2]), vget_high_s16(sum[2]));
+ const int16x4_t a3 = vadd_s16(vget_low_s16(sum[3]), vget_high_s16(sum[3]));
+ const int16x4_t b0 = vpadd_s16(a0, a1);
+ const int16x4_t b1 = vpadd_s16(a2, a3);
+ return vpaddlq_s16(vcombine_s16(b0, b1));
+#endif
+}
+
+static INLINE uint32_t horizontal_add_u32x2(const uint32x2_t a) {
+#if AOM_ARCH_AARCH64
+ return vaddv_u32(a);
+#else
+ const uint64x1_t b = vpaddl_u32(a);
+ return vget_lane_u32(vreinterpret_u32_u64(b), 0);
+#endif
+}
+
+static INLINE uint64_t horizontal_long_add_u32x2(const uint32x2_t a) {
+#if AOM_ARCH_AARCH64
+ return vaddlv_u32(a);
+#else
+ const uint64x1_t b = vpaddl_u32(a);
+ return vget_lane_u64(b, 0);
+#endif
+}
+
+static INLINE uint32_t horizontal_add_u16x4(const uint16x4_t a) {
+#if AOM_ARCH_AARCH64
+ return vaddlv_u16(a);
+#else
+ const uint32x2_t b = vpaddl_u16(a);
+ const uint64x1_t c = vpaddl_u32(b);
+ return vget_lane_u32(vreinterpret_u32_u64(c), 0);
+#endif
+}
+
+static INLINE int32x4_t horizontal_add_2d_s32(int32x4_t a, int32x4_t b) {
+#if AOM_ARCH_AARCH64
+ return vpaddq_s32(a, b);
+#else
+ const int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
+ const int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
+ return vcombine_s32(a0, b0);
+#endif
+}
+
+static INLINE int32x2_t add_pairwise_s32x4(int32x4_t a) {
+#if AOM_ARCH_AARCH64
+ return vget_low_s32(vpaddq_s32(a, a));
+#else
+ return vpadd_s32(vget_low_s32(a), vget_high_s32(a));
+#endif
+}
+
+static INLINE uint64_t horizontal_long_add_u32x4_x2(const uint32x4_t a[2]) {
+ return horizontal_long_add_u32x4(a[0]) + horizontal_long_add_u32x4(a[1]);
+}
+
+static INLINE uint64_t horizontal_long_add_u32x4_x4(const uint32x4_t a[4]) {
+ uint64x2_t sum = vpaddlq_u32(a[0]);
+ sum = vpadalq_u32(sum, a[1]);
+ sum = vpadalq_u32(sum, a[2]);
+ sum = vpadalq_u32(sum, a[3]);
+
+ return horizontal_add_u64x2(sum);
+}
+
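+// The _x8 and _x16 variants below interleave two accumulators, which likely
+// helps hide the latency of the pairwise add-accumulate instructions by
+// shortening the dependency chain.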
+static INLINE uint64_t horizontal_long_add_u32x4_x8(const uint32x4_t a[8]) {
+ uint64x2_t sum[2];
+ sum[0] = vpaddlq_u32(a[0]);
+ sum[1] = vpaddlq_u32(a[1]);
+ sum[0] = vpadalq_u32(sum[0], a[2]);
+ sum[1] = vpadalq_u32(sum[1], a[3]);
+ sum[0] = vpadalq_u32(sum[0], a[4]);
+ sum[1] = vpadalq_u32(sum[1], a[5]);
+ sum[0] = vpadalq_u32(sum[0], a[6]);
+ sum[1] = vpadalq_u32(sum[1], a[7]);
+
+ return horizontal_add_u64x2(vaddq_u64(sum[0], sum[1]));
+}
+
+static INLINE uint64_t horizontal_long_add_u32x4_x16(const uint32x4_t a[16]) {
+ uint64x2_t sum[2];
+ sum[0] = vpaddlq_u32(a[0]);
+ sum[1] = vpaddlq_u32(a[1]);
+ sum[0] = vpadalq_u32(sum[0], a[2]);
+ sum[1] = vpadalq_u32(sum[1], a[3]);
+ sum[0] = vpadalq_u32(sum[0], a[4]);
+ sum[1] = vpadalq_u32(sum[1], a[5]);
+ sum[0] = vpadalq_u32(sum[0], a[6]);
+ sum[1] = vpadalq_u32(sum[1], a[7]);
+ sum[0] = vpadalq_u32(sum[0], a[8]);
+ sum[1] = vpadalq_u32(sum[1], a[9]);
+ sum[0] = vpadalq_u32(sum[0], a[10]);
+ sum[1] = vpadalq_u32(sum[1], a[11]);
+ sum[0] = vpadalq_u32(sum[0], a[12]);
+ sum[1] = vpadalq_u32(sum[1], a[13]);
+ sum[0] = vpadalq_u32(sum[0], a[14]);
+ sum[1] = vpadalq_u32(sum[1], a[15]);
+
+ return horizontal_add_u64x2(vaddq_u64(sum[0], sum[1]));
+}
+
+#endif // AOM_AOM_DSP_ARM_SUM_NEON_H_
diff --git a/third_party/aom/aom_dsp/arm/sum_squares_neon.c b/third_party/aom/aom_dsp/arm/sum_squares_neon.c
new file mode 100644
index 0000000000..424b2b4445
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sum_squares_neon.c
@@ -0,0 +1,574 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE uint64_t aom_sum_squares_2d_i16_4x4_neon(const int16_t *src,
+ int stride) {
+ int16x4_t s0 = vld1_s16(src + 0 * stride);
+ int16x4_t s1 = vld1_s16(src + 1 * stride);
+ int16x4_t s2 = vld1_s16(src + 2 * stride);
+ int16x4_t s3 = vld1_s16(src + 3 * stride);
+
+ int32x4_t sum_squares = vmull_s16(s0, s0);
+ sum_squares = vmlal_s16(sum_squares, s1, s1);
+ sum_squares = vmlal_s16(sum_squares, s2, s2);
+ sum_squares = vmlal_s16(sum_squares, s3, s3);
+
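+  // Squares are non-negative, so reinterpret the signed accumulator as
+  // unsigned before the widening horizontal add; lane values above INT32_MAX
+  // are then summed correctly.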
+ return horizontal_long_add_u32x4(vreinterpretq_u32_s32(sum_squares));
+}
+
+static INLINE uint64_t aom_sum_squares_2d_i16_4xn_neon(const int16_t *src,
+ int stride, int height) {
+ int32x4_t sum_squares[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ int h = height;
+ do {
+ int16x4_t s0 = vld1_s16(src + 0 * stride);
+ int16x4_t s1 = vld1_s16(src + 1 * stride);
+ int16x4_t s2 = vld1_s16(src + 2 * stride);
+ int16x4_t s3 = vld1_s16(src + 3 * stride);
+
+ sum_squares[0] = vmlal_s16(sum_squares[0], s0, s0);
+ sum_squares[0] = vmlal_s16(sum_squares[0], s1, s1);
+ sum_squares[1] = vmlal_s16(sum_squares[1], s2, s2);
+ sum_squares[1] = vmlal_s16(sum_squares[1], s3, s3);
+
+ src += 4 * stride;
+ h -= 4;
+ } while (h != 0);
+
+ return horizontal_long_add_u32x4(
+ vreinterpretq_u32_s32(vaddq_s32(sum_squares[0], sum_squares[1])));
+}
+
+static INLINE uint64_t aom_sum_squares_2d_i16_nxn_neon(const int16_t *src,
+ int stride, int width,
+ int height) {
+ uint64x2_t sum_squares = vdupq_n_u64(0);
+
+ int h = height;
+ do {
+ int32x4_t ss_row[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ int w = 0;
+ do {
+ const int16_t *s = src + w;
+ int16x8_t s0 = vld1q_s16(s + 0 * stride);
+ int16x8_t s1 = vld1q_s16(s + 1 * stride);
+ int16x8_t s2 = vld1q_s16(s + 2 * stride);
+ int16x8_t s3 = vld1q_s16(s + 3 * stride);
+
+ ss_row[0] = vmlal_s16(ss_row[0], vget_low_s16(s0), vget_low_s16(s0));
+ ss_row[0] = vmlal_s16(ss_row[0], vget_low_s16(s1), vget_low_s16(s1));
+ ss_row[0] = vmlal_s16(ss_row[0], vget_low_s16(s2), vget_low_s16(s2));
+ ss_row[0] = vmlal_s16(ss_row[0], vget_low_s16(s3), vget_low_s16(s3));
+ ss_row[1] = vmlal_s16(ss_row[1], vget_high_s16(s0), vget_high_s16(s0));
+ ss_row[1] = vmlal_s16(ss_row[1], vget_high_s16(s1), vget_high_s16(s1));
+ ss_row[1] = vmlal_s16(ss_row[1], vget_high_s16(s2), vget_high_s16(s2));
+ ss_row[1] = vmlal_s16(ss_row[1], vget_high_s16(s3), vget_high_s16(s3));
+ w += 8;
+ } while (w < width);
+
+ sum_squares = vpadalq_u32(
+ sum_squares, vreinterpretq_u32_s32(vaddq_s32(ss_row[0], ss_row[1])));
+
+ src += 4 * stride;
+ h -= 4;
+ } while (h != 0);
+
+ return horizontal_add_u64x2(sum_squares);
+}
+
+uint64_t aom_sum_squares_2d_i16_neon(const int16_t *src, int stride, int width,
+ int height) {
+  // 4 elements per row only requires half a SIMD register, so this case is
+  // handled separately; note that over 75% of all calls are with size == 4,
+  // so it is also the common case.
+ if (LIKELY(width == 4 && height == 4)) {
+ return aom_sum_squares_2d_i16_4x4_neon(src, stride);
+ } else if (LIKELY(width == 4 && (height & 3) == 0)) {
+ return aom_sum_squares_2d_i16_4xn_neon(src, stride, height);
+ } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) {
+ // Generic case
+ return aom_sum_squares_2d_i16_nxn_neon(src, stride, width, height);
+ } else {
+ return aom_sum_squares_2d_i16_c(src, stride, width, height);
+ }
+}
+
+static INLINE uint64_t aom_sum_sse_2d_i16_4x4_neon(const int16_t *src,
+ int stride, int *sum) {
+ int16x4_t s0 = vld1_s16(src + 0 * stride);
+ int16x4_t s1 = vld1_s16(src + 1 * stride);
+ int16x4_t s2 = vld1_s16(src + 2 * stride);
+ int16x4_t s3 = vld1_s16(src + 3 * stride);
+
+ int32x4_t sse = vmull_s16(s0, s0);
+ sse = vmlal_s16(sse, s1, s1);
+ sse = vmlal_s16(sse, s2, s2);
+ sse = vmlal_s16(sse, s3, s3);
+
+ int32x4_t sum_01 = vaddl_s16(s0, s1);
+ int32x4_t sum_23 = vaddl_s16(s2, s3);
+ *sum += horizontal_add_s32x4(vaddq_s32(sum_01, sum_23));
+
+ return horizontal_long_add_u32x4(vreinterpretq_u32_s32(sse));
+}
+
+static INLINE uint64_t aom_sum_sse_2d_i16_4xn_neon(const int16_t *src,
+ int stride, int height,
+ int *sum) {
+ int32x4_t sse[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ int32x2_t sum_acc[2] = { vdup_n_s32(0), vdup_n_s32(0) };
+
+ int h = height;
+ do {
+ int16x4_t s0 = vld1_s16(src + 0 * stride);
+ int16x4_t s1 = vld1_s16(src + 1 * stride);
+ int16x4_t s2 = vld1_s16(src + 2 * stride);
+ int16x4_t s3 = vld1_s16(src + 3 * stride);
+
+ sse[0] = vmlal_s16(sse[0], s0, s0);
+ sse[0] = vmlal_s16(sse[0], s1, s1);
+ sse[1] = vmlal_s16(sse[1], s2, s2);
+ sse[1] = vmlal_s16(sse[1], s3, s3);
+
+ sum_acc[0] = vpadal_s16(sum_acc[0], s0);
+ sum_acc[0] = vpadal_s16(sum_acc[0], s1);
+ sum_acc[1] = vpadal_s16(sum_acc[1], s2);
+ sum_acc[1] = vpadal_s16(sum_acc[1], s3);
+
+ src += 4 * stride;
+ h -= 4;
+ } while (h != 0);
+
+ *sum += horizontal_add_s32x4(vcombine_s32(sum_acc[0], sum_acc[1]));
+ return horizontal_long_add_u32x4(
+ vreinterpretq_u32_s32(vaddq_s32(sse[0], sse[1])));
+}
+
+static INLINE uint64_t aom_sum_sse_2d_i16_nxn_neon(const int16_t *src,
+ int stride, int width,
+ int height, int *sum) {
+ uint64x2_t sse = vdupq_n_u64(0);
+ int32x4_t sum_acc = vdupq_n_s32(0);
+
+ int h = height;
+ do {
+ int32x4_t sse_row[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+ int w = 0;
+ do {
+ const int16_t *s = src + w;
+ int16x8_t s0 = vld1q_s16(s + 0 * stride);
+ int16x8_t s1 = vld1q_s16(s + 1 * stride);
+ int16x8_t s2 = vld1q_s16(s + 2 * stride);
+ int16x8_t s3 = vld1q_s16(s + 3 * stride);
+
+ sse_row[0] = vmlal_s16(sse_row[0], vget_low_s16(s0), vget_low_s16(s0));
+ sse_row[0] = vmlal_s16(sse_row[0], vget_low_s16(s1), vget_low_s16(s1));
+ sse_row[0] = vmlal_s16(sse_row[0], vget_low_s16(s2), vget_low_s16(s2));
+ sse_row[0] = vmlal_s16(sse_row[0], vget_low_s16(s3), vget_low_s16(s3));
+ sse_row[1] = vmlal_s16(sse_row[1], vget_high_s16(s0), vget_high_s16(s0));
+ sse_row[1] = vmlal_s16(sse_row[1], vget_high_s16(s1), vget_high_s16(s1));
+ sse_row[1] = vmlal_s16(sse_row[1], vget_high_s16(s2), vget_high_s16(s2));
+ sse_row[1] = vmlal_s16(sse_row[1], vget_high_s16(s3), vget_high_s16(s3));
+
+ sum_acc = vpadalq_s16(sum_acc, s0);
+ sum_acc = vpadalq_s16(sum_acc, s1);
+ sum_acc = vpadalq_s16(sum_acc, s2);
+ sum_acc = vpadalq_s16(sum_acc, s3);
+
+ w += 8;
+ } while (w < width);
+
+ sse = vpadalq_u32(sse,
+ vreinterpretq_u32_s32(vaddq_s32(sse_row[0], sse_row[1])));
+
+ src += 4 * stride;
+ h -= 4;
+ } while (h != 0);
+
+ *sum += horizontal_add_s32x4(sum_acc);
+ return horizontal_add_u64x2(sse);
+}
+
+uint64_t aom_sum_sse_2d_i16_neon(const int16_t *src, int stride, int width,
+ int height, int *sum) {
+ uint64_t sse;
+
+ if (LIKELY(width == 4 && height == 4)) {
+ sse = aom_sum_sse_2d_i16_4x4_neon(src, stride, sum);
+ } else if (LIKELY(width == 4 && (height & 3) == 0)) {
+ // width = 4, height is a multiple of 4.
+ sse = aom_sum_sse_2d_i16_4xn_neon(src, stride, height, sum);
+ } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) {
+ // Generic case - width is multiple of 8, height is multiple of 4.
+ sse = aom_sum_sse_2d_i16_nxn_neon(src, stride, width, height, sum);
+ } else {
+ sse = aom_sum_sse_2d_i16_c(src, stride, width, height, sum);
+ }
+
+ return sse;
+}
+
+static INLINE uint64_t aom_sum_squares_i16_4xn_neon(const int16_t *src,
+ uint32_t n) {
+ uint64x2_t sum_u64 = vdupq_n_u64(0);
+
+ int i = n;
+ do {
+ uint32x4_t sum;
+ int16x4_t s0 = vld1_s16(src);
+
+ sum = vreinterpretq_u32_s32(vmull_s16(s0, s0));
+
+ sum_u64 = vpadalq_u32(sum_u64, sum);
+
+ src += 4;
+ i -= 4;
+ } while (i >= 4);
+
+ if (i > 0) {
+ return horizontal_add_u64x2(sum_u64) + aom_sum_squares_i16_c(src, i);
+ }
+ return horizontal_add_u64x2(sum_u64);
+}
+
+static INLINE uint64_t aom_sum_squares_i16_8xn_neon(const int16_t *src,
+ uint32_t n) {
+ uint64x2_t sum_u64[2] = { vdupq_n_u64(0), vdupq_n_u64(0) };
+
+ int i = n;
+ do {
+ uint32x4_t sum[2];
+ int16x8_t s0 = vld1q_s16(src);
+
+ sum[0] =
+ vreinterpretq_u32_s32(vmull_s16(vget_low_s16(s0), vget_low_s16(s0)));
+ sum[1] =
+ vreinterpretq_u32_s32(vmull_s16(vget_high_s16(s0), vget_high_s16(s0)));
+
+ sum_u64[0] = vpadalq_u32(sum_u64[0], sum[0]);
+ sum_u64[1] = vpadalq_u32(sum_u64[1], sum[1]);
+
+ src += 8;
+ i -= 8;
+ } while (i >= 8);
+
+ if (i > 0) {
+ return horizontal_add_u64x2(vaddq_u64(sum_u64[0], sum_u64[1])) +
+ aom_sum_squares_i16_c(src, i);
+ }
+ return horizontal_add_u64x2(vaddq_u64(sum_u64[0], sum_u64[1]));
+}
+
+uint64_t aom_sum_squares_i16_neon(const int16_t *src, uint32_t n) {
+ // This function seems to be called only for values of N >= 64. See
+ // av1/encoder/compound_type.c.
+ if (LIKELY(n >= 8)) {
+ return aom_sum_squares_i16_8xn_neon(src, n);
+ }
+ if (n >= 4) {
+ return aom_sum_squares_i16_4xn_neon(src, n);
+ }
+ return aom_sum_squares_i16_c(src, n);
+}
+
+static INLINE uint64_t aom_var_2d_u8_4xh_neon(uint8_t *src, int src_stride,
+ int width, int height) {
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ uint32x2_t sum_u32 = vdup_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  // 255*256 = 65280, so we can accumulate up to 256 8-bit elements in a
+  // 16-bit element before we need to accumulate to 32-bit elements. Since
+  // we're accumulating in uint16x4_t vectors, this means we can accumulate up
+  // to 4 rows of 256 elements. Therefore the limit can be computed as:
+  // h_limit = (4 * 256) / width.
+ int h_limit = (4 * 256) / width;
+ int h_tmp = height > h_limit ? h_limit : height;
+
+ int h = 0;
+ do {
+ uint16x4_t sum_u16 = vdup_n_u16(0);
+ do {
+ uint8_t *src_ptr = src;
+ int w = width;
+ do {
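+        // load_unaligned_u8() gathers 4 bytes from each of two consecutive
+        // rows into one 8-byte vector, which is why the outer loop advances
+        // two rows at a time.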
+ uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+
+ sum_u16 = vpadal_u8(sum_u16, s0);
+
+ uint16x8_t sse_u16 = vmull_u8(s0, s0);
+
+ sse_u32 = vpadalq_u16(sse_u32, sse_u16);
+
+ src_ptr += 8;
+ w -= 8;
+ } while (w >= 8);
+
+ // Process remaining columns in the row using C.
+ while (w > 0) {
+ int idx = width - w;
+ const uint8_t v = src[idx];
+ sum += v;
+ sse += v * v;
+ w--;
+ }
+
+ src += 2 * src_stride;
+ h += 2;
+ } while (h < h_tmp && h < height);
+
+ sum_u32 = vpadal_u16(sum_u32, sum_u16);
+ h_tmp += h_limit;
+ } while (h < height);
+
+ sum += horizontal_long_add_u32x2(sum_u32);
+ sse += horizontal_long_add_u32x4(sse_u32);
+
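+  // Return the sum of squared deviations: sse - sum^2 / N (integer division),
+  // which equals sum((x - mean)^2) up to rounding.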
+ return sse - sum * sum / (width * height);
+}
+
+static INLINE uint64_t aom_var_2d_u8_8xh_neon(uint8_t *src, int src_stride,
+ int width, int height) {
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ uint32x2_t sum_u32 = vdup_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+  // 255*256 = 65280, so we can accumulate up to 256 8-bit elements in a
+  // 16-bit element before we need to accumulate to 32-bit elements. Since
+  // we're accumulating in uint16x4_t vectors, this means we can accumulate up
+  // to 4 rows of 256 elements. Therefore the limit can be computed as:
+  // h_limit = (4 * 256) / width.
+ int h_limit = (4 * 256) / width;
+ int h_tmp = height > h_limit ? h_limit : height;
+
+ int h = 0;
+ do {
+ uint16x4_t sum_u16 = vdup_n_u16(0);
+ do {
+ uint8_t *src_ptr = src;
+ int w = width;
+ do {
+ uint8x8_t s0 = vld1_u8(src_ptr);
+
+ sum_u16 = vpadal_u8(sum_u16, s0);
+
+ uint16x8_t sse_u16 = vmull_u8(s0, s0);
+
+ sse_u32 = vpadalq_u16(sse_u32, sse_u16);
+
+ src_ptr += 8;
+ w -= 8;
+ } while (w >= 8);
+
+ // Process remaining columns in the row using C.
+ while (w > 0) {
+ int idx = width - w;
+ const uint8_t v = src[idx];
+ sum += v;
+ sse += v * v;
+ w--;
+ }
+
+ src += src_stride;
+ ++h;
+ } while (h < h_tmp && h < height);
+
+ sum_u32 = vpadal_u16(sum_u32, sum_u16);
+ h_tmp += h_limit;
+ } while (h < height);
+
+ sum += horizontal_long_add_u32x2(sum_u32);
+ sse += horizontal_long_add_u32x4(sse_u32);
+
+ return sse - sum * sum / (width * height);
+}
+
+static INLINE uint64_t aom_var_2d_u8_16xh_neon(uint8_t *src, int src_stride,
+ int width, int height) {
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ uint32x4_t sum_u32 = vdupq_n_u32(0);
+ uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+  // 255*256 = 65280, so we can accumulate up to 256 8-bit elements in a
+  // 16-bit element before we need to accumulate to 32-bit elements. Since
+  // we're accumulating in uint16x8_t vectors, this means we can accumulate up
+  // to 8 rows of 256 elements. Therefore the limit can be computed as:
+  // h_limit = (8 * 256) / width.
+ int h_limit = (8 * 256) / width;
+ int h_tmp = height > h_limit ? h_limit : height;
+
+ int h = 0;
+ do {
+ uint16x8_t sum_u16 = vdupq_n_u16(0);
+ do {
+ int w = width;
+ uint8_t *src_ptr = src;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+
+ sum_u16 = vpadalq_u8(sum_u16, s0);
+
+ uint16x8_t sse_u16_lo = vmull_u8(vget_low_u8(s0), vget_low_u8(s0));
+ uint16x8_t sse_u16_hi = vmull_u8(vget_high_u8(s0), vget_high_u8(s0));
+
+ sse_u32[0] = vpadalq_u16(sse_u32[0], sse_u16_lo);
+ sse_u32[1] = vpadalq_u16(sse_u32[1], sse_u16_hi);
+
+ src_ptr += 16;
+ w -= 16;
+ } while (w >= 16);
+
+ // Process remaining columns in the row using C.
+ while (w > 0) {
+ int idx = width - w;
+ const uint8_t v = src[idx];
+ sum += v;
+ sse += v * v;
+ w--;
+ }
+
+ src += src_stride;
+ ++h;
+ } while (h < h_tmp && h < height);
+
+ sum_u32 = vpadalq_u16(sum_u32, sum_u16);
+ h_tmp += h_limit;
+ } while (h < height);
+
+ sum += horizontal_long_add_u32x4(sum_u32);
+ sse += horizontal_long_add_u32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+
+ return sse - sum * sum / (width * height);
+}
+
+uint64_t aom_var_2d_u8_neon(uint8_t *src, int src_stride, int width,
+ int height) {
+ if (width >= 16) {
+ return aom_var_2d_u8_16xh_neon(src, src_stride, width, height);
+ }
+ if (width >= 8) {
+ return aom_var_2d_u8_8xh_neon(src, src_stride, width, height);
+ }
+ if (width >= 4 && height % 2 == 0) {
+ return aom_var_2d_u8_4xh_neon(src, src_stride, width, height);
+ }
+ return aom_var_2d_u8_c(src, src_stride, width, height);
+}
+
+static INLINE uint64_t aom_var_2d_u16_4xh_neon(uint8_t *src, int src_stride,
+ int width, int height) {
+ uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src);
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ uint32x2_t sum_u32 = vdup_n_u32(0);
+ uint64x2_t sse_u64 = vdupq_n_u64(0);
+
+ int h = height;
+ do {
+ int w = width;
+ uint16_t *src_ptr = src_u16;
+ do {
+ uint16x4_t s0 = vld1_u16(src_ptr);
+
+ sum_u32 = vpadal_u16(sum_u32, s0);
+
+ uint32x4_t sse_u32 = vmull_u16(s0, s0);
+
+ sse_u64 = vpadalq_u32(sse_u64, sse_u32);
+
+ src_ptr += 4;
+ w -= 4;
+ } while (w >= 4);
+
+ // Process remaining columns in the row using C.
+ while (w > 0) {
+ int idx = width - w;
+ const uint16_t v = src_u16[idx];
+ sum += v;
+ sse += v * v;
+ w--;
+ }
+
+ src_u16 += src_stride;
+ } while (--h != 0);
+
+ sum += horizontal_long_add_u32x2(sum_u32);
+ sse += horizontal_add_u64x2(sse_u64);
+
+ return sse - sum * sum / (width * height);
+}
+
+static INLINE uint64_t aom_var_2d_u16_8xh_neon(uint8_t *src, int src_stride,
+ int width, int height) {
+ uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src);
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ uint32x4_t sum_u32 = vdupq_n_u32(0);
+ uint64x2_t sse_u64[2] = { vdupq_n_u64(0), vdupq_n_u64(0) };
+
+ int h = height;
+ do {
+ int w = width;
+ uint16_t *src_ptr = src_u16;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr);
+
+ sum_u32 = vpadalq_u16(sum_u32, s0);
+
+ uint32x4_t sse_u32_lo = vmull_u16(vget_low_u16(s0), vget_low_u16(s0));
+ uint32x4_t sse_u32_hi = vmull_u16(vget_high_u16(s0), vget_high_u16(s0));
+
+ sse_u64[0] = vpadalq_u32(sse_u64[0], sse_u32_lo);
+ sse_u64[1] = vpadalq_u32(sse_u64[1], sse_u32_hi);
+
+ src_ptr += 8;
+ w -= 8;
+ } while (w >= 8);
+
+ // Process remaining columns in the row using C.
+ while (w > 0) {
+ int idx = width - w;
+ const uint16_t v = src_u16[idx];
+ sum += v;
+ sse += v * v;
+ w--;
+ }
+
+ src_u16 += src_stride;
+ } while (--h != 0);
+
+ sum += horizontal_long_add_u32x4(sum_u32);
+ sse += horizontal_add_u64x2(vaddq_u64(sse_u64[0], sse_u64[1]));
+
+ return sse - sum * sum / (width * height);
+}
+
+uint64_t aom_var_2d_u16_neon(uint8_t *src, int src_stride, int width,
+ int height) {
+ if (width >= 8) {
+ return aom_var_2d_u16_8xh_neon(src, src_stride, width, height);
+ }
+ if (width >= 4) {
+ return aom_var_2d_u16_4xh_neon(src, src_stride, width, height);
+ }
+ return aom_var_2d_u16_c(src, src_stride, width, height);
+}
diff --git a/third_party/aom/aom_dsp/arm/sum_squares_neon_dotprod.c b/third_party/aom/aom_dsp/arm/sum_squares_neon_dotprod.c
new file mode 100644
index 0000000000..44462a693c
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sum_squares_neon_dotprod.c
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE uint64_t aom_var_2d_u8_4xh_neon_dotprod(uint8_t *src,
+ int src_stride, int width,
+ int height) {
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ uint32x2_t sum_u32 = vdup_n_u32(0);
+ uint32x2_t sse_u32 = vdup_n_u32(0);
+
+ int h = height / 2;
+ do {
+ int w = width;
+ uint8_t *src_ptr = src;
+ do {
+ uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
+
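+      // Dotting with a vector of ones accumulates the sum of each group of
+      // four bytes; dotting s0 with itself accumulates the sum of squares.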
+ sum_u32 = vdot_u32(sum_u32, s0, vdup_n_u8(1));
+
+ sse_u32 = vdot_u32(sse_u32, s0, s0);
+
+ src_ptr += 8;
+ w -= 8;
+ } while (w >= 8);
+
+ // Process remaining columns in the row using C.
+ while (w > 0) {
+ int idx = width - w;
+ const uint8_t v = src[idx];
+ sum += v;
+ sse += v * v;
+ w--;
+ }
+
+ src += 2 * src_stride;
+ } while (--h != 0);
+
+ sum += horizontal_long_add_u32x2(sum_u32);
+ sse += horizontal_long_add_u32x2(sse_u32);
+
+ return sse - sum * sum / (width * height);
+}
+
+static INLINE uint64_t aom_var_2d_u8_8xh_neon_dotprod(uint8_t *src,
+ int src_stride, int width,
+ int height) {
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ uint32x2_t sum_u32 = vdup_n_u32(0);
+ uint32x2_t sse_u32 = vdup_n_u32(0);
+
+ int h = height;
+ do {
+ int w = width;
+ uint8_t *src_ptr = src;
+ do {
+ uint8x8_t s0 = vld1_u8(src_ptr);
+
+ sum_u32 = vdot_u32(sum_u32, s0, vdup_n_u8(1));
+
+ sse_u32 = vdot_u32(sse_u32, s0, s0);
+
+ src_ptr += 8;
+ w -= 8;
+ } while (w >= 8);
+
+ // Process remaining columns in the row using C.
+ while (w > 0) {
+ int idx = width - w;
+ const uint8_t v = src[idx];
+ sum += v;
+ sse += v * v;
+ w--;
+ }
+
+ src += src_stride;
+ } while (--h != 0);
+
+ sum += horizontal_long_add_u32x2(sum_u32);
+ sse += horizontal_long_add_u32x2(sse_u32);
+
+ return sse - sum * sum / (width * height);
+}
+
+static INLINE uint64_t aom_var_2d_u8_16xh_neon_dotprod(uint8_t *src,
+ int src_stride,
+ int width, int height) {
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ uint32x4_t sum_u32 = vdupq_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int h = height;
+ do {
+ int w = width;
+ uint8_t *src_ptr = src;
+ do {
+ uint8x16_t s0 = vld1q_u8(src_ptr);
+
+ sum_u32 = vdotq_u32(sum_u32, s0, vdupq_n_u8(1));
+
+ sse_u32 = vdotq_u32(sse_u32, s0, s0);
+
+ src_ptr += 16;
+ w -= 16;
+ } while (w >= 16);
+
+ // Process remaining columns in the row using C.
+ while (w > 0) {
+ int idx = width - w;
+ const uint8_t v = src[idx];
+ sum += v;
+ sse += v * v;
+ w--;
+ }
+
+ src += src_stride;
+ } while (--h != 0);
+
+ sum += horizontal_long_add_u32x4(sum_u32);
+ sse += horizontal_long_add_u32x4(sse_u32);
+
+ return sse - sum * sum / (width * height);
+}
+
+uint64_t aom_var_2d_u8_neon_dotprod(uint8_t *src, int src_stride, int width,
+ int height) {
+ if (width >= 16) {
+ return aom_var_2d_u8_16xh_neon_dotprod(src, src_stride, width, height);
+ }
+ if (width >= 8) {
+ return aom_var_2d_u8_8xh_neon_dotprod(src, src_stride, width, height);
+ }
+ if (width >= 4 && height % 2 == 0) {
+ return aom_var_2d_u8_4xh_neon_dotprod(src, src_stride, width, height);
+ }
+ return aom_var_2d_u8_c(src, src_stride, width, height);
+}
diff --git a/third_party/aom/aom_dsp/arm/sum_squares_sve.c b/third_party/aom/aom_dsp/arm/sum_squares_sve.c
new file mode 100644
index 0000000000..724e43859e
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/sum_squares_sve.c
@@ -0,0 +1,402 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/arm/dot_sve.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "config/aom_dsp_rtcd.h"
+
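+// aom_sdotq_s16() and aom_udotq_u16() (defined in dot_sve.h) use the SVE
+// 16-bit dot-product instructions to accumulate four products per 64-bit
+// lane, so squared 16-bit terms are widened to 64 bits in a single step.
+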
+static INLINE uint64_t aom_sum_squares_2d_i16_4xh_sve(const int16_t *src,
+ int stride, int height) {
+ int64x2_t sum_squares = vdupq_n_s64(0);
+
+ do {
+ int16x8_t s = vcombine_s16(vld1_s16(src), vld1_s16(src + stride));
+
+ sum_squares = aom_sdotq_s16(sum_squares, s, s);
+
+ src += 2 * stride;
+ height -= 2;
+ } while (height != 0);
+
+ return (uint64_t)vaddvq_s64(sum_squares);
+}
+
+static INLINE uint64_t aom_sum_squares_2d_i16_8xh_sve(const int16_t *src,
+ int stride, int height) {
+ int64x2_t sum_squares[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+ do {
+ int16x8_t s0 = vld1q_s16(src + 0 * stride);
+ int16x8_t s1 = vld1q_s16(src + 1 * stride);
+
+ sum_squares[0] = aom_sdotq_s16(sum_squares[0], s0, s0);
+ sum_squares[1] = aom_sdotq_s16(sum_squares[1], s1, s1);
+
+ src += 2 * stride;
+ height -= 2;
+ } while (height != 0);
+
+ sum_squares[0] = vaddq_s64(sum_squares[0], sum_squares[1]);
+ return (uint64_t)vaddvq_s64(sum_squares[0]);
+}
+
+static INLINE uint64_t aom_sum_squares_2d_i16_large_sve(const int16_t *src,
+ int stride, int width,
+ int height) {
+ int64x2_t sum_squares[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+
+ do {
+ const int16_t *src_ptr = src;
+ int w = width;
+ do {
+ int16x8_t s0 = vld1q_s16(src_ptr);
+ int16x8_t s1 = vld1q_s16(src_ptr + 8);
+
+ sum_squares[0] = aom_sdotq_s16(sum_squares[0], s0, s0);
+ sum_squares[1] = aom_sdotq_s16(sum_squares[1], s1, s1);
+
+ src_ptr += 16;
+ w -= 16;
+ } while (w != 0);
+
+ src += stride;
+ } while (--height != 0);
+
+ sum_squares[0] = vaddq_s64(sum_squares[0], sum_squares[1]);
+ return (uint64_t)vaddvq_s64(sum_squares[0]);
+}
+
+static INLINE uint64_t aom_sum_squares_2d_i16_wxh_sve(const int16_t *src,
+ int stride, int width,
+ int height) {
+ svint64_t sum_squares = svdup_n_s64(0);
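+  // svcnth() returns the number of 16-bit lanes in an SVE vector, so the
+  // inner loop advances by one full (vector-length dependent) vector.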
+ uint64_t step = svcnth();
+
+ do {
+ const int16_t *src_ptr = src;
+ int w = 0;
+ do {
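+      // svwhilelt_b16 enables only the lanes where w + lane < width, so the
+      // final, possibly partial, vector of each row needs no scalar tail.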
+ svbool_t pred = svwhilelt_b16_u32(w, width);
+ svint16_t s0 = svld1_s16(pred, src_ptr);
+
+ sum_squares = svdot_s64(sum_squares, s0, s0);
+
+ src_ptr += step;
+ w += step;
+ } while (w < width);
+
+ src += stride;
+ } while (--height != 0);
+
+ return (uint64_t)svaddv_s64(svptrue_b64(), sum_squares);
+}
+
+uint64_t aom_sum_squares_2d_i16_sve(const int16_t *src, int stride, int width,
+ int height) {
+ if (width == 4) {
+ return aom_sum_squares_2d_i16_4xh_sve(src, stride, height);
+ }
+ if (width == 8) {
+ return aom_sum_squares_2d_i16_8xh_sve(src, stride, height);
+ }
+ if (width % 16 == 0) {
+ return aom_sum_squares_2d_i16_large_sve(src, stride, width, height);
+ }
+ return aom_sum_squares_2d_i16_wxh_sve(src, stride, width, height);
+}
+
+uint64_t aom_sum_squares_i16_sve(const int16_t *src, uint32_t n) {
+ // This function seems to be called only for values of N >= 64. See
+  // av1/encoder/compound_type.c. Additionally, because N = width x height,
+  // where width and height are taken from the standard block sizes, N will
+  // also be a multiple of 64.
+ if (LIKELY(n % 64 == 0)) {
+ int64x2_t sum[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0),
+ vdupq_n_s64(0) };
+
+ do {
+ int16x8_t s0 = vld1q_s16(src);
+ int16x8_t s1 = vld1q_s16(src + 8);
+ int16x8_t s2 = vld1q_s16(src + 16);
+ int16x8_t s3 = vld1q_s16(src + 24);
+
+ sum[0] = aom_sdotq_s16(sum[0], s0, s0);
+ sum[1] = aom_sdotq_s16(sum[1], s1, s1);
+ sum[2] = aom_sdotq_s16(sum[2], s2, s2);
+ sum[3] = aom_sdotq_s16(sum[3], s3, s3);
+
+ src += 32;
+ n -= 32;
+ } while (n != 0);
+
+ sum[0] = vaddq_s64(sum[0], sum[1]);
+ sum[2] = vaddq_s64(sum[2], sum[3]);
+ sum[0] = vaddq_s64(sum[0], sum[2]);
+ return vaddvq_s64(sum[0]);
+ }
+ return aom_sum_squares_i16_c(src, n);
+}
+
+static INLINE uint64_t aom_sum_sse_2d_i16_4xh_sve(const int16_t *src,
+ int stride, int height,
+ int *sum) {
+ int64x2_t sse = vdupq_n_s64(0);
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+
+ do {
+ int16x8_t s = vcombine_s16(vld1_s16(src), vld1_s16(src + stride));
+
+ sse = aom_sdotq_s16(sse, s, s);
+
+ sum_s32 = vpadalq_s16(sum_s32, s);
+
+ src += 2 * stride;
+ height -= 2;
+ } while (height != 0);
+
+ *sum += vaddvq_s32(sum_s32);
+ return vaddvq_s64(sse);
+}
+
+static INLINE uint64_t aom_sum_sse_2d_i16_8xh_sve(const int16_t *src,
+ int stride, int height,
+ int *sum) {
+ int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+ int32x4_t sum_acc[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ do {
+ int16x8_t s0 = vld1q_s16(src);
+ int16x8_t s1 = vld1q_s16(src + stride);
+
+ sse[0] = aom_sdotq_s16(sse[0], s0, s0);
+ sse[1] = aom_sdotq_s16(sse[1], s1, s1);
+
+ sum_acc[0] = vpadalq_s16(sum_acc[0], s0);
+ sum_acc[1] = vpadalq_s16(sum_acc[1], s1);
+
+ src += 2 * stride;
+ height -= 2;
+ } while (height != 0);
+
+ *sum += vaddvq_s32(vaddq_s32(sum_acc[0], sum_acc[1]));
+ return vaddvq_s64(vaddq_s64(sse[0], sse[1]));
+}
+
+static INLINE uint64_t aom_sum_sse_2d_i16_16xh_sve(const int16_t *src,
+ int stride, int width,
+ int height, int *sum) {
+ int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) };
+ int32x4_t sum_acc[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ do {
+ int w = 0;
+ do {
+ int16x8_t s0 = vld1q_s16(src + w);
+ int16x8_t s1 = vld1q_s16(src + w + 8);
+
+ sse[0] = aom_sdotq_s16(sse[0], s0, s0);
+ sse[1] = aom_sdotq_s16(sse[1], s1, s1);
+
+ sum_acc[0] = vpadalq_s16(sum_acc[0], s0);
+ sum_acc[1] = vpadalq_s16(sum_acc[1], s1);
+
+ w += 16;
+ } while (w < width);
+
+ src += stride;
+ } while (--height != 0);
+
+ *sum += vaddvq_s32(vaddq_s32(sum_acc[0], sum_acc[1]));
+ return vaddvq_s64(vaddq_s64(sse[0], sse[1]));
+}
+
+uint64_t aom_sum_sse_2d_i16_sve(const int16_t *src, int stride, int width,
+ int height, int *sum) {
+ uint64_t sse;
+
+ if (width == 4) {
+ sse = aom_sum_sse_2d_i16_4xh_sve(src, stride, height, sum);
+ } else if (width == 8) {
+ sse = aom_sum_sse_2d_i16_8xh_sve(src, stride, height, sum);
+ } else if (width % 16 == 0) {
+ sse = aom_sum_sse_2d_i16_16xh_sve(src, stride, width, height, sum);
+ } else {
+ sse = aom_sum_sse_2d_i16_c(src, stride, width, height, sum);
+ }
+
+ return sse;
+}
+
+static INLINE uint64_t aom_var_2d_u16_4xh_sve(uint8_t *src, int src_stride,
+ int width, int height) {
+ uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src);
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ uint32x4_t sum_u32 = vdupq_n_u32(0);
+ uint64x2_t sse_u64 = vdupq_n_u64(0);
+
+ int h = height;
+ do {
+ uint16x8_t s0 =
+ vcombine_u16(vld1_u16(src_u16), vld1_u16(src_u16 + src_stride));
+
+ sum_u32 = vpadalq_u16(sum_u32, s0);
+
+ sse_u64 = aom_udotq_u16(sse_u64, s0, s0);
+
+ src_u16 += 2 * src_stride;
+ h -= 2;
+ } while (h != 0);
+
+ sum += vaddlvq_u32(sum_u32);
+ sse += vaddvq_u64(sse_u64);
+
+ return sse - sum * sum / (width * height);
+}
+
+static INLINE uint64_t aom_var_2d_u16_8xh_sve(uint8_t *src, int src_stride,
+ int width, int height) {
+ uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src);
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ uint32x4_t sum_u32 = vdupq_n_u32(0);
+ uint64x2_t sse_u64 = vdupq_n_u64(0);
+
+ int h = height;
+ do {
+ int w = width;
+ uint16_t *src_ptr = src_u16;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr);
+
+ sum_u32 = vpadalq_u16(sum_u32, s0);
+
+ sse_u64 = aom_udotq_u16(sse_u64, s0, s0);
+
+ src_ptr += 8;
+ w -= 8;
+ } while (w != 0);
+
+ src_u16 += src_stride;
+ } while (--h != 0);
+
+ sum += vaddlvq_u32(sum_u32);
+ sse += vaddvq_u64(sse_u64);
+
+ return sse - sum * sum / (width * height);
+}
+
+static INLINE uint64_t aom_var_2d_u16_16xh_sve(uint8_t *src, int src_stride,
+ int width, int height) {
+ uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src);
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ uint32x4_t sum_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+ uint64x2_t sse_u64[2] = { vdupq_n_u64(0), vdupq_n_u64(0) };
+
+ int h = height;
+ do {
+ int w = width;
+ uint16_t *src_ptr = src_u16;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr);
+ uint16x8_t s1 = vld1q_u16(src_ptr + 8);
+
+ sum_u32[0] = vpadalq_u16(sum_u32[0], s0);
+ sum_u32[1] = vpadalq_u16(sum_u32[1], s1);
+
+ sse_u64[0] = aom_udotq_u16(sse_u64[0], s0, s0);
+ sse_u64[1] = aom_udotq_u16(sse_u64[1], s1, s1);
+
+ src_ptr += 16;
+ w -= 16;
+ } while (w != 0);
+
+ src_u16 += src_stride;
+ } while (--h != 0);
+
+ sum_u32[0] = vaddq_u32(sum_u32[0], sum_u32[1]);
+ sse_u64[0] = vaddq_u64(sse_u64[0], sse_u64[1]);
+
+ sum += vaddlvq_u32(sum_u32[0]);
+ sse += vaddvq_u64(sse_u64[0]);
+
+ return sse - sum * sum / (width * height);
+}
+
+static INLINE uint64_t aom_var_2d_u16_large_sve(uint8_t *src, int src_stride,
+ int width, int height) {
+ uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src);
+ uint64_t sum = 0;
+ uint64_t sse = 0;
+ uint32x4_t sum_u32[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0),
+ vdupq_n_u32(0) };
+ uint64x2_t sse_u64[4] = { vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0),
+ vdupq_n_u64(0) };
+
+ int h = height;
+ do {
+ int w = width;
+ uint16_t *src_ptr = src_u16;
+ do {
+ uint16x8_t s0 = vld1q_u16(src_ptr);
+ uint16x8_t s1 = vld1q_u16(src_ptr + 8);
+ uint16x8_t s2 = vld1q_u16(src_ptr + 16);
+ uint16x8_t s3 = vld1q_u16(src_ptr + 24);
+
+ sum_u32[0] = vpadalq_u16(sum_u32[0], s0);
+ sum_u32[1] = vpadalq_u16(sum_u32[1], s1);
+ sum_u32[2] = vpadalq_u16(sum_u32[2], s2);
+ sum_u32[3] = vpadalq_u16(sum_u32[3], s3);
+
+ sse_u64[0] = aom_udotq_u16(sse_u64[0], s0, s0);
+ sse_u64[1] = aom_udotq_u16(sse_u64[1], s1, s1);
+ sse_u64[2] = aom_udotq_u16(sse_u64[2], s2, s2);
+ sse_u64[3] = aom_udotq_u16(sse_u64[3], s3, s3);
+
+ src_ptr += 32;
+ w -= 32;
+ } while (w != 0);
+
+ src_u16 += src_stride;
+ } while (--h != 0);
+
+ sum_u32[0] = vaddq_u32(sum_u32[0], sum_u32[1]);
+ sum_u32[2] = vaddq_u32(sum_u32[2], sum_u32[3]);
+ sum_u32[0] = vaddq_u32(sum_u32[0], sum_u32[2]);
+ sse_u64[0] = vaddq_u64(sse_u64[0], sse_u64[1]);
+ sse_u64[2] = vaddq_u64(sse_u64[2], sse_u64[3]);
+ sse_u64[0] = vaddq_u64(sse_u64[0], sse_u64[2]);
+
+ sum += vaddlvq_u32(sum_u32[0]);
+ sse += vaddvq_u64(sse_u64[0]);
+
+ return sse - sum * sum / (width * height);
+}
+
+uint64_t aom_var_2d_u16_sve(uint8_t *src, int src_stride, int width,
+ int height) {
+ if (width == 4) {
+ return aom_var_2d_u16_4xh_sve(src, src_stride, width, height);
+ }
+ if (width == 8) {
+ return aom_var_2d_u16_8xh_sve(src, src_stride, width, height);
+ }
+ if (width == 16) {
+ return aom_var_2d_u16_16xh_sve(src, src_stride, width, height);
+ }
+ if (width % 32 == 0) {
+ return aom_var_2d_u16_large_sve(src, src_stride, width, height);
+ }
+ return aom_var_2d_u16_neon(src, src_stride, width, height);
+}
diff --git a/third_party/aom/aom_dsp/arm/transpose_neon.h b/third_party/aom/aom_dsp/arm/transpose_neon.h
new file mode 100644
index 0000000000..8027018235
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/transpose_neon.h
@@ -0,0 +1,1263 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
+#define AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
+
+#include <arm_neon.h>
+
+#include "aom/aom_integer.h" // For AOM_FORCE_INLINE.
+#include "config/aom_config.h"
+
+static INLINE void transpose_elems_u8_8x8(
+ uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, uint8x8_t a4,
+ uint8x8_t a5, uint8x8_t a6, uint8x8_t a7, uint8x8_t *o0, uint8x8_t *o1,
+ uint8x8_t *o2, uint8x8_t *o3, uint8x8_t *o4, uint8x8_t *o5, uint8x8_t *o6,
+ uint8x8_t *o7) {
+ // Swap 8 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16 40 50 42 52 44 54 46 56
+ // b0.val[1]: 01 11 03 13 05 15 07 17 41 51 43 53 45 55 47 57
+ // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76
+ // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77
+
+ const uint8x16x2_t b0 = vtrnq_u8(vcombine_u8(a0, a4), vcombine_u8(a1, a5));
+ const uint8x16x2_t b1 = vtrnq_u8(vcombine_u8(a2, a6), vcombine_u8(a3, a7));
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74
+ // c0.val[1]: 02 12 22 32 06 16 26 36 42 52 62 72 46 56 66 76
+ // c1.val[0]: 01 11 21 31 05 15 25 35 41 51 61 71 45 55 65 75
+ // c1.val[1]: 03 13 23 33 07 17 27 37 43 53 63 73 47 57 67 77
+
+ const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+ vreinterpretq_u16_u8(b1.val[0]));
+ const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+ vreinterpretq_u16_u8(b1.val[1]));
+
+ // Unzip 32 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ // d0.val[1]: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ // d1.val[0]: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // d1.val[1]: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ const uint32x4x2_t d0 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[0]),
+ vreinterpretq_u32_u16(c1.val[0]));
+ const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]),
+ vreinterpretq_u32_u16(c1.val[1]));
+
+ *o0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0]));
+ *o1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0]));
+ *o2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0]));
+ *o3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0]));
+ *o4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1]));
+ *o5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1]));
+ *o6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1]));
+ *o7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1]));
+}
+
+static INLINE void transpose_elems_inplace_u8_8x8(uint8x8_t *a0, uint8x8_t *a1,
+ uint8x8_t *a2, uint8x8_t *a3,
+ uint8x8_t *a4, uint8x8_t *a5,
+ uint8x8_t *a6,
+ uint8x8_t *a7) {
+ transpose_elems_u8_8x8(*a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7, a0, a1, a2, a3,
+ a4, a5, a6, a7);
+}
+
+static INLINE void transpose_arrays_u8_8x8(const uint8x8_t *in,
+ uint8x8_t *out) {
+ transpose_elems_u8_8x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
+ &out[0], &out[1], &out[2], &out[3], &out[4], &out[5],
+ &out[6], &out[7]);
+}
+
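+// Transpose 16 rows of 8 bytes (x[0..15]) into 8 rows of 16 bytes (d[0..7]).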
+static AOM_FORCE_INLINE void transpose_arrays_u8_8x16(const uint8x8_t *x,
+ uint8x16_t *d) {
+ uint8x8x2_t w0 = vzip_u8(x[0], x[1]);
+ uint8x8x2_t w1 = vzip_u8(x[2], x[3]);
+ uint8x8x2_t w2 = vzip_u8(x[4], x[5]);
+ uint8x8x2_t w3 = vzip_u8(x[6], x[7]);
+
+ uint8x8x2_t w8 = vzip_u8(x[8], x[9]);
+ uint8x8x2_t w9 = vzip_u8(x[10], x[11]);
+ uint8x8x2_t w10 = vzip_u8(x[12], x[13]);
+ uint8x8x2_t w11 = vzip_u8(x[14], x[15]);
+
+ uint16x4x2_t w4 =
+ vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]));
+ uint16x4x2_t w5 =
+ vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0]));
+ uint16x4x2_t w12 =
+ vzip_u16(vreinterpret_u16_u8(w8.val[0]), vreinterpret_u16_u8(w9.val[0]));
+ uint16x4x2_t w13 = vzip_u16(vreinterpret_u16_u8(w10.val[0]),
+ vreinterpret_u16_u8(w11.val[0]));
+
+ uint32x2x2_t w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
+ vreinterpret_u32_u16(w5.val[0]));
+ uint32x2x2_t w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
+ vreinterpret_u32_u16(w5.val[1]));
+ uint32x2x2_t w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
+ vreinterpret_u32_u16(w13.val[0]));
+ uint32x2x2_t w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
+ vreinterpret_u32_u16(w13.val[1]));
+
+ // Store first 4-line result
+ d[0] = vreinterpretq_u8_u32(vcombine_u32(w6.val[0], w14.val[0]));
+ d[1] = vreinterpretq_u8_u32(vcombine_u32(w6.val[1], w14.val[1]));
+ d[2] = vreinterpretq_u8_u32(vcombine_u32(w7.val[0], w15.val[0]));
+ d[3] = vreinterpretq_u8_u32(vcombine_u32(w7.val[1], w15.val[1]));
+
+ w4 = vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1]));
+ w5 = vzip_u16(vreinterpret_u16_u8(w2.val[1]), vreinterpret_u16_u8(w3.val[1]));
+ w12 =
+ vzip_u16(vreinterpret_u16_u8(w8.val[1]), vreinterpret_u16_u8(w9.val[1]));
+ w13 = vzip_u16(vreinterpret_u16_u8(w10.val[1]),
+ vreinterpret_u16_u8(w11.val[1]));
+
+ w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]),
+ vreinterpret_u32_u16(w5.val[0]));
+ w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]),
+ vreinterpret_u32_u16(w5.val[1]));
+ w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]),
+ vreinterpret_u32_u16(w13.val[0]));
+ w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]),
+ vreinterpret_u32_u16(w13.val[1]));
+
+ // Store second 4-line result
+ d[4] = vreinterpretq_u8_u32(vcombine_u32(w6.val[0], w14.val[0]));
+ d[5] = vreinterpretq_u8_u32(vcombine_u32(w6.val[1], w14.val[1]));
+ d[6] = vreinterpretq_u8_u32(vcombine_u32(w7.val[0], w15.val[0]));
+ d[7] = vreinterpretq_u8_u32(vcombine_u32(w7.val[1], w15.val[1]));
+}
+
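+// Transpose 8 rows of 16 bytes (x[0..7]) into 16 rows of 8 bytes (d[0..15]).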
+static AOM_FORCE_INLINE void transpose_arrays_u8_16x8(const uint8x16_t *x,
+ uint8x8_t *d) {
+ uint8x16x2_t w0 = vzipq_u8(x[0], x[1]);
+ uint8x16x2_t w1 = vzipq_u8(x[2], x[3]);
+ uint8x16x2_t w2 = vzipq_u8(x[4], x[5]);
+ uint8x16x2_t w3 = vzipq_u8(x[6], x[7]);
+
+ uint16x8x2_t w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
+ vreinterpretq_u16_u8(w1.val[0]));
+ uint16x8x2_t w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
+ vreinterpretq_u16_u8(w3.val[0]));
+ uint16x8x2_t w6 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
+ vreinterpretq_u16_u8(w1.val[1]));
+ uint16x8x2_t w7 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
+ vreinterpretq_u16_u8(w3.val[1]));
+
+ uint32x4x2_t w8 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]),
+ vreinterpretq_u32_u16(w5.val[0]));
+ uint32x4x2_t w9 = vzipq_u32(vreinterpretq_u32_u16(w6.val[0]),
+ vreinterpretq_u32_u16(w7.val[0]));
+ uint32x4x2_t w10 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]),
+ vreinterpretq_u32_u16(w5.val[1]));
+ uint32x4x2_t w11 = vzipq_u32(vreinterpretq_u32_u16(w6.val[1]),
+ vreinterpretq_u32_u16(w7.val[1]));
+
+ d[0] = vreinterpret_u8_u32(vget_low_u32(w8.val[0]));
+ d[1] = vreinterpret_u8_u32(vget_high_u32(w8.val[0]));
+ d[2] = vreinterpret_u8_u32(vget_low_u32(w8.val[1]));
+ d[3] = vreinterpret_u8_u32(vget_high_u32(w8.val[1]));
+ d[4] = vreinterpret_u8_u32(vget_low_u32(w10.val[0]));
+ d[5] = vreinterpret_u8_u32(vget_high_u32(w10.val[0]));
+ d[6] = vreinterpret_u8_u32(vget_low_u32(w10.val[1]));
+ d[7] = vreinterpret_u8_u32(vget_high_u32(w10.val[1]));
+ d[8] = vreinterpret_u8_u32(vget_low_u32(w9.val[0]));
+ d[9] = vreinterpret_u8_u32(vget_high_u32(w9.val[0]));
+ d[10] = vreinterpret_u8_u32(vget_low_u32(w9.val[1]));
+ d[11] = vreinterpret_u8_u32(vget_high_u32(w9.val[1]));
+ d[12] = vreinterpret_u8_u32(vget_low_u32(w11.val[0]));
+ d[13] = vreinterpret_u8_u32(vget_high_u32(w11.val[0]));
+ d[14] = vreinterpret_u8_u32(vget_low_u32(w11.val[1]));
+ d[15] = vreinterpret_u8_u32(vget_high_u32(w11.val[1]));
+}
+
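+// 2x2 transpose of the 64-bit halves of two vectors, with the result viewed
+// as uint16x8_t. AArch64 provides vtrn1q_u64/vtrn2q_u64 for this; on 32-bit
+// Arm the same result is built by recombining the low and high halves.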
+static INLINE uint16x8x2_t aom_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
+ uint16x8x2_t b0;
+#if AOM_ARCH_AARCH64
+ b0.val[0] = vreinterpretq_u16_u64(
+ vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
+ b0.val[1] = vreinterpretq_u16_u64(
+ vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
+#else
+ b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)),
+ vreinterpret_u16_u32(vget_low_u32(a1)));
+ b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)),
+ vreinterpret_u16_u32(vget_high_u32(a1)));
+#endif
+ return b0;
+}
+
+static INLINE void transpose_arrays_u8_16x16(const uint8x16_t *x,
+ uint8x16_t *d) {
+ uint8x16x2_t w0 = vzipq_u8(x[0], x[1]);
+ uint8x16x2_t w1 = vzipq_u8(x[2], x[3]);
+ uint8x16x2_t w2 = vzipq_u8(x[4], x[5]);
+ uint8x16x2_t w3 = vzipq_u8(x[6], x[7]);
+
+ uint8x16x2_t w4 = vzipq_u8(x[8], x[9]);
+ uint8x16x2_t w5 = vzipq_u8(x[10], x[11]);
+ uint8x16x2_t w6 = vzipq_u8(x[12], x[13]);
+ uint8x16x2_t w7 = vzipq_u8(x[14], x[15]);
+
+ uint16x8x2_t w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
+ vreinterpretq_u16_u8(w1.val[0]));
+ uint16x8x2_t w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]),
+ vreinterpretq_u16_u8(w3.val[0]));
+ uint16x8x2_t w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[0]),
+ vreinterpretq_u16_u8(w5.val[0]));
+ uint16x8x2_t w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[0]),
+ vreinterpretq_u16_u8(w7.val[0]));
+
+ uint32x4x2_t w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]),
+ vreinterpretq_u32_u16(w9.val[0]));
+ uint32x4x2_t w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]),
+ vreinterpretq_u32_u16(w11.val[0]));
+ uint32x4x2_t w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]),
+ vreinterpretq_u32_u16(w9.val[1]));
+ uint32x4x2_t w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
+ vreinterpretq_u32_u16(w11.val[1]));
+
+ uint16x8x2_t d01 = aom_vtrnq_u64_to_u16(w12.val[0], w13.val[0]);
+ d[0] = vreinterpretq_u8_u16(d01.val[0]);
+ d[1] = vreinterpretq_u8_u16(d01.val[1]);
+ uint16x8x2_t d23 = aom_vtrnq_u64_to_u16(w12.val[1], w13.val[1]);
+ d[2] = vreinterpretq_u8_u16(d23.val[0]);
+ d[3] = vreinterpretq_u8_u16(d23.val[1]);
+ uint16x8x2_t d45 = aom_vtrnq_u64_to_u16(w14.val[0], w15.val[0]);
+ d[4] = vreinterpretq_u8_u16(d45.val[0]);
+ d[5] = vreinterpretq_u8_u16(d45.val[1]);
+ uint16x8x2_t d67 = aom_vtrnq_u64_to_u16(w14.val[1], w15.val[1]);
+ d[6] = vreinterpretq_u8_u16(d67.val[0]);
+ d[7] = vreinterpretq_u8_u16(d67.val[1]);
+
+ // upper half
+ w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
+ vreinterpretq_u16_u8(w1.val[1]));
+ w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]),
+ vreinterpretq_u16_u8(w3.val[1]));
+ w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[1]),
+ vreinterpretq_u16_u8(w5.val[1]));
+ w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[1]),
+ vreinterpretq_u16_u8(w7.val[1]));
+
+ w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]),
+ vreinterpretq_u32_u16(w9.val[0]));
+ w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]),
+ vreinterpretq_u32_u16(w11.val[0]));
+ w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]),
+ vreinterpretq_u32_u16(w9.val[1]));
+ w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]),
+ vreinterpretq_u32_u16(w11.val[1]));
+
+ d01 = aom_vtrnq_u64_to_u16(w12.val[0], w13.val[0]);
+ d[8] = vreinterpretq_u8_u16(d01.val[0]);
+ d[9] = vreinterpretq_u8_u16(d01.val[1]);
+ d23 = aom_vtrnq_u64_to_u16(w12.val[1], w13.val[1]);
+ d[10] = vreinterpretq_u8_u16(d23.val[0]);
+ d[11] = vreinterpretq_u8_u16(d23.val[1]);
+ d45 = aom_vtrnq_u64_to_u16(w14.val[0], w15.val[0]);
+ d[12] = vreinterpretq_u8_u16(d45.val[0]);
+ d[13] = vreinterpretq_u8_u16(d45.val[1]);
+ d67 = aom_vtrnq_u64_to_u16(w14.val[1], w15.val[1]);
+ d[14] = vreinterpretq_u8_u16(d67.val[0]);
+ d[15] = vreinterpretq_u8_u16(d67.val[1]);
+}
+
+static AOM_FORCE_INLINE void transpose_arrays_u8_32x16(const uint8x16x2_t *x,
+ uint8x16_t *d) {
+ uint8x16_t x2[32];
+ for (int i = 0; i < 16; ++i) {
+ x2[i] = x[i].val[0];
+ x2[i + 16] = x[i].val[1];
+ }
+ transpose_arrays_u8_16x16(x2, d);
+ transpose_arrays_u8_16x16(x2 + 16, d + 16);
+}
+
+static INLINE void transpose_elems_inplace_u8_8x4(uint8x8_t *a0, uint8x8_t *a1,
+ uint8x8_t *a2,
+ uint8x8_t *a3) {
+ // Swap 8 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+
+ const uint8x8x2_t b0 = vtrn_u8(*a0, *a1);
+ const uint8x8x2_t b1 = vtrn_u8(*a2, *a3);
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+
+ const uint16x4x2_t c0 =
+ vtrn_u16(vreinterpret_u16_u8(b0.val[0]), vreinterpret_u16_u8(b1.val[0]));
+ const uint16x4x2_t c1 =
+ vtrn_u16(vreinterpret_u16_u8(b0.val[1]), vreinterpret_u16_u8(b1.val[1]));
+
+ *a0 = vreinterpret_u8_u16(c0.val[0]);
+ *a1 = vreinterpret_u8_u16(c1.val[0]);
+ *a2 = vreinterpret_u8_u16(c0.val[1]);
+ *a3 = vreinterpret_u8_u16(c1.val[1]);
+}
+
+static INLINE void transpose_elems_inplace_u8_4x4(uint8x8_t *a0,
+ uint8x8_t *a1) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 10 11 12 13
+ // a1: 20 21 22 23 30 31 32 33
+ // to:
+ // b0.val[0]: 00 01 20 21 10 11 30 31
+ // b0.val[1]: 02 03 22 23 12 13 32 33
+
+ const uint16x4x2_t b0 =
+ vtrn_u16(vreinterpret_u16_u8(*a0), vreinterpret_u16_u8(*a1));
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 01 20 21 02 03 22 23
+ // c0.val[1]: 10 11 30 31 12 13 32 33
+
+ const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
+ vreinterpret_u32_u16(b0.val[1]));
+
+ // Swap 8 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 02 12 22 32
+ // d0.val[1]: 01 11 21 31 03 13 23 33
+
+ const uint8x8x2_t d0 =
+ vtrn_u8(vreinterpret_u8_u32(c0.val[0]), vreinterpret_u8_u32(c0.val[1]));
+
+ *a0 = d0.val[0];
+ *a1 = d0.val[1];
+}
+
+static INLINE void transpose_elems_u8_4x8(uint8x8_t a0, uint8x8_t a1,
+ uint8x8_t a2, uint8x8_t a3,
+ uint8x8_t a4, uint8x8_t a5,
+ uint8x8_t a6, uint8x8_t a7,
+ uint8x8_t *o0, uint8x8_t *o1,
+ uint8x8_t *o2, uint8x8_t *o3) {
+ // Swap 32 bit elements. Goes from:
+ // a0: 00 01 02 03 XX XX XX XX
+ // a1: 10 11 12 13 XX XX XX XX
+ // a2: 20 21 22 23 XX XX XX XX
+  // a3: 30 31 32 33 XX XX XX XX
+ // a4: 40 41 42 43 XX XX XX XX
+ // a5: 50 51 52 53 XX XX XX XX
+ // a6: 60 61 62 63 XX XX XX XX
+ // a7: 70 71 72 73 XX XX XX XX
+ // to:
+ // b0.val[0]: 00 01 02 03 40 41 42 43
+ // b1.val[0]: 10 11 12 13 50 51 52 53
+ // b2.val[0]: 20 21 22 23 60 61 62 63
+ // b3.val[0]: 30 31 32 33 70 71 72 73
+
+ const uint32x2x2_t b0 =
+ vtrn_u32(vreinterpret_u32_u8(a0), vreinterpret_u32_u8(a4));
+ const uint32x2x2_t b1 =
+ vtrn_u32(vreinterpret_u32_u8(a1), vreinterpret_u32_u8(a5));
+ const uint32x2x2_t b2 =
+ vtrn_u32(vreinterpret_u32_u8(a2), vreinterpret_u32_u8(a6));
+ const uint32x2x2_t b3 =
+ vtrn_u32(vreinterpret_u32_u8(a3), vreinterpret_u32_u8(a7));
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 01 20 21 40 41 60 61
+ // c0.val[1]: 02 03 22 23 42 43 62 63
+ // c1.val[0]: 10 11 30 31 50 51 70 71
+ // c1.val[1]: 12 13 32 33 52 53 72 73
+
+ const uint16x4x2_t c0 = vtrn_u16(vreinterpret_u16_u32(b0.val[0]),
+ vreinterpret_u16_u32(b2.val[0]));
+ const uint16x4x2_t c1 = vtrn_u16(vreinterpret_u16_u32(b1.val[0]),
+ vreinterpret_u16_u32(b3.val[0]));
+
+ // Swap 8 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 01 11 21 31 41 51 61 71
+ // d1.val[0]: 02 12 22 32 42 52 62 72
+ // d1.val[1]: 03 13 23 33 43 53 63 73
+
+ const uint8x8x2_t d0 =
+ vtrn_u8(vreinterpret_u8_u16(c0.val[0]), vreinterpret_u8_u16(c1.val[0]));
+ const uint8x8x2_t d1 =
+ vtrn_u8(vreinterpret_u8_u16(c0.val[1]), vreinterpret_u8_u16(c1.val[1]));
+
+ *o0 = d0.val[0];
+ *o1 = d0.val[1];
+ *o2 = d1.val[0];
+ *o3 = d1.val[1];
+}
+
+static INLINE void transpose_array_inplace_u16_4x4(uint16x4_t a[4]) {
+ // Input:
+ // 00 01 02 03
+ // 10 11 12 13
+ // 20 21 22 23
+ // 30 31 32 33
+
+ // b:
+ // 00 10 02 12
+ // 01 11 03 13
+ const uint16x4x2_t b = vtrn_u16(a[0], a[1]);
+ // c:
+ // 20 30 22 32
+ // 21 31 23 33
+ const uint16x4x2_t c = vtrn_u16(a[2], a[3]);
+ // d:
+ // 00 10 20 30
+ // 02 12 22 32
+ const uint32x2x2_t d =
+ vtrn_u32(vreinterpret_u32_u16(b.val[0]), vreinterpret_u32_u16(c.val[0]));
+ // e:
+ // 01 11 21 31
+ // 03 13 23 33
+ const uint32x2x2_t e =
+ vtrn_u32(vreinterpret_u32_u16(b.val[1]), vreinterpret_u32_u16(c.val[1]));
+
+ // Output:
+ // 00 10 20 30
+ // 01 11 21 31
+ // 02 12 22 32
+ // 03 13 23 33
+ a[0] = vreinterpret_u16_u32(d.val[0]);
+ a[1] = vreinterpret_u16_u32(e.val[0]);
+ a[2] = vreinterpret_u16_u32(d.val[1]);
+ a[3] = vreinterpret_u16_u32(e.val[1]);
+}
+
+static INLINE void transpose_array_inplace_u16_4x8(uint16x8_t a[4]) {
+ // 4x8 Input:
+ // a[0]: 00 01 02 03 04 05 06 07
+ // a[1]: 10 11 12 13 14 15 16 17
+ // a[2]: 20 21 22 23 24 25 26 27
+ // a[3]: 30 31 32 33 34 35 36 37
+
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
+ const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);
+
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
+ vreinterpretq_u32_u16(b1.val[0]));
+ const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
+ vreinterpretq_u32_u16(b1.val[1]));
+
+ // 8x4 Output:
+ // a[0]: 00 10 20 30 04 14 24 34
+ // a[1]: 01 11 21 31 05 15 25 35
+ // a[2]: 02 12 22 32 06 16 26 36
+ // a[3]: 03 13 23 33 07 17 27 37
+ a[0] = vreinterpretq_u16_u32(c0.val[0]);
+ a[1] = vreinterpretq_u16_u32(c1.val[0]);
+ a[2] = vreinterpretq_u16_u32(c0.val[1]);
+ a[3] = vreinterpretq_u16_u32(c1.val[1]);
+}
+
+// Special transpose for loop filter.
+// 4x8 Input:
+// p_q: p3 p2 p1 p0 q0 q1 q2 q3
+// a[0]: 00 01 02 03 04 05 06 07
+// a[1]: 10 11 12 13 14 15 16 17
+// a[2]: 20 21 22 23 24 25 26 27
+// a[3]: 30 31 32 33 34 35 36 37
+// 8x4 Output:
+// a[0]: 03 13 23 33 04 14 24 34 p0q0
+// a[1]: 02 12 22 32 05 15 25 35 p1q1
+// a[2]: 01 11 21 31 06 16 26 36 p2q2
+// a[3]: 00 10 20 30 07 17 27 37 p3q3
+// Direct reapplication of the function will reset the high halves, but
+// reverse the low halves:
+// p_q: p0 p1 p2 p3 q0 q1 q2 q3
+// a[0]: 33 32 31 30 04 05 06 07
+// a[1]: 23 22 21 20 14 15 16 17
+// a[2]: 13 12 11 10 24 25 26 27
+// a[3]: 03 02 01 00 34 35 36 37
+// Simply reordering the inputs (3, 2, 1, 0) will reset the low halves, but
+// reverse the high halves.
+// The standard transpose_array_inplace_u16_4x8 will produce the same
+// reversals, but with the order of the low halves also restored relative to
+// the high halves. This is preferable because it puts all values from the same
+// source row back together, but some post-processing is inevitable.
+static INLINE void loop_filter_transpose_u16_4x8q(uint16x8_t a[4]) {
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ const uint16x8x2_t b0 = vtrnq_u16(a[0], a[1]);
+ const uint16x8x2_t b1 = vtrnq_u16(a[2], a[3]);
+
+ // Reverse odd vectors to bring the appropriate items to the front of zips.
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // r0 : 03 13 01 11 07 17 05 15
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // r1 : 23 33 21 31 27 37 25 35
+ const uint32x4_t r0 = vrev64q_u32(vreinterpretq_u32_u16(b0.val[1]));
+ const uint32x4_t r1 = vrev64q_u32(vreinterpretq_u32_u16(b1.val[1]));
+
+ // Zip to complete the halves.
+ // c0.val[0]: 00 10 20 30 02 12 22 32 p3p1
+ // c0.val[1]: 04 14 24 34 06 16 26 36 q0q2
+ // c1.val[0]: 03 13 23 33 01 11 21 31 p0p2
+ // c1.val[1]: 07 17 27 37 05 15 25 35 q3q1
+ const uint32x4x2_t c0 = vzipq_u32(vreinterpretq_u32_u16(b0.val[0]),
+ vreinterpretq_u32_u16(b1.val[0]));
+ const uint32x4x2_t c1 = vzipq_u32(r0, r1);
+
+ // d0.val[0]: 00 10 20 30 07 17 27 37 p3q3
+ // d0.val[1]: 02 12 22 32 05 15 25 35 p1q1
+ // d1.val[0]: 03 13 23 33 04 14 24 34 p0q0
+ // d1.val[1]: 01 11 21 31 06 16 26 36 p2q2
+ const uint16x8x2_t d0 = aom_vtrnq_u64_to_u16(c0.val[0], c1.val[1]);
+ // The third row of c comes first here to swap p2 with q0.
+ const uint16x8x2_t d1 = aom_vtrnq_u64_to_u16(c1.val[0], c0.val[1]);
+
+ // 8x4 Output:
+ // a[0]: 03 13 23 33 04 14 24 34 p0q0
+ // a[1]: 02 12 22 32 05 15 25 35 p1q1
+ // a[2]: 01 11 21 31 06 16 26 36 p2q2
+ // a[3]: 00 10 20 30 07 17 27 37 p3q3
+ a[0] = d1.val[0]; // p0q0
+ a[1] = d0.val[1]; // p1q1
+ a[2] = d1.val[1]; // p2q2
+ a[3] = d0.val[0]; // p3q3
+}
+
+static INLINE void transpose_elems_u16_4x8(
+ const uint16x4_t a0, const uint16x4_t a1, const uint16x4_t a2,
+ const uint16x4_t a3, const uint16x4_t a4, const uint16x4_t a5,
+ const uint16x4_t a6, const uint16x4_t a7, uint16x8_t *o0, uint16x8_t *o1,
+ uint16x8_t *o2, uint16x8_t *o3) {
+ // Combine rows. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // a4: 40 41 42 43
+ // a5: 50 51 52 53
+ // a6: 60 61 62 63
+ // a7: 70 71 72 73
+ // to:
+ // b0: 00 01 02 03 40 41 42 43
+ // b1: 10 11 12 13 50 51 52 53
+ // b2: 20 21 22 23 60 61 62 63
+ // b3: 30 31 32 33 70 71 72 73
+
+ const uint16x8_t b0 = vcombine_u16(a0, a4);
+ const uint16x8_t b1 = vcombine_u16(a1, a5);
+ const uint16x8_t b2 = vcombine_u16(a2, a6);
+ const uint16x8_t b3 = vcombine_u16(a3, a7);
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 10 02 12 40 50 42 52
+ // c0.val[1]: 01 11 03 13 41 51 43 53
+ // c1.val[0]: 20 30 22 32 60 70 62 72
+ // c1.val[1]: 21 31 23 33 61 71 63 73
+
+ const uint16x8x2_t c0 = vtrnq_u16(b0, b1);
+ const uint16x8x2_t c1 = vtrnq_u16(b2, b3);
+
+ // Swap 32 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 02 12 22 32 42 52 62 72
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 03 13 23 33 43 53 63 73
+
+ const uint32x4x2_t d0 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[0]),
+ vreinterpretq_u32_u16(c1.val[0]));
+ const uint32x4x2_t d1 = vtrnq_u32(vreinterpretq_u32_u16(c0.val[1]),
+ vreinterpretq_u32_u16(c1.val[1]));
+
+ *o0 = vreinterpretq_u16_u32(d0.val[0]);
+ *o1 = vreinterpretq_u16_u32(d1.val[0]);
+ *o2 = vreinterpretq_u16_u32(d0.val[1]);
+ *o3 = vreinterpretq_u16_u32(d1.val[1]);
+}
+
+static INLINE void transpose_elems_s16_4x8(
+ const int16x4_t a0, const int16x4_t a1, const int16x4_t a2,
+ const int16x4_t a3, const int16x4_t a4, const int16x4_t a5,
+ const int16x4_t a6, const int16x4_t a7, int16x8_t *o0, int16x8_t *o1,
+ int16x8_t *o2, int16x8_t *o3) {
+ // Combine rows. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // a4: 40 41 42 43
+ // a5: 50 51 52 53
+ // a6: 60 61 62 63
+ // a7: 70 71 72 73
+ // to:
+ // b0: 00 01 02 03 40 41 42 43
+ // b1: 10 11 12 13 50 51 52 53
+ // b2: 20 21 22 23 60 61 62 63
+ // b3: 30 31 32 33 70 71 72 73
+
+ const int16x8_t b0 = vcombine_s16(a0, a4);
+ const int16x8_t b1 = vcombine_s16(a1, a5);
+ const int16x8_t b2 = vcombine_s16(a2, a6);
+ const int16x8_t b3 = vcombine_s16(a3, a7);
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 10 02 12 40 50 42 52
+ // c0.val[1]: 01 11 03 13 41 51 43 53
+ // c1.val[0]: 20 30 22 32 60 70 62 72
+ // c1.val[1]: 21 31 23 33 61 71 63 73
+
+ const int16x8x2_t c0 = vtrnq_s16(b0, b1);
+ const int16x8x2_t c1 = vtrnq_s16(b2, b3);
+
+ // Swap 32 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 02 12 22 32 42 52 62 72
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 03 13 23 33 43 53 63 73
+
+ const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
+ vreinterpretq_s32_s16(c1.val[0]));
+ const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
+ vreinterpretq_s32_s16(c1.val[1]));
+
+ *o0 = vreinterpretq_s16_s32(d0.val[0]);
+ *o1 = vreinterpretq_s16_s32(d1.val[0]);
+ *o2 = vreinterpretq_s16_s32(d0.val[1]);
+ *o3 = vreinterpretq_s16_s32(d1.val[1]);
+}
+
+static INLINE void transpose_elems_inplace_u16_8x8(
+ uint16x8_t *a0, uint16x8_t *a1, uint16x8_t *a2, uint16x8_t *a3,
+ uint16x8_t *a4, uint16x8_t *a5, uint16x8_t *a6, uint16x8_t *a7) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ // b2.val[0]: 40 50 42 52 44 54 46 56
+ // b2.val[1]: 41 51 43 53 45 55 47 57
+ // b3.val[0]: 60 70 62 72 64 74 66 76
+ // b3.val[1]: 61 71 63 73 65 75 67 77
+
+ const uint16x8x2_t b0 = vtrnq_u16(*a0, *a1);
+ const uint16x8x2_t b1 = vtrnq_u16(*a2, *a3);
+ const uint16x8x2_t b2 = vtrnq_u16(*a4, *a5);
+ const uint16x8x2_t b3 = vtrnq_u16(*a6, *a7);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ // c2.val[0]: 40 50 60 70 44 54 64 74
+ // c2.val[1]: 42 52 62 72 46 56 66 76
+ // c3.val[0]: 41 51 61 71 45 55 65 75
+ // c3.val[1]: 43 53 63 73 47 57 67 77
+
+ const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[0]),
+ vreinterpretq_u32_u16(b1.val[0]));
+ const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_u16(b0.val[1]),
+ vreinterpretq_u32_u16(b1.val[1]));
+ const uint32x4x2_t c2 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[0]),
+ vreinterpretq_u32_u16(b3.val[0]));
+ const uint32x4x2_t c3 = vtrnq_u32(vreinterpretq_u32_u16(b2.val[1]),
+ vreinterpretq_u32_u16(b3.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 04 14 24 34 44 54 64 74
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 05 15 25 35 45 55 65 75
+ // d2.val[0]: 02 12 22 32 42 52 62 72
+ // d2.val[1]: 06 16 26 36 46 56 66 76
+ // d3.val[0]: 03 13 23 33 43 53 63 73
+ // d3.val[1]: 07 17 27 37 47 57 67 77
+
+ const uint16x8x2_t d0 = aom_vtrnq_u64_to_u16(c0.val[0], c2.val[0]);
+ const uint16x8x2_t d1 = aom_vtrnq_u64_to_u16(c1.val[0], c3.val[0]);
+ const uint16x8x2_t d2 = aom_vtrnq_u64_to_u16(c0.val[1], c2.val[1]);
+ const uint16x8x2_t d3 = aom_vtrnq_u64_to_u16(c1.val[1], c3.val[1]);
+
+ *a0 = d0.val[0];
+ *a1 = d1.val[0];
+ *a2 = d2.val[0];
+ *a3 = d3.val[0];
+ *a4 = d0.val[1];
+ *a5 = d1.val[1];
+ *a6 = d2.val[1];
+ *a7 = d3.val[1];
+}
+
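+// Transpose the 64-bit halves of two vectors: val[0] takes the low halves and
+// val[1] the high halves. vtrn1q_s64/vtrn2q_s64 exist only on AArch64, so the
+// Armv7 path rebuilds the results with vcombine.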
+static INLINE int16x8x2_t aom_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
+ int16x8x2_t b0;
+#if AOM_ARCH_AARCH64
+ b0.val[0] = vreinterpretq_s16_s64(
+ vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
+ b0.val[1] = vreinterpretq_s16_s64(
+ vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
+#else
+ b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)),
+ vreinterpret_s16_s32(vget_low_s32(a1)));
+ b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)),
+ vreinterpret_s16_s32(vget_high_s32(a1)));
+#endif
+ return b0;
+}
+
+static INLINE void transpose_elems_inplace_s16_8x8(int16x8_t *a0, int16x8_t *a1,
+ int16x8_t *a2, int16x8_t *a3,
+ int16x8_t *a4, int16x8_t *a5,
+ int16x8_t *a6,
+ int16x8_t *a7) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ // b2.val[0]: 40 50 42 52 44 54 46 56
+ // b2.val[1]: 41 51 43 53 45 55 47 57
+ // b3.val[0]: 60 70 62 72 64 74 66 76
+ // b3.val[1]: 61 71 63 73 65 75 67 77
+
+ const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
+ const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
+ const int16x8x2_t b2 = vtrnq_s16(*a4, *a5);
+ const int16x8x2_t b3 = vtrnq_s16(*a6, *a7);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ // c2.val[0]: 40 50 60 70 44 54 64 74
+ // c2.val[1]: 42 52 62 72 46 56 66 76
+ // c3.val[0]: 41 51 61 71 45 55 65 75
+ // c3.val[1]: 43 53 63 73 47 57 67 77
+
+ const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+ vreinterpretq_s32_s16(b1.val[0]));
+ const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+ vreinterpretq_s32_s16(b1.val[1]));
+ const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+ vreinterpretq_s32_s16(b3.val[0]));
+ const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
+ vreinterpretq_s32_s16(b3.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 04 14 24 34 44 54 64 74
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 05 15 25 35 45 55 65 75
+ // d2.val[0]: 02 12 22 32 42 52 62 72
+ // d2.val[1]: 06 16 26 36 46 56 66 76
+ // d3.val[0]: 03 13 23 33 43 53 63 73
+ // d3.val[1]: 07 17 27 37 47 57 67 77
+
+ const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
+ const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
+ const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
+ const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);
+
+ *a0 = d0.val[0];
+ *a1 = d1.val[0];
+ *a2 = d2.val[0];
+ *a3 = d3.val[0];
+ *a4 = d0.val[1];
+ *a5 = d1.val[1];
+ *a6 = d2.val[1];
+ *a7 = d3.val[1];
+}
+
+static INLINE void transpose_arrays_s16_8x8(const int16x8_t *a,
+ int16x8_t *out) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // a4: 40 41 42 43 44 45 46 47
+ // a5: 50 51 52 53 54 55 56 57
+ // a6: 60 61 62 63 64 65 66 67
+ // a7: 70 71 72 73 74 75 76 77
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+ // b2.val[0]: 40 50 42 52 44 54 46 56
+ // b2.val[1]: 41 51 43 53 45 55 47 57
+ // b3.val[0]: 60 70 62 72 64 74 66 76
+ // b3.val[1]: 61 71 63 73 65 75 67 77
+
+ const int16x8x2_t b0 = vtrnq_s16(a[0], a[1]);
+ const int16x8x2_t b1 = vtrnq_s16(a[2], a[3]);
+ const int16x8x2_t b2 = vtrnq_s16(a[4], a[5]);
+ const int16x8x2_t b3 = vtrnq_s16(a[6], a[7]);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+ // c2.val[0]: 40 50 60 70 44 54 64 74
+ // c2.val[1]: 42 52 62 72 46 56 66 76
+ // c3.val[0]: 41 51 61 71 45 55 65 75
+ // c3.val[1]: 43 53 63 73 47 57 67 77
+
+ const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+ vreinterpretq_s32_s16(b1.val[0]));
+ const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+ vreinterpretq_s32_s16(b1.val[1]));
+ const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]),
+ vreinterpretq_s32_s16(b3.val[0]));
+ const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]),
+ vreinterpretq_s32_s16(b3.val[1]));
+
+ // Swap 64 bit elements resulting in:
+ // d0.val[0]: 00 10 20 30 40 50 60 70
+ // d0.val[1]: 04 14 24 34 44 54 64 74
+ // d1.val[0]: 01 11 21 31 41 51 61 71
+ // d1.val[1]: 05 15 25 35 45 55 65 75
+ // d2.val[0]: 02 12 22 32 42 52 62 72
+ // d2.val[1]: 06 16 26 36 46 56 66 76
+ // d3.val[0]: 03 13 23 33 43 53 63 73
+ // d3.val[1]: 07 17 27 37 47 57 67 77
+
+ const int16x8x2_t d0 = aom_vtrnq_s64_to_s16(c0.val[0], c2.val[0]);
+ const int16x8x2_t d1 = aom_vtrnq_s64_to_s16(c1.val[0], c3.val[0]);
+ const int16x8x2_t d2 = aom_vtrnq_s64_to_s16(c0.val[1], c2.val[1]);
+ const int16x8x2_t d3 = aom_vtrnq_s64_to_s16(c1.val[1], c3.val[1]);
+
+ out[0] = d0.val[0];
+ out[1] = d1.val[0];
+ out[2] = d2.val[0];
+ out[3] = d3.val[0];
+ out[4] = d0.val[1];
+ out[5] = d1.val[1];
+ out[6] = d2.val[1];
+ out[7] = d3.val[1];
+}
+
+static INLINE void transpose_elems_inplace_u16_4x4(uint16x4_t *a0,
+ uint16x4_t *a1,
+ uint16x4_t *a2,
+ uint16x4_t *a3) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 20 30 22 32
+ // b1.val[1]: 21 31 23 33
+
+ const uint16x4x2_t b0 = vtrn_u16(*a0, *a1);
+ const uint16x4x2_t b1 = vtrn_u16(*a2, *a3);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 02 12 22 32
+ // c1.val[0]: 01 11 21 31
+ // c1.val[1]: 03 13 23 33
+
+ const uint32x2x2_t c0 = vtrn_u32(vreinterpret_u32_u16(b0.val[0]),
+ vreinterpret_u32_u16(b1.val[0]));
+ const uint32x2x2_t c1 = vtrn_u32(vreinterpret_u32_u16(b0.val[1]),
+ vreinterpret_u32_u16(b1.val[1]));
+
+ *a0 = vreinterpret_u16_u32(c0.val[0]);
+ *a1 = vreinterpret_u16_u32(c1.val[0]);
+ *a2 = vreinterpret_u16_u32(c0.val[1]);
+ *a3 = vreinterpret_u16_u32(c1.val[1]);
+}
+
+static INLINE void transpose_elems_inplace_s16_4x4(int16x4_t *a0, int16x4_t *a1,
+ int16x4_t *a2,
+ int16x4_t *a3) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 20 30 22 32
+ // b1.val[1]: 21 31 23 33
+
+ const int16x4x2_t b0 = vtrn_s16(*a0, *a1);
+ const int16x4x2_t b1 = vtrn_s16(*a2, *a3);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 02 12 22 32
+ // c1.val[0]: 01 11 21 31
+ // c1.val[1]: 03 13 23 33
+
+ const int32x2x2_t c0 = vtrn_s32(vreinterpret_s32_s16(b0.val[0]),
+ vreinterpret_s32_s16(b1.val[0]));
+ const int32x2x2_t c1 = vtrn_s32(vreinterpret_s32_s16(b0.val[1]),
+ vreinterpret_s32_s16(b1.val[1]));
+
+ *a0 = vreinterpret_s16_s32(c0.val[0]);
+ *a1 = vreinterpret_s16_s32(c1.val[0]);
+ *a2 = vreinterpret_s16_s32(c0.val[1]);
+ *a3 = vreinterpret_s16_s32(c1.val[1]);
+}
+
+static INLINE int32x4x2_t aom_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
+ int32x4x2_t b0;
+#if AOM_ARCH_AARCH64
+ b0.val[0] = vreinterpretq_s32_s64(
+ vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
+ b0.val[1] = vreinterpretq_s32_s64(
+ vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
+#else
+ b0.val[0] = vcombine_s32(vget_low_s32(a0), vget_low_s32(a1));
+ b0.val[1] = vcombine_s32(vget_high_s32(a0), vget_high_s32(a1));
+#endif
+ return b0;
+}
+
+static INLINE void transpose_elems_s32_4x4(const int32x4_t a0,
+ const int32x4_t a1,
+ const int32x4_t a2,
+ const int32x4_t a3, int32x4_t *o0,
+ int32x4_t *o1, int32x4_t *o2,
+ int32x4_t *o3) {
+ // Swap 32 bit elements. Goes from:
+ // a0: 00 01 02 03
+ // a1: 10 11 12 13
+ // a2: 20 21 22 23
+ // a3: 30 31 32 33
+ // to:
+ // b0.val[0]: 00 10 02 12
+ // b0.val[1]: 01 11 03 13
+ // b1.val[0]: 20 30 22 32
+ // b1.val[1]: 21 31 23 33
+
+ const int32x4x2_t b0 = vtrnq_s32(a0, a1);
+ const int32x4x2_t b1 = vtrnq_s32(a2, a3);
+
+ // Swap 64 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30
+ // c0.val[1]: 02 12 22 32
+ // c1.val[0]: 01 11 21 31
+ // c1.val[1]: 03 13 23 33
+
+ const int32x4x2_t c0 = aom_vtrnq_s64_to_s32(b0.val[0], b1.val[0]);
+ const int32x4x2_t c1 = aom_vtrnq_s64_to_s32(b0.val[1], b1.val[1]);
+
+ *o0 = c0.val[0];
+ *o1 = c1.val[0];
+ *o2 = c0.val[1];
+ *o3 = c1.val[1];
+}
+
+static INLINE void transpose_elems_inplace_s32_4x4(int32x4_t *a0, int32x4_t *a1,
+ int32x4_t *a2,
+ int32x4_t *a3) {
+ transpose_elems_s32_4x4(*a0, *a1, *a2, *a3, a0, a1, a2, a3);
+}
+
+static INLINE void transpose_arrays_s32_4x4(const int32x4_t *in,
+ int32x4_t *out) {
+ transpose_elems_s32_4x4(in[0], in[1], in[2], in[3], &out[0], &out[1], &out[2],
+ &out[3]);
+}
+
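+// Transpose a matrix of 32-bit elements held as 4-column strips of int32x4_t
+// row vectors, by transposing each 4x4 tile: tile (i, j) of the input becomes
+// tile (j, i) of the output.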
+static AOM_FORCE_INLINE void transpose_arrays_s32_4nx4n(const int32x4_t *in,
+ int32x4_t *out,
+ const int width,
+ const int height) {
+ const int h = height >> 2;
+ const int w = width >> 2;
+ for (int j = 0; j < w; j++) {
+ for (int i = 0; i < h; i++) {
+ transpose_arrays_s32_4x4(in + j * height + i * 4,
+ out + i * width + j * 4);
+ }
+ }
+}
+
+#define TRANSPOSE_ARRAYS_S32_WXH_NEON(w, h) \
+ static AOM_FORCE_INLINE void transpose_arrays_s32_##w##x##h( \
+ const int32x4_t *in, int32x4_t *out) { \
+ transpose_arrays_s32_4nx4n(in, out, w, h); \
+ }
+
+TRANSPOSE_ARRAYS_S32_WXH_NEON(4, 8)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(4, 16)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 4)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 8)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 16)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(8, 32)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 8)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 16)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 32)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(16, 64)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 8)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 16)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 32)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(32, 64)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(64, 16)
+TRANSPOSE_ARRAYS_S32_WXH_NEON(64, 32)
+
+#undef TRANSPOSE_ARRAYS_S32_WXH_NEON
+
+static INLINE int64x2_t aom_vtrn1q_s64(int64x2_t a, int64x2_t b) {
+#if AOM_ARCH_AARCH64
+ return vtrn1q_s64(a, b);
+#else
+ return vcombine_s64(vget_low_s64(a), vget_low_s64(b));
+#endif
+}
+
+static INLINE int64x2_t aom_vtrn2q_s64(int64x2_t a, int64x2_t b) {
+#if AOM_ARCH_AARCH64
+ return vtrn2q_s64(a, b);
+#else
+ return vcombine_s64(vget_high_s64(a), vget_high_s64(b));
+#endif
+}
+
+static INLINE void transpose_elems_s32_4x8(int32x4_t a0, int32x4_t a1,
+ int32x4_t a2, int32x4_t a3,
+ int32x4_t a4, int32x4_t a5,
+ int32x4_t a6, int32x4_t a7,
+ int32x4x2_t *o0, int32x4x2_t *o1,
+ int32x4x2_t *o2, int32x4x2_t *o3) {
+ // Perform a 4 x 8 matrix transpose by building on top of the existing 4 x 4
+ // matrix transpose implementation:
+ // [ A ]^T => [ A^T B^T ]
+ // [ B ]
+
+ transpose_elems_inplace_s32_4x4(&a0, &a1, &a2, &a3); // A^T
+ transpose_elems_inplace_s32_4x4(&a4, &a5, &a6, &a7); // B^T
+
+ o0->val[0] = a0;
+ o1->val[0] = a1;
+ o2->val[0] = a2;
+ o3->val[0] = a3;
+
+ o0->val[1] = a4;
+ o1->val[1] = a5;
+ o2->val[1] = a6;
+ o3->val[1] = a7;
+}
+
+static INLINE void transpose_elems_inplace_s32_8x8(
+ int32x4x2_t *a0, int32x4x2_t *a1, int32x4x2_t *a2, int32x4x2_t *a3,
+ int32x4x2_t *a4, int32x4x2_t *a5, int32x4x2_t *a6, int32x4x2_t *a7) {
+ // Perform an 8 x 8 matrix transpose by building on top of the existing 4 x 4
+ // matrix transpose implementation:
+ // [ A B ]^T => [ A^T C^T ]
+ // [ C D ] [ B^T D^T ]
+
+ int32x4_t q0_v1 = a0->val[0];
+ int32x4_t q0_v2 = a1->val[0];
+ int32x4_t q0_v3 = a2->val[0];
+ int32x4_t q0_v4 = a3->val[0];
+
+ int32x4_t q1_v1 = a0->val[1];
+ int32x4_t q1_v2 = a1->val[1];
+ int32x4_t q1_v3 = a2->val[1];
+ int32x4_t q1_v4 = a3->val[1];
+
+ int32x4_t q2_v1 = a4->val[0];
+ int32x4_t q2_v2 = a5->val[0];
+ int32x4_t q2_v3 = a6->val[0];
+ int32x4_t q2_v4 = a7->val[0];
+
+ int32x4_t q3_v1 = a4->val[1];
+ int32x4_t q3_v2 = a5->val[1];
+ int32x4_t q3_v3 = a6->val[1];
+ int32x4_t q3_v4 = a7->val[1];
+
+ transpose_elems_inplace_s32_4x4(&q0_v1, &q0_v2, &q0_v3, &q0_v4); // A^T
+ transpose_elems_inplace_s32_4x4(&q1_v1, &q1_v2, &q1_v3, &q1_v4); // B^T
+ transpose_elems_inplace_s32_4x4(&q2_v1, &q2_v2, &q2_v3, &q2_v4); // C^T
+ transpose_elems_inplace_s32_4x4(&q3_v1, &q3_v2, &q3_v3, &q3_v4); // D^T
+
+ a0->val[0] = q0_v1;
+ a1->val[0] = q0_v2;
+ a2->val[0] = q0_v3;
+ a3->val[0] = q0_v4;
+
+ a0->val[1] = q2_v1;
+ a1->val[1] = q2_v2;
+ a2->val[1] = q2_v3;
+ a3->val[1] = q2_v4;
+
+ a4->val[0] = q1_v1;
+ a5->val[0] = q1_v2;
+ a6->val[0] = q1_v3;
+ a7->val[0] = q1_v4;
+
+ a4->val[1] = q3_v1;
+ a5->val[1] = q3_v2;
+ a6->val[1] = q3_v3;
+ a7->val[1] = q3_v4;
+}
+
+static INLINE void transpose_arrays_s16_4x4(const int16x4_t *const in,
+ int16x4_t *const out) {
+ int16x4_t a0 = in[0];
+ int16x4_t a1 = in[1];
+ int16x4_t a2 = in[2];
+ int16x4_t a3 = in[3];
+
+ transpose_elems_inplace_s16_4x4(&a0, &a1, &a2, &a3);
+
+ out[0] = a0;
+ out[1] = a1;
+ out[2] = a2;
+ out[3] = a3;
+}
+
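+// Transpose eight 4-element rows into four 8-element rows (each output row is
+// one input column). The AArch64 path zips 64-bit lanes directly, while the
+// Armv7 path emulates the 64-bit zips with vextq_s32.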
+static INLINE void transpose_arrays_s16_4x8(const int16x4_t *const in,
+ int16x8_t *const out) {
+#if AOM_ARCH_AARCH64
+ const int16x8_t a0 = vzip1q_s16(vcombine_s16(in[0], vdup_n_s16(0)),
+ vcombine_s16(in[1], vdup_n_s16(0)));
+ const int16x8_t a1 = vzip1q_s16(vcombine_s16(in[2], vdup_n_s16(0)),
+ vcombine_s16(in[3], vdup_n_s16(0)));
+ const int16x8_t a2 = vzip1q_s16(vcombine_s16(in[4], vdup_n_s16(0)),
+ vcombine_s16(in[5], vdup_n_s16(0)));
+ const int16x8_t a3 = vzip1q_s16(vcombine_s16(in[6], vdup_n_s16(0)),
+ vcombine_s16(in[7], vdup_n_s16(0)));
+#else
+ int16x4x2_t temp;
+ temp = vzip_s16(in[0], in[1]);
+ const int16x8_t a0 = vcombine_s16(temp.val[0], temp.val[1]);
+ temp = vzip_s16(in[2], in[3]);
+ const int16x8_t a1 = vcombine_s16(temp.val[0], temp.val[1]);
+ temp = vzip_s16(in[4], in[5]);
+ const int16x8_t a2 = vcombine_s16(temp.val[0], temp.val[1]);
+ temp = vzip_s16(in[6], in[7]);
+ const int16x8_t a3 = vcombine_s16(temp.val[0], temp.val[1]);
+#endif
+
+ const int32x4x2_t b02 =
+ vzipq_s32(vreinterpretq_s32_s16(a0), vreinterpretq_s32_s16(a1));
+ const int32x4x2_t b13 =
+ vzipq_s32(vreinterpretq_s32_s16(a2), vreinterpretq_s32_s16(a3));
+
+#if AOM_ARCH_AARCH64
+ out[0] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[0]),
+ vreinterpretq_s64_s32(b13.val[0])));
+ out[1] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[0]),
+ vreinterpretq_s64_s32(b13.val[0])));
+ out[2] = vreinterpretq_s16_s64(vzip1q_s64(vreinterpretq_s64_s32(b02.val[1]),
+ vreinterpretq_s64_s32(b13.val[1])));
+ out[3] = vreinterpretq_s16_s64(vzip2q_s64(vreinterpretq_s64_s32(b02.val[1]),
+ vreinterpretq_s64_s32(b13.val[1])));
+#else
+ out[0] = vreinterpretq_s16_s32(
+ vextq_s32(vextq_s32(b02.val[0], b02.val[0], 2), b13.val[0], 2));
+ out[2] = vreinterpretq_s16_s32(
+ vextq_s32(vextq_s32(b02.val[1], b02.val[1], 2), b13.val[1], 2));
+ out[1] = vreinterpretq_s16_s32(
+ vextq_s32(b02.val[0], vextq_s32(b13.val[0], b13.val[0], 2), 2));
+ out[3] = vreinterpretq_s16_s32(
+ vextq_s32(b02.val[1], vextq_s32(b13.val[1], b13.val[1], 2), 2));
+#endif
+}
+
+static INLINE void transpose_arrays_s16_8x4(const int16x8_t *const in,
+ int16x4_t *const out) {
+ // Swap 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+
+ const int16x8x2_t b0 = vtrnq_s16(in[0], in[1]);
+ const int16x8x2_t b1 = vtrnq_s16(in[2], in[3]);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+
+ const uint32x4x2_t c0 = vtrnq_u32(vreinterpretq_u32_s16(b0.val[0]),
+ vreinterpretq_u32_s16(b1.val[0]));
+ const uint32x4x2_t c1 = vtrnq_u32(vreinterpretq_u32_s16(b0.val[1]),
+ vreinterpretq_u32_s16(b1.val[1]));
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30
+ // out[1]: 01 11 21 31
+ // out[2]: 02 12 22 32
+ // out[3]: 03 13 23 33
+ // out[4]: 04 14 24 34
+ // out[5]: 05 15 25 35
+ // out[6]: 06 16 26 36
+ // out[7]: 07 17 27 37
+
+ out[0] = vget_low_s16(vreinterpretq_s16_u32(c0.val[0]));
+ out[1] = vget_low_s16(vreinterpretq_s16_u32(c1.val[0]));
+ out[2] = vget_low_s16(vreinterpretq_s16_u32(c0.val[1]));
+ out[3] = vget_low_s16(vreinterpretq_s16_u32(c1.val[1]));
+ out[4] = vget_high_s16(vreinterpretq_s16_u32(c0.val[0]));
+ out[5] = vget_high_s16(vreinterpretq_s16_u32(c1.val[0]));
+ out[6] = vget_high_s16(vreinterpretq_s16_u32(c0.val[1]));
+ out[7] = vget_high_s16(vreinterpretq_s16_u32(c1.val[1]));
+}
+
+#endif // AOM_AOM_DSP_ARM_TRANSPOSE_NEON_H_
diff --git a/third_party/aom/aom_dsp/arm/variance_neon.c b/third_party/aom/aom_dsp/arm/variance_neon.c
new file mode 100644
index 0000000000..9e4e8c0cf0
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/variance_neon.c
@@ -0,0 +1,470 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_ports/mem.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void variance_4xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ int16x8_t sum_s16 = vdupq_n_s16(0);
+ int32x4_t sse_s32 = vdupq_n_s32(0);
+
+ // Number of rows we can process before 'sum_s16' overflows:
+ // 32767 / 255 ~= 128, but we use an 8-wide accumulator; so 256 4-wide rows.
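+ // Each iteration folds two 4-wide rows into the same 8 lanes, so each lane
+ // accumulates at most 128 differences of magnitude <= 255: 255 * 128 = 32640.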
+ assert(h <= 256);
+
+ int i = h;
+ do {
+ uint8x8_t s = load_unaligned_u8(src, src_stride);
+ uint8x8_t r = load_unaligned_u8(ref, ref_stride);
+ int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r));
+
+ sum_s16 = vaddq_s16(sum_s16, diff);
+
+ sse_s32 = vmlal_s16(sse_s32, vget_low_s16(diff), vget_low_s16(diff));
+ sse_s32 = vmlal_s16(sse_s32, vget_high_s16(diff), vget_high_s16(diff));
+
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ *sum = horizontal_add_s16x8(sum_s16);
+ *sse = (uint32_t)horizontal_add_s32x4(sse_s32);
+}
+
+static INLINE void variance_8xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ int16x8_t sum_s16 = vdupq_n_s16(0);
+ int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ // Number of rows we can process before 'sum_s16' overflows:
+ // 32767 / 255 ~= 128
+ assert(h <= 128);
+
+ int i = h;
+ do {
+ uint8x8_t s = vld1_u8(src);
+ uint8x8_t r = vld1_u8(ref);
+ int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s, r));
+
+ sum_s16 = vaddq_s16(sum_s16, diff);
+
+ sse_s32[0] = vmlal_s16(sse_s32[0], vget_low_s16(diff), vget_low_s16(diff));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff), vget_high_s16(diff));
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+
+ *sum = horizontal_add_s16x8(sum_s16);
+ *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_s32[0], sse_s32[1]));
+}
+
+static INLINE void variance_16xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) };
+ int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ // Number of rows we can process before 'sum_s16' accumulators overflow:
+ // 32767 / 255 ~= 128, so 128 16-wide rows.
+ assert(h <= 128);
+
+ int i = h;
+ do {
+ uint8x16_t s = vld1q_u8(src);
+ uint8x16_t r = vld1q_u8(ref);
+
+ int16x8_t diff_l =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r)));
+ int16x8_t diff_h =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r)));
+
+ sum_s16[0] = vaddq_s16(sum_s16[0], diff_l);
+ sum_s16[1] = vaddq_s16(sum_s16[1], diff_h);
+
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l));
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h));
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+
+ *sum = horizontal_add_s16x8(vaddq_s16(sum_s16[0], sum_s16[1]));
+ *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_s32[0], sse_s32[1]));
+}
+
+static INLINE void variance_large_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int w, int h, int h_limit, uint32_t *sse,
+ int *sum) {
+ int32x4_t sum_s32 = vdupq_n_s32(0);
+ int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ // 'h_limit' is the number of 'w'-width rows we can process before our 16-bit
+ // accumulator overflows. After hitting this limit we accumulate into 32-bit
+ // elements.
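+ // Each 'w'-wide row adds w / 16 differences of at most 255 to every 16-bit
+ // lane, so h_limit is roughly 32767 / (255 * w / 16) ~= 2048 / w rows: 64 for
+ // w == 32, 32 for w == 64 and 16 for w == 128.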
+ int h_tmp = h > h_limit ? h_limit : h;
+
+ int i = 0;
+ do {
+ int16x8_t sum_s16[2] = { vdupq_n_s16(0), vdupq_n_s16(0) };
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s = vld1q_u8(src + j);
+ uint8x16_t r = vld1q_u8(ref + j);
+
+ int16x8_t diff_l =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(s), vget_low_u8(r)));
+ int16x8_t diff_h =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(s), vget_high_u8(r)));
+
+ sum_s16[0] = vaddq_s16(sum_s16[0], diff_l);
+ sum_s16[1] = vaddq_s16(sum_s16[1], diff_h);
+
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff_l), vget_low_s16(diff_l));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff_l), vget_high_s16(diff_l));
+ sse_s32[0] =
+ vmlal_s16(sse_s32[0], vget_low_s16(diff_h), vget_low_s16(diff_h));
+ sse_s32[1] =
+ vmlal_s16(sse_s32[1], vget_high_s16(diff_h), vget_high_s16(diff_h));
+
+ j += 16;
+ } while (j < w);
+
+ src += src_stride;
+ ref += ref_stride;
+ i++;
+ } while (i < h_tmp);
+
+ sum_s32 = vpadalq_s16(sum_s32, sum_s16[0]);
+ sum_s32 = vpadalq_s16(sum_s32, sum_s16[1]);
+
+ h_tmp += h_limit;
+ } while (i < h);
+
+ *sum = horizontal_add_s32x4(sum_s32);
+ *sse = (uint32_t)horizontal_add_s32x4(vaddq_s32(sse_s32[0], sse_s32[1]));
+}
+
+static INLINE void variance_32xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon(src, src_stride, ref, ref_stride, 32, h, 64, sse, sum);
+}
+
+static INLINE void variance_64xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon(src, src_stride, ref, ref_stride, 64, h, 32, sse, sum);
+}
+
+static INLINE void variance_128xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
+ variance_large_neon(src, src_stride, ref, ref_stride, 128, h, 16, sse, sum);
+}
+
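+// Variance is computed as SSE - sum^2 / (w * h), where w * h == 1 << shift.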
+#define VARIANCE_WXH_NEON(w, h, shift) \
+ unsigned int aom_variance##w##x##h##_neon( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ variance_##w##xh_neon(src, src_stride, ref, ref_stride, h, sse, &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \
+ }
+
+VARIANCE_WXH_NEON(4, 4, 4)
+VARIANCE_WXH_NEON(4, 8, 5)
+VARIANCE_WXH_NEON(4, 16, 6)
+
+VARIANCE_WXH_NEON(8, 4, 5)
+VARIANCE_WXH_NEON(8, 8, 6)
+VARIANCE_WXH_NEON(8, 16, 7)
+VARIANCE_WXH_NEON(8, 32, 8)
+
+VARIANCE_WXH_NEON(16, 4, 6)
+VARIANCE_WXH_NEON(16, 8, 7)
+VARIANCE_WXH_NEON(16, 16, 8)
+VARIANCE_WXH_NEON(16, 32, 9)
+VARIANCE_WXH_NEON(16, 64, 10)
+
+VARIANCE_WXH_NEON(32, 8, 8)
+VARIANCE_WXH_NEON(32, 16, 9)
+VARIANCE_WXH_NEON(32, 32, 10)
+VARIANCE_WXH_NEON(32, 64, 11)
+
+VARIANCE_WXH_NEON(64, 16, 10)
+VARIANCE_WXH_NEON(64, 32, 11)
+VARIANCE_WXH_NEON(64, 64, 12)
+VARIANCE_WXH_NEON(64, 128, 13)
+
+VARIANCE_WXH_NEON(128, 64, 13)
+VARIANCE_WXH_NEON(128, 128, 14)
+
+#undef VARIANCE_WXH_NEON
+
+// TODO(yunqingwang): Compute the variance of two/four 8x8 blocks at once, as
+// the AVX2 version does. Also, implement in NEON the variance computation
+// present in this function.
+void aom_get_var_sse_sum_8x8_quad_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ uint32_t *sse8x8, int *sum8x8,
+ unsigned int *tot_sse, int *tot_sum,
+ uint32_t *var8x8) {
+ // Loop over four horizontally adjacent 8x8 blocks, processing an 8-row by
+ // 32-column region.
+ for (int k = 0; k < 4; k++) {
+ variance_8xh_neon(src + (k * 8), src_stride, ref + (k * 8), ref_stride, 8,
+ &sse8x8[k], &sum8x8[k]);
+ }
+
+ *tot_sse += sse8x8[0] + sse8x8[1] + sse8x8[2] + sse8x8[3];
+ *tot_sum += sum8x8[0] + sum8x8[1] + sum8x8[2] + sum8x8[3];
+ for (int i = 0; i < 4; i++) {
+ var8x8[i] = sse8x8[i] - (uint32_t)(((int64_t)sum8x8[i] * sum8x8[i]) >> 6);
+ }
+}
+
+void aom_get_var_sse_sum_16x16_dual_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ uint32_t *sse16x16,
+ unsigned int *tot_sse, int *tot_sum,
+ uint32_t *var16x16) {
+ int sum16x16[2] = { 0 };
+ // Loop over two horizontally adjacent 16x16 blocks, processing a 16-row by
+ // 32-column region.
+ for (int k = 0; k < 2; k++) {
+ variance_16xh_neon(src + (k * 16), src_stride, ref + (k * 16), ref_stride,
+ 16, &sse16x16[k], &sum16x16[k]);
+ }
+
+ *tot_sse += sse16x16[0] + sse16x16[1];
+ *tot_sum += sum16x16[0] + sum16x16[1];
+ for (int i = 0; i < 2; i++) {
+ var16x16[i] =
+ sse16x16[i] - (uint32_t)(((int64_t)sum16x16[i] * sum16x16[i]) >> 8);
+ }
+}
+
+static INLINE unsigned int mse8xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse, int h) {
+ uint8x8_t s[2], r[2];
+ int16x4_t diff_lo[2], diff_hi[2];
+ uint16x8_t diff[2];
+ int32x4_t sse_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ int i = h;
+ do {
+ s[0] = vld1_u8(src);
+ src += src_stride;
+ s[1] = vld1_u8(src);
+ src += src_stride;
+ r[0] = vld1_u8(ref);
+ ref += ref_stride;
+ r[1] = vld1_u8(ref);
+ ref += ref_stride;
+
+ diff[0] = vsubl_u8(s[0], r[0]);
+ diff[1] = vsubl_u8(s[1], r[1]);
+
+ diff_lo[0] = vreinterpret_s16_u16(vget_low_u16(diff[0]));
+ diff_lo[1] = vreinterpret_s16_u16(vget_low_u16(diff[1]));
+ sse_s32[0] = vmlal_s16(sse_s32[0], diff_lo[0], diff_lo[0]);
+ sse_s32[1] = vmlal_s16(sse_s32[1], diff_lo[1], diff_lo[1]);
+
+ diff_hi[0] = vreinterpret_s16_u16(vget_high_u16(diff[0]));
+ diff_hi[1] = vreinterpret_s16_u16(vget_high_u16(diff[1]));
+ sse_s32[0] = vmlal_s16(sse_s32[0], diff_hi[0], diff_hi[0]);
+ sse_s32[1] = vmlal_s16(sse_s32[1], diff_hi[1], diff_hi[1]);
+
+ i -= 2;
+ } while (i != 0);
+
+ sse_s32[0] = vaddq_s32(sse_s32[0], sse_s32[1]);
+
+ *sse = horizontal_add_u32x4(vreinterpretq_u32_s32(sse_s32[0]));
+ return horizontal_add_u32x4(vreinterpretq_u32_s32(sse_s32[0]));
+}
+
+static INLINE unsigned int mse16xh_neon(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse, int h) {
+ uint8x16_t s[2], r[2];
+ int16x4_t diff_lo[4], diff_hi[4];
+ uint16x8_t diff[4];
+ int32x4_t sse_s32[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+ vdupq_n_s32(0) };
+
+ int i = h;
+ do {
+ s[0] = vld1q_u8(src);
+ src += src_stride;
+ s[1] = vld1q_u8(src);
+ src += src_stride;
+ r[0] = vld1q_u8(ref);
+ ref += ref_stride;
+ r[1] = vld1q_u8(ref);
+ ref += ref_stride;
+
+ diff[0] = vsubl_u8(vget_low_u8(s[0]), vget_low_u8(r[0]));
+ diff[1] = vsubl_u8(vget_high_u8(s[0]), vget_high_u8(r[0]));
+ diff[2] = vsubl_u8(vget_low_u8(s[1]), vget_low_u8(r[1]));
+ diff[3] = vsubl_u8(vget_high_u8(s[1]), vget_high_u8(r[1]));
+
+ diff_lo[0] = vreinterpret_s16_u16(vget_low_u16(diff[0]));
+ diff_lo[1] = vreinterpret_s16_u16(vget_low_u16(diff[1]));
+ sse_s32[0] = vmlal_s16(sse_s32[0], diff_lo[0], diff_lo[0]);
+ sse_s32[1] = vmlal_s16(sse_s32[1], diff_lo[1], diff_lo[1]);
+
+ diff_lo[2] = vreinterpret_s16_u16(vget_low_u16(diff[2]));
+ diff_lo[3] = vreinterpret_s16_u16(vget_low_u16(diff[3]));
+ sse_s32[2] = vmlal_s16(sse_s32[2], diff_lo[2], diff_lo[2]);
+ sse_s32[3] = vmlal_s16(sse_s32[3], diff_lo[3], diff_lo[3]);
+
+ diff_hi[0] = vreinterpret_s16_u16(vget_high_u16(diff[0]));
+ diff_hi[1] = vreinterpret_s16_u16(vget_high_u16(diff[1]));
+ sse_s32[0] = vmlal_s16(sse_s32[0], diff_hi[0], diff_hi[0]);
+ sse_s32[1] = vmlal_s16(sse_s32[1], diff_hi[1], diff_hi[1]);
+
+ diff_hi[2] = vreinterpret_s16_u16(vget_high_u16(diff[2]));
+ diff_hi[3] = vreinterpret_s16_u16(vget_high_u16(diff[3]));
+ sse_s32[2] = vmlal_s16(sse_s32[2], diff_hi[2], diff_hi[2]);
+ sse_s32[3] = vmlal_s16(sse_s32[3], diff_hi[3], diff_hi[3]);
+
+ i -= 2;
+ } while (i != 0);
+
+ sse_s32[0] = vaddq_s32(sse_s32[0], sse_s32[1]);
+ sse_s32[2] = vaddq_s32(sse_s32[2], sse_s32[3]);
+ sse_s32[0] = vaddq_s32(sse_s32[0], sse_s32[2]);
+
+ *sse = horizontal_add_u32x4(vreinterpretq_u32_s32(sse_s32[0]));
+ return horizontal_add_u32x4(vreinterpretq_u32_s32(sse_s32[0]));
+}
+
+#define MSE_WXH_NEON(w, h) \
+ unsigned int aom_mse##w##x##h##_neon(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ return mse##w##xh_neon(src, src_stride, ref, ref_stride, sse, h); \
+ }
+
+MSE_WXH_NEON(8, 8)
+MSE_WXH_NEON(8, 16)
+
+MSE_WXH_NEON(16, 8)
+MSE_WXH_NEON(16, 16)
+
+#undef MSE_WXH_NEON
+
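+// Accumulate the squared differences between two 8-wide rows of 16-bit source
+// and 8-bit destination pixels into a pair of 64-bit accumulator lanes.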
+static INLINE uint64x2_t mse_accumulate_u16_u8_8x2(uint64x2_t sum,
+ uint16x8_t s0, uint16x8_t s1,
+ uint8x8_t d0, uint8x8_t d1) {
+ int16x8_t e0 = vreinterpretq_s16_u16(vsubw_u8(s0, d0));
+ int16x8_t e1 = vreinterpretq_s16_u16(vsubw_u8(s1, d1));
+
+ int32x4_t mse = vmull_s16(vget_low_s16(e0), vget_low_s16(e0));
+ mse = vmlal_s16(mse, vget_high_s16(e0), vget_high_s16(e0));
+ mse = vmlal_s16(mse, vget_low_s16(e1), vget_low_s16(e1));
+ mse = vmlal_s16(mse, vget_high_s16(e1), vget_high_s16(e1));
+
+ return vpadalq_u32(sum, vreinterpretq_u32_s32(mse));
+}
+
+static uint64x2_t mse_wxh_16bit(uint8_t *dst, int dstride, const uint16_t *src,
+ int sstride, int w, int h) {
+ assert((w == 8 || w == 4) && (h == 8 || h == 4));
+
+ uint64x2_t sum = vdupq_n_u64(0);
+
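+ // For w == 8, process two full rows per iteration. For w == 4, pack two
+ // 4-wide rows into each 8-lane vector and process four rows per iteration.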
+ if (w == 8) {
+ do {
+ uint8x8_t d0 = vld1_u8(dst + 0 * dstride);
+ uint8x8_t d1 = vld1_u8(dst + 1 * dstride);
+ uint16x8_t s0 = vld1q_u16(src + 0 * sstride);
+ uint16x8_t s1 = vld1q_u16(src + 1 * sstride);
+
+ sum = mse_accumulate_u16_u8_8x2(sum, s0, s1, d0, d1);
+
+ dst += 2 * dstride;
+ src += 2 * sstride;
+ h -= 2;
+ } while (h != 0);
+ } else {
+ do {
+ uint8x8_t d0 = load_unaligned_u8_4x2(dst + 0 * dstride, dstride);
+ uint8x8_t d1 = load_unaligned_u8_4x2(dst + 2 * dstride, dstride);
+ uint16x8_t s0 = load_unaligned_u16_4x2(src + 0 * sstride, sstride);
+ uint16x8_t s1 = load_unaligned_u16_4x2(src + 2 * sstride, sstride);
+
+ sum = mse_accumulate_u16_u8_8x2(sum, s0, s1, d0, d1);
+
+ dst += 4 * dstride;
+ src += 4 * sstride;
+ h -= 4;
+ } while (h != 0);
+ }
+
+ return sum;
+}
+
+// Computes mse for a given block size. This function gets called for specific
+// block sizes, which are 8x8, 8x4, 4x8 and 4x4.
+uint64_t aom_mse_wxh_16bit_neon(uint8_t *dst, int dstride, uint16_t *src,
+ int sstride, int w, int h) {
+ return horizontal_add_u64x2(mse_wxh_16bit(dst, dstride, src, sstride, w, h));
+}
+
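+// Sum of squares of 256 consecutive 16-bit values (one 16x16 block).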
+uint32_t aom_get_mb_ss_neon(const int16_t *a) {
+ int32x4_t sse[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ for (int i = 0; i < 256; i = i + 8) {
+ int16x8_t a_s16 = vld1q_s16(a + i);
+
+ sse[0] = vmlal_s16(sse[0], vget_low_s16(a_s16), vget_low_s16(a_s16));
+ sse[1] = vmlal_s16(sse[1], vget_high_s16(a_s16), vget_high_s16(a_s16));
+ }
+
+ return horizontal_add_s32x4(vaddq_s32(sse[0], sse[1]));
+}
+
+uint64_t aom_mse_16xh_16bit_neon(uint8_t *dst, int dstride, uint16_t *src,
+ int w, int h) {
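+ // The source is laid out as 16 / w contiguous w x h blocks (stride w); the
+ // destination is a single surface stepped across in w-pixel columns.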
+ uint64x2_t sum = vdupq_n_u64(0);
+
+ int num_blks = 16 / w;
+ do {
+ sum = vaddq_u64(sum, mse_wxh_16bit(dst, dstride, src, w, w, h));
+ dst += w;
+ src += w * h;
+ } while (--num_blks != 0);
+
+ return horizontal_add_u64x2(sum);
+}
diff --git a/third_party/aom/aom_dsp/arm/variance_neon_dotprod.c b/third_party/aom/aom_dsp/arm/variance_neon_dotprod.c
new file mode 100644
index 0000000000..9fb52e1df7
--- /dev/null
+++ b/third_party/aom/aom_dsp/arm/variance_neon_dotprod.c
@@ -0,0 +1,314 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "aom_ports/mem.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void variance_4xh_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
+ uint32x4_t src_sum = vdupq_n_u32(0);
+ uint32x4_t ref_sum = vdupq_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
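+ // Sum the source and reference blocks separately using vdotq_u32 with a
+ // vector of ones and take the difference at the end, which equals the sum of
+ // per-pixel differences. The SSE is accumulated from squared absolute
+ // differences.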
+ int i = h;
+ do {
+ uint8x16_t s = load_unaligned_u8q(src, src_stride);
+ uint8x16_t r = load_unaligned_u8q(ref, ref_stride);
+
+ src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+ ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ src += 4 * src_stride;
+ ref += 4 * ref_stride;
+ i -= 4;
+ } while (i != 0);
+
+ int32x4_t sum_diff =
+ vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
+ *sum = horizontal_add_s32x4(sum_diff);
+ *sse = horizontal_add_u32x4(sse_u32);
+}
+
+static INLINE void variance_8xh_neon_dotprod(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int h, uint32_t *sse, int *sum) {
+ uint32x4_t src_sum = vdupq_n_u32(0);
+ uint32x4_t ref_sum = vdupq_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint8x16_t s = vcombine_u8(vld1_u8(src), vld1_u8(src + src_stride));
+ uint8x16_t r = vcombine_u8(vld1_u8(ref), vld1_u8(ref + ref_stride));
+
+ src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+ ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ int32x4_t sum_diff =
+ vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
+ *sum = horizontal_add_s32x4(sum_diff);
+ *sse = horizontal_add_u32x4(sse_u32);
+}
+
+static INLINE void variance_16xh_neon_dotprod(const uint8_t *src,
+ int src_stride,
+ const uint8_t *ref,
+ int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ uint32x4_t src_sum = vdupq_n_u32(0);
+ uint32x4_t ref_sum = vdupq_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint8x16_t s = vld1q_u8(src);
+ uint8x16_t r = vld1q_u8(ref);
+
+ src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+ ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+
+ int32x4_t sum_diff =
+ vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
+ *sum = horizontal_add_s32x4(sum_diff);
+ *sse = horizontal_add_u32x4(sse_u32);
+}
+
+static INLINE void variance_large_neon_dotprod(const uint8_t *src,
+ int src_stride,
+ const uint8_t *ref,
+ int ref_stride, int w, int h,
+ uint32_t *sse, int *sum) {
+ uint32x4_t src_sum = vdupq_n_u32(0);
+ uint32x4_t ref_sum = vdupq_n_u32(0);
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ int j = 0;
+ do {
+ uint8x16_t s = vld1q_u8(src + j);
+ uint8x16_t r = vld1q_u8(ref + j);
+
+ src_sum = vdotq_u32(src_sum, s, vdupq_n_u8(1));
+ ref_sum = vdotq_u32(ref_sum, r, vdupq_n_u8(1));
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ j += 16;
+ } while (j < w);
+
+ src += src_stride;
+ ref += ref_stride;
+ } while (--i != 0);
+
+ int32x4_t sum_diff =
+ vsubq_s32(vreinterpretq_s32_u32(src_sum), vreinterpretq_s32_u32(ref_sum));
+ *sum = horizontal_add_s32x4(sum_diff);
+ *sse = horizontal_add_u32x4(sse_u32);
+}
+
+static INLINE void variance_32xh_neon_dotprod(const uint8_t *src,
+ int src_stride,
+ const uint8_t *ref,
+ int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 32, h, sse,
+ sum);
+}
+
+static INLINE void variance_64xh_neon_dotprod(const uint8_t *src,
+ int src_stride,
+ const uint8_t *ref,
+ int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 64, h, sse,
+ sum);
+}
+
+static INLINE void variance_128xh_neon_dotprod(const uint8_t *src,
+ int src_stride,
+ const uint8_t *ref,
+ int ref_stride, int h,
+ uint32_t *sse, int *sum) {
+ variance_large_neon_dotprod(src, src_stride, ref, ref_stride, 128, h, sse,
+ sum);
+}
+
+#define VARIANCE_WXH_NEON_DOTPROD(w, h, shift) \
+ unsigned int aom_variance##w##x##h##_neon_dotprod( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ int sum; \
+ variance_##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, h, sse, \
+ &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \
+ }
+
+VARIANCE_WXH_NEON_DOTPROD(4, 4, 4)
+VARIANCE_WXH_NEON_DOTPROD(4, 8, 5)
+VARIANCE_WXH_NEON_DOTPROD(4, 16, 6)
+
+VARIANCE_WXH_NEON_DOTPROD(8, 4, 5)
+VARIANCE_WXH_NEON_DOTPROD(8, 8, 6)
+VARIANCE_WXH_NEON_DOTPROD(8, 16, 7)
+VARIANCE_WXH_NEON_DOTPROD(8, 32, 8)
+
+VARIANCE_WXH_NEON_DOTPROD(16, 4, 6)
+VARIANCE_WXH_NEON_DOTPROD(16, 8, 7)
+VARIANCE_WXH_NEON_DOTPROD(16, 16, 8)
+VARIANCE_WXH_NEON_DOTPROD(16, 32, 9)
+VARIANCE_WXH_NEON_DOTPROD(16, 64, 10)
+
+VARIANCE_WXH_NEON_DOTPROD(32, 8, 8)
+VARIANCE_WXH_NEON_DOTPROD(32, 16, 9)
+VARIANCE_WXH_NEON_DOTPROD(32, 32, 10)
+VARIANCE_WXH_NEON_DOTPROD(32, 64, 11)
+
+VARIANCE_WXH_NEON_DOTPROD(64, 16, 10)
+VARIANCE_WXH_NEON_DOTPROD(64, 32, 11)
+VARIANCE_WXH_NEON_DOTPROD(64, 64, 12)
+VARIANCE_WXH_NEON_DOTPROD(64, 128, 13)
+
+VARIANCE_WXH_NEON_DOTPROD(128, 64, 13)
+VARIANCE_WXH_NEON_DOTPROD(128, 128, 14)
+
+#undef VARIANCE_WXH_NEON_DOTPROD
+
+void aom_get_var_sse_sum_8x8_quad_neon_dotprod(
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
+ uint32_t *sse8x8, int *sum8x8, unsigned int *tot_sse, int *tot_sum,
+ uint32_t *var8x8) {
+ // Loop over four horizontally adjacent 8x8 blocks, processing an 8-row by
+ // 32-column region.
+ for (int k = 0; k < 4; k++) {
+ variance_8xh_neon_dotprod(src + (k * 8), src_stride, ref + (k * 8),
+ ref_stride, 8, &sse8x8[k], &sum8x8[k]);
+ }
+
+ *tot_sse += sse8x8[0] + sse8x8[1] + sse8x8[2] + sse8x8[3];
+ *tot_sum += sum8x8[0] + sum8x8[1] + sum8x8[2] + sum8x8[3];
+ for (int i = 0; i < 4; i++) {
+ var8x8[i] = sse8x8[i] - (uint32_t)(((int64_t)sum8x8[i] * sum8x8[i]) >> 6);
+ }
+}
+
+void aom_get_var_sse_sum_16x16_dual_neon_dotprod(
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
+ uint32_t *sse16x16, unsigned int *tot_sse, int *tot_sum,
+ uint32_t *var16x16) {
+ int sum16x16[2] = { 0 };
+ // Loop over two horizontally adjacent 16x16 blocks, processing a 16-row by
+ // 32-column region.
+ for (int k = 0; k < 2; k++) {
+ variance_16xh_neon_dotprod(src + (k * 16), src_stride, ref + (k * 16),
+ ref_stride, 16, &sse16x16[k], &sum16x16[k]);
+ }
+
+ *tot_sse += sse16x16[0] + sse16x16[1];
+ *tot_sum += sum16x16[0] + sum16x16[1];
+ for (int i = 0; i < 2; i++) {
+ var16x16[i] =
+ sse16x16[i] - (uint32_t)(((int64_t)sum16x16[i] * sum16x16[i]) >> 8);
+ }
+}
+
+static INLINE unsigned int mse8xh_neon_dotprod(const uint8_t *src,
+ int src_stride,
+ const uint8_t *ref,
+ int ref_stride,
+ unsigned int *sse, int h) {
+ uint32x4_t sse_u32 = vdupq_n_u32(0);
+
+ int i = h;
+ do {
+ uint8x16_t s = vcombine_u8(vld1_u8(src), vld1_u8(src + src_stride));
+ uint8x16_t r = vcombine_u8(vld1_u8(ref), vld1_u8(ref + ref_stride));
+
+ uint8x16_t abs_diff = vabdq_u8(s, r);
+
+ sse_u32 = vdotq_u32(sse_u32, abs_diff, abs_diff);
+
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ *sse = horizontal_add_u32x4(sse_u32);
+ return horizontal_add_u32x4(sse_u32);
+}
+
+static INLINE unsigned int mse16xh_neon_dotprod(const uint8_t *src,
+ int src_stride,
+ const uint8_t *ref,
+ int ref_stride,
+ unsigned int *sse, int h) {
+ uint32x4_t sse_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) };
+
+ int i = h;
+ do {
+ uint8x16_t s0 = vld1q_u8(src);
+ uint8x16_t s1 = vld1q_u8(src + src_stride);
+ uint8x16_t r0 = vld1q_u8(ref);
+ uint8x16_t r1 = vld1q_u8(ref + ref_stride);
+
+ uint8x16_t abs_diff0 = vabdq_u8(s0, r0);
+ uint8x16_t abs_diff1 = vabdq_u8(s1, r1);
+
+ sse_u32[0] = vdotq_u32(sse_u32[0], abs_diff0, abs_diff0);
+ sse_u32[1] = vdotq_u32(sse_u32[1], abs_diff1, abs_diff1);
+
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ i -= 2;
+ } while (i != 0);
+
+ *sse = horizontal_add_u32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+ return horizontal_add_u32x4(vaddq_u32(sse_u32[0], sse_u32[1]));
+}
+
+#define MSE_WXH_NEON_DOTPROD(w, h) \
+ unsigned int aom_mse##w##x##h##_neon_dotprod( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ return mse##w##xh_neon_dotprod(src, src_stride, ref, ref_stride, sse, h); \
+ }
+
+MSE_WXH_NEON_DOTPROD(8, 8)
+MSE_WXH_NEON_DOTPROD(8, 16)
+
+MSE_WXH_NEON_DOTPROD(16, 8)
+MSE_WXH_NEON_DOTPROD(16, 16)
+
+#undef MSE_WXH_NEON_DOTPROD
diff --git a/third_party/aom/aom_dsp/avg.c b/third_party/aom/aom_dsp/avg.c
new file mode 100644
index 0000000000..893f9c2f65
--- /dev/null
+++ b/third_party/aom/aom_dsp/avg.c
@@ -0,0 +1,573 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_ports/mem.h"
+
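+// Find the minimum and maximum absolute difference between two 8x8 blocks.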
+void aom_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
+ int *min, int *max) {
+ int i, j;
+ *min = 255;
+ *max = 0;
+ for (i = 0; i < 8; ++i, s += p, d += dp) {
+ for (j = 0; j < 8; ++j) {
+ int diff = abs(s[j] - d[j]);
+ *min = diff < *min ? diff : *min;
+ *max = diff > *max ? diff : *max;
+ }
+ }
+}
+
+unsigned int aom_avg_4x4_c(const uint8_t *s, int p) {
+ int i, j;
+ int sum = 0;
+ for (i = 0; i < 4; ++i, s += p)
+ for (j = 0; j < 4; sum += s[j], ++j) {
+ }
+
+ return (sum + 8) >> 4;
+}
+
+unsigned int aom_avg_8x8_c(const uint8_t *s, int p) {
+ int i, j;
+ int sum = 0;
+ for (i = 0; i < 8; ++i, s += p)
+ for (j = 0; j < 8; sum += s[j], ++j) {
+ }
+
+ return (sum + 32) >> 6;
+}
+
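+// Compute the rounded average of each of the four 8x8 sub-blocks of the 16x16
+// block at (x16_idx, y16_idx).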
+void aom_avg_8x8_quad_c(const uint8_t *s, int p, int x16_idx, int y16_idx,
+ int *avg) {
+ for (int k = 0; k < 4; k++) {
+ const int x8_idx = x16_idx + ((k & 1) << 3);
+ const int y8_idx = y16_idx + ((k >> 1) << 3);
+ const uint8_t *s_tmp = s + y8_idx * p + x8_idx;
+ avg[k] = aom_avg_8x8_c(s_tmp, p);
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+unsigned int aom_highbd_avg_8x8_c(const uint8_t *s8, int p) {
+ int i, j;
+ int sum = 0;
+ const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
+ for (i = 0; i < 8; ++i, s += p)
+ for (j = 0; j < 8; sum += s[j], ++j) {
+ }
+
+ return (sum + 32) >> 6;
+}
+
+unsigned int aom_highbd_avg_4x4_c(const uint8_t *s8, int p) {
+ int i, j;
+ int sum = 0;
+ const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
+ for (i = 0; i < 4; ++i, s += p)
+ for (j = 0; j < 4; sum += s[j], ++j) {
+ }
+
+ return (sum + 8) >> 4;
+}
+
+void aom_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
+ int dp, int *min, int *max) {
+ int i, j;
+ const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
+ const uint16_t *d = CONVERT_TO_SHORTPTR(d8);
+ *min = 65535;
+ *max = 0;
+ for (i = 0; i < 8; ++i, s += p, d += dp) {
+ for (j = 0; j < 8; ++j) {
+ int diff = abs(s[j] - d[j]);
+ *min = diff < *min ? diff : *min;
+ *max = diff > *max ? diff : *max;
+ }
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+static void hadamard_col4(const int16_t *src_diff, ptrdiff_t src_stride,
+ int16_t *coeff) {
+ int16_t b0 = (src_diff[0 * src_stride] + src_diff[1 * src_stride]) >> 1;
+ int16_t b1 = (src_diff[0 * src_stride] - src_diff[1 * src_stride]) >> 1;
+ int16_t b2 = (src_diff[2 * src_stride] + src_diff[3 * src_stride]) >> 1;
+ int16_t b3 = (src_diff[2 * src_stride] - src_diff[3 * src_stride]) >> 1;
+
+ coeff[0] = b0 + b2;
+ coeff[1] = b1 + b3;
+ coeff[2] = b0 - b2;
+ coeff[3] = b1 - b3;
+}
+
+void aom_hadamard_4x4_c(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ int idx;
+ int16_t buffer[16];
+ int16_t buffer2[16];
+ int16_t *tmp_buf = &buffer[0];
+ for (idx = 0; idx < 4; ++idx) {
+ hadamard_col4(src_diff, src_stride, tmp_buf); // src_diff: 9 bit
+ // dynamic range [-255, 255]
+ tmp_buf += 4;
+ ++src_diff;
+ }
+
+ tmp_buf = &buffer[0];
+ for (idx = 0; idx < 4; ++idx) {
+ hadamard_col4(tmp_buf, 4, buffer2 + 4 * idx); // tmp_buf: 12 bit
+ // dynamic range [-2040, 2040]
+ // buffer2: 15 bit
+ // dynamic range [-16320, 16320]
+ ++tmp_buf;
+ }
+
+ // Extra transpose to match SSE2 behavior (i.e., aom_hadamard_4x4_sse2).
+ for (int i = 0; i < 4; i++) {
+ for (int j = 0; j < 4; j++) {
+ coeff[i * 4 + j] = (tran_low_t)buffer2[j * 4 + i];
+ }
+ }
+}
+
+// src_diff: first pass, 9 bit, dynamic range [-255, 255]
+// second pass, 12 bit, dynamic range [-2040, 2040]
+static void hadamard_col8(const int16_t *src_diff, ptrdiff_t src_stride,
+ int16_t *coeff) {
+ int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
+ int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
+ int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
+ int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
+ int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
+ int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
+ int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
+ int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
+
+ int16_t c0 = b0 + b2;
+ int16_t c1 = b1 + b3;
+ int16_t c2 = b0 - b2;
+ int16_t c3 = b1 - b3;
+ int16_t c4 = b4 + b6;
+ int16_t c5 = b5 + b7;
+ int16_t c6 = b4 - b6;
+ int16_t c7 = b5 - b7;
+
+ coeff[0] = c0 + c4;
+ coeff[7] = c1 + c5;
+ coeff[3] = c2 + c6;
+ coeff[4] = c3 + c7;
+ coeff[2] = c0 - c4;
+ coeff[6] = c1 - c5;
+ coeff[1] = c2 - c6;
+ coeff[5] = c3 - c7;
+}
+
+void aom_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ int idx;
+ int16_t buffer[64];
+ int16_t buffer2[64];
+ int16_t *tmp_buf = &buffer[0];
+ for (idx = 0; idx < 8; ++idx) {
+ hadamard_col8(src_diff, src_stride, tmp_buf); // src_diff: 9 bit
+ // dynamic range [-255, 255]
+ tmp_buf += 8;
+ ++src_diff;
+ }
+
+ tmp_buf = &buffer[0];
+ for (idx = 0; idx < 8; ++idx) {
+ hadamard_col8(tmp_buf, 8, buffer2 + 8 * idx); // tmp_buf: 12 bit
+ // dynamic range [-2040, 2040]
+ // buffer2: 15 bit
+ // dynamic range [-16320, 16320]
+ ++tmp_buf;
+ }
+
+  // Extra transpose to match SSE2 behavior (i.e., aom_hadamard_8x8_sse2).
+ for (int i = 0; i < 8; i++) {
+ for (int j = 0; j < 8; j++) {
+ coeff[i * 8 + j] = (tran_low_t)buffer2[j * 8 + i];
+ }
+ }
+}
+
+void aom_hadamard_lp_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,
+ int16_t *coeff) {
+ int16_t buffer[64];
+ int16_t buffer2[64];
+ int16_t *tmp_buf = &buffer[0];
+ for (int idx = 0; idx < 8; ++idx) {
+ hadamard_col8(src_diff, src_stride, tmp_buf); // src_diff: 9 bit
+ // dynamic range [-255, 255]
+ tmp_buf += 8;
+ ++src_diff;
+ }
+
+ tmp_buf = &buffer[0];
+ for (int idx = 0; idx < 8; ++idx) {
+ hadamard_col8(tmp_buf, 8, buffer2 + 8 * idx); // tmp_buf: 12 bit
+ // dynamic range [-2040, 2040]
+ // buffer2: 15 bit
+ // dynamic range [-16320, 16320]
+ ++tmp_buf;
+ }
+
+ for (int idx = 0; idx < 64; ++idx) coeff[idx] = buffer2[idx];
+
+  // Extra transpose to match SSE2 behavior (i.e., aom_hadamard_lp_8x8_sse2).
+ for (int i = 0; i < 8; i++) {
+ for (int j = 0; j < 8; j++) {
+ coeff[i * 8 + j] = buffer2[j * 8 + i];
+ }
+ }
+}
+
+void aom_hadamard_lp_8x8_dual_c(const int16_t *src_diff, ptrdiff_t src_stride,
+ int16_t *coeff) {
+ for (int i = 0; i < 2; i++) {
+ aom_hadamard_lp_8x8_c(src_diff + (i * 8), src_stride,
+ (int16_t *)coeff + (i * 64));
+ }
+}
+
+// In place 16x16 2D Hadamard transform
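+// It is built from four 8x8 Hadamard transforms followed by a 4-point Hadamard
+// across the four sub-blocks; each butterfly output is halved (>> 1) so the
+// result stays within 16-bit dynamic range.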
+void aom_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ int idx;
+ for (idx = 0; idx < 4; ++idx) {
+ // src_diff: 9 bit, dynamic range [-255, 255]
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+ aom_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
+ }
+
+ // coeff: 15 bit, dynamic range [-16320, 16320]
+ for (idx = 0; idx < 64; ++idx) {
+ tran_low_t a0 = coeff[0];
+ tran_low_t a1 = coeff[64];
+ tran_low_t a2 = coeff[128];
+ tran_low_t a3 = coeff[192];
+
+ tran_low_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640]
+ tran_low_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range
+ tran_low_t b2 = (a2 + a3) >> 1; // [-16320, 16320]
+ tran_low_t b3 = (a2 - a3) >> 1;
+
+ coeff[0] = b0 + b2; // 16 bit, [-32640, 32640]
+ coeff[64] = b1 + b3;
+ coeff[128] = b0 - b2;
+ coeff[192] = b1 - b3;
+
+ ++coeff;
+ }
+
+ coeff -= 64;
+ // Extra shift to match AVX2 output (i.e., aom_hadamard_16x16_avx2).
+  // Note that this step is not needed to match the SSE2 output.
+ for (int i = 0; i < 16; i++) {
+ for (int j = 0; j < 4; j++) {
+ tran_low_t temp = coeff[i * 16 + 4 + j];
+ coeff[i * 16 + 4 + j] = coeff[i * 16 + 8 + j];
+ coeff[i * 16 + 8 + j] = temp;
+ }
+ }
+}
+
+void aom_hadamard_lp_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride,
+ int16_t *coeff) {
+ for (int idx = 0; idx < 4; ++idx) {
+ // src_diff: 9 bit, dynamic range [-255, 255]
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+ aom_hadamard_lp_8x8_c(src_ptr, src_stride, coeff + idx * 64);
+ }
+
+ for (int idx = 0; idx < 64; ++idx) {
+ int16_t a0 = coeff[0];
+ int16_t a1 = coeff[64];
+ int16_t a2 = coeff[128];
+ int16_t a3 = coeff[192];
+
+ int16_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640]
+ int16_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range
+ int16_t b2 = (a2 + a3) >> 1; // [-16320, 16320]
+ int16_t b3 = (a2 - a3) >> 1;
+
+ coeff[0] = b0 + b2; // 16 bit, [-32640, 32640]
+ coeff[64] = b1 + b3;
+ coeff[128] = b0 - b2;
+ coeff[192] = b1 - b3;
+
+ ++coeff;
+ }
+}
+
+void aom_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ int idx;
+ for (idx = 0; idx < 4; ++idx) {
+ // src_diff: 9 bit, dynamic range [-255, 255]
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+ aom_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256);
+ }
+
+ // coeff: 16 bit, dynamic range [-32768, 32767]
+ for (idx = 0; idx < 256; ++idx) {
+ tran_low_t a0 = coeff[0];
+ tran_low_t a1 = coeff[256];
+ tran_low_t a2 = coeff[512];
+ tran_low_t a3 = coeff[768];
+
+ tran_low_t b0 = (a0 + a1) >> 2; // (a0 + a1): 17 bit, [-65536, 65535]
+ tran_low_t b1 = (a0 - a1) >> 2; // b0-b3: 15 bit, dynamic range
+ tran_low_t b2 = (a2 + a3) >> 2; // [-16384, 16383]
+ tran_low_t b3 = (a2 - a3) >> 2;
+
+ coeff[0] = b0 + b2; // 16 bit, [-32768, 32767]
+ coeff[256] = b1 + b3;
+ coeff[512] = b0 - b2;
+ coeff[768] = b1 - b3;
+
+ ++coeff;
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static void hadamard_highbd_col8_first_pass(const int16_t *src_diff,
+ ptrdiff_t src_stride,
+ int16_t *coeff) {
+ int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
+ int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
+ int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
+ int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
+ int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
+ int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
+ int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
+ int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
+
+ int16_t c0 = b0 + b2;
+ int16_t c1 = b1 + b3;
+ int16_t c2 = b0 - b2;
+ int16_t c3 = b1 - b3;
+ int16_t c4 = b4 + b6;
+ int16_t c5 = b5 + b7;
+ int16_t c6 = b4 - b6;
+ int16_t c7 = b5 - b7;
+
+ coeff[0] = c0 + c4;
+ coeff[7] = c1 + c5;
+ coeff[3] = c2 + c6;
+ coeff[4] = c3 + c7;
+ coeff[2] = c0 - c4;
+ coeff[6] = c1 - c5;
+ coeff[1] = c2 - c6;
+ coeff[5] = c3 - c7;
+}
+
+// src_diff: 16 bit, dynamic range [-32760, 32760]
+// coeff: 19 bit
+static void hadamard_highbd_col8_second_pass(const int16_t *src_diff,
+ ptrdiff_t src_stride,
+ int32_t *coeff) {
+ int32_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
+ int32_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
+ int32_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
+ int32_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
+ int32_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
+ int32_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
+ int32_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
+ int32_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
+
+ int32_t c0 = b0 + b2;
+ int32_t c1 = b1 + b3;
+ int32_t c2 = b0 - b2;
+ int32_t c3 = b1 - b3;
+ int32_t c4 = b4 + b6;
+ int32_t c5 = b5 + b7;
+ int32_t c6 = b4 - b6;
+ int32_t c7 = b5 - b7;
+
+ coeff[0] = c0 + c4;
+ coeff[7] = c1 + c5;
+ coeff[3] = c2 + c6;
+ coeff[4] = c3 + c7;
+ coeff[2] = c0 - c4;
+ coeff[6] = c1 - c5;
+ coeff[1] = c2 - c6;
+ coeff[5] = c3 - c7;
+}
+
+// The order of the output coefficients of the Hadamard transform is not
+// important, so for optimization purposes the final transpose may be skipped.
+void aom_highbd_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ int idx;
+ int16_t buffer[64];
+ int32_t buffer2[64];
+ int16_t *tmp_buf = &buffer[0];
+ for (idx = 0; idx < 8; ++idx) {
+ // src_diff: 13 bit
+ // buffer: 16 bit, dynamic range [-32760, 32760]
+ hadamard_highbd_col8_first_pass(src_diff, src_stride, tmp_buf);
+ tmp_buf += 8;
+ ++src_diff;
+ }
+
+ tmp_buf = &buffer[0];
+ for (idx = 0; idx < 8; ++idx) {
+ // buffer: 16 bit
+ // buffer2: 19 bit, dynamic range [-262080, 262080]
+ hadamard_highbd_col8_second_pass(tmp_buf, 8, buffer2 + 8 * idx);
+ ++tmp_buf;
+ }
+
+ for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx];
+}
+
+// In place 16x16 2D Hadamard transform
+void aom_highbd_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ int idx;
+ for (idx = 0; idx < 4; ++idx) {
+ // src_diff: 13 bit, dynamic range [-4095, 4095]
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+ aom_highbd_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
+ }
+
+ // coeff: 19 bit, dynamic range [-262080, 262080]
+ for (idx = 0; idx < 64; ++idx) {
+ tran_low_t a0 = coeff[0];
+ tran_low_t a1 = coeff[64];
+ tran_low_t a2 = coeff[128];
+ tran_low_t a3 = coeff[192];
+
+ tran_low_t b0 = (a0 + a1) >> 1;
+ tran_low_t b1 = (a0 - a1) >> 1;
+ tran_low_t b2 = (a2 + a3) >> 1;
+ tran_low_t b3 = (a2 - a3) >> 1;
+
+ // new coeff dynamic range: 20 bit
+ coeff[0] = b0 + b2;
+ coeff[64] = b1 + b3;
+ coeff[128] = b0 - b2;
+ coeff[192] = b1 - b3;
+
+ ++coeff;
+ }
+}
+
+void aom_highbd_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ int idx;
+ for (idx = 0; idx < 4; ++idx) {
+ // src_diff: 13 bit, dynamic range [-4095, 4095]
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+ aom_highbd_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256);
+ }
+
+ // coeff: 20 bit
+ for (idx = 0; idx < 256; ++idx) {
+ tran_low_t a0 = coeff[0];
+ tran_low_t a1 = coeff[256];
+ tran_low_t a2 = coeff[512];
+ tran_low_t a3 = coeff[768];
+
+ tran_low_t b0 = (a0 + a1) >> 2;
+ tran_low_t b1 = (a0 - a1) >> 2;
+ tran_low_t b2 = (a2 + a3) >> 2;
+ tran_low_t b3 = (a2 - a3) >> 2;
+
+ // new coeff dynamic range: 20 bit
+ coeff[0] = b0 + b2;
+ coeff[256] = b1 + b3;
+ coeff[512] = b0 - b2;
+ coeff[768] = b1 - b3;
+
+ ++coeff;
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// coeff: 20 bits, dynamic range [-524287, 524287].
+// length: value range {16, 32, 64, 128, 256, 512, 1024}.
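+// Illustrative usage (not part of the library API): the SATD of an 8x8 residual
+// block is typically obtained by forward transforming the block and summing the
+// absolute coefficients, e.g.
+//   tran_low_t hadamard_coeff[64];
+//   aom_hadamard_8x8_c(src_diff, src_stride, hadamard_coeff);
+//   int satd = aom_satd_c(hadamard_coeff, 64);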
+int aom_satd_c(const tran_low_t *coeff, int length) {
+ int i;
+ int satd = 0;
+ for (i = 0; i < length; ++i) satd += abs(coeff[i]);
+
+ // satd: 30 bits, dynamic range [-524287 * 1024, 524287 * 1024]
+ return satd;
+}
+
+int aom_satd_lp_c(const int16_t *coeff, int length) {
+ int satd = 0;
+ for (int i = 0; i < length; ++i) satd += abs(coeff[i]);
+
+ // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
+ return satd;
+}
+
+// Integer projection onto row vectors.
+// height: value range {16, 32, 64, 128}.
+void aom_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride,
+ const int width, const int height, int norm_factor) {
+ assert(height >= 2);
+ for (int idx = 0; idx < width; ++idx) {
+ hbuf[idx] = 0;
+ // hbuf[idx]: 14 bit, dynamic range [0, 32640].
+ for (int i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride];
+ // hbuf[idx]: 9 bit, dynamic range [0, 1020].
+ hbuf[idx] >>= norm_factor;
+ ++ref;
+ }
+}
+
+// width: value range {16, 32, 64, 128}.
+void aom_int_pro_col_c(int16_t *vbuf, const uint8_t *ref, const int ref_stride,
+ const int width, const int height, int norm_factor) {
+ for (int ht = 0; ht < height; ++ht) {
+ int16_t sum = 0;
+ // sum: 14 bit, dynamic range [0, 32640]
+ for (int idx = 0; idx < width; ++idx) sum += ref[idx];
+ vbuf[ht] = sum >> norm_factor;
+ ref += ref_stride;
+ }
+}
+
+// ref: [0 - 510]
+// src: [0 - 510]
+// bwl: {2, 3, 4, 5}
+int aom_vector_var_c(const int16_t *ref, const int16_t *src, int bwl) {
+ int i;
+ int width = 4 << bwl;
+ int sse = 0, mean = 0, var;
+
+ for (i = 0; i < width; ++i) {
+ int diff = ref[i] - src[i]; // diff: dynamic range [-510, 510], 10 bits.
+ mean += diff; // mean: dynamic range 16 bits.
+ sse += diff * diff; // sse: dynamic range 26 bits.
+ }
+
+ // (mean * mean): dynamic range 31 bits.
+ // If width == 128, the mean can be 510 * 128 = 65280, and log2(65280 ** 2) ~=
+  // 31.99, so it needs to be cast to unsigned int to compute its square.
+ const unsigned int mean_abs = abs(mean);
+ var = sse - ((mean_abs * mean_abs) >> (bwl + 2));
+ return var;
+}
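+
+// Illustrative usage (a rough sketch, not the encoder's actual motion search):
+// the projection helpers and aom_vector_var_c can be combined to compare two
+// 16x16 blocks by their row projections (bwl = 2, since width == 4 << bwl):
+//   int16_t src_proj[16], ref_proj[16];
+//   aom_int_pro_row_c(src_proj, src, src_stride, 16, 16, /*norm_factor=*/3);
+//   aom_int_pro_row_c(ref_proj, ref, ref_stride, 16, 16, /*norm_factor=*/3);
+//   int var = aom_vector_var_c(ref_proj, src_proj, 2);
+// The norm_factor shown is hypothetical; it should be chosen so the projected
+// sums land in the [0, 510] range assumed above.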
diff --git a/third_party/aom/aom_dsp/binary_codes_reader.c b/third_party/aom/aom_dsp/binary_codes_reader.c
new file mode 100644
index 0000000000..ee0ce62278
--- /dev/null
+++ b/third_party/aom/aom_dsp/binary_codes_reader.c
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/binary_codes_reader.h"
+#include "aom_dsp/recenter.h"
+
+uint16_t aom_read_primitive_quniform_(aom_reader *r,
+ uint16_t n ACCT_STR_PARAM) {
+ if (n <= 1) return 0;
+ const int l = get_msb(n) + 1;
+ const int m = (1 << l) - n;
+ const int v = aom_read_literal(r, l - 1, ACCT_STR_NAME);
+ return v < m ? v : (v << 1) - m + aom_read_bit(r, ACCT_STR_NAME);
+}
+
+// Decodes a finite subexponential code for a symbol v in [0, n-1] with
+// parameter k.
+uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n,
+ uint16_t k ACCT_STR_PARAM) {
+ int i = 0;
+ int mk = 0;
+
+ while (1) {
+ int b = (i ? k + i - 1 : k);
+ int a = (1 << b);
+
+ if (n <= mk + 3 * a) {
+ return aom_read_primitive_quniform(r, n - mk, ACCT_STR_NAME) + mk;
+ }
+
+ if (!aom_read_bit(r, ACCT_STR_NAME)) {
+ return aom_read_literal(r, b, ACCT_STR_NAME) + mk;
+ }
+
+ i = i + 1;
+ mk += a;
+ }
+
+ assert(0);
+ return 0;
+}
+
+uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k,
+ uint16_t ref ACCT_STR_PARAM) {
+ return inv_recenter_finite_nonneg(
+ n, ref, aom_read_primitive_subexpfin(r, n, k, ACCT_STR_NAME));
+}
diff --git a/third_party/aom/aom_dsp/binary_codes_reader.h b/third_party/aom/aom_dsp/binary_codes_reader.h
new file mode 100644
index 0000000000..d218f0619f
--- /dev/null
+++ b/third_party/aom/aom_dsp/binary_codes_reader.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BINARY_CODES_READER_H_
+#define AOM_AOM_DSP_BINARY_CODES_READER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/bitreader.h"
+#include "aom_dsp/bitreader_buffer.h"
+
+#define aom_read_primitive_quniform(r, n, ACCT_STR_NAME) \
+ aom_read_primitive_quniform_(r, n ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_primitive_subexpfin(r, n, k, ACCT_STR_NAME) \
+ aom_read_primitive_subexpfin_(r, n, k ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_primitive_refsubexpfin(r, n, k, ref, ACCT_STR_NAME) \
+ aom_read_primitive_refsubexpfin_(r, n, k, ref ACCT_STR_ARG(ACCT_STR_NAME))
+
+uint16_t aom_read_primitive_quniform_(aom_reader *r, uint16_t n ACCT_STR_PARAM);
+uint16_t aom_read_primitive_subexpfin_(aom_reader *r, uint16_t n,
+ uint16_t k ACCT_STR_PARAM);
+uint16_t aom_read_primitive_refsubexpfin_(aom_reader *r, uint16_t n, uint16_t k,
+ uint16_t ref ACCT_STR_PARAM);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_BINARY_CODES_READER_H_
diff --git a/third_party/aom/aom_dsp/binary_codes_writer.c b/third_party/aom/aom_dsp/binary_codes_writer.c
new file mode 100644
index 0000000000..55ce8429d7
--- /dev/null
+++ b/third_party/aom/aom_dsp/binary_codes_writer.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/bitwriter.h"
+#include "aom_dsp/binary_codes_writer.h"
+#include "aom_dsp/recenter.h"
+#include "aom_ports/bitops.h"
+
+// Codes a symbol v in [-2^mag_bits, 2^mag_bits].
+// mag_bits is the number of bits for the magnitude. The alphabet is of size
+// 2 * 2^mag_bits + 1, symmetric around 0, where one bit is used to
+// indicate 0 or non-zero, mag_bits bits are used to indicate the magnitude
+// and 1 more bit for the sign if non-zero.
+void aom_write_primitive_symmetric(aom_writer *w, int16_t v,
+ unsigned int abs_bits) {
+ if (v == 0) {
+ aom_write_bit(w, 0);
+ } else {
+ const int x = abs(v);
+ const int s = v < 0;
+ aom_write_bit(w, 1);
+ aom_write_bit(w, s);
+ aom_write_literal(w, x - 1, abs_bits);
+ }
+}
+
+int aom_count_primitive_symmetric(int16_t v, unsigned int abs_bits) {
+ return (v == 0 ? 1 : abs_bits + 2);
+}
+
+// Encodes a value v in [0, n-1] quasi-uniformly
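+// For example, with n = 5: l = 3 and m = 3, so v in {0, 1, 2} is written with
+// l - 1 = 2 bits, while v in {3, 4} is written as the 2-bit prefix 3 followed by
+// one extra bit.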
+void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v) {
+ if (n <= 1) return;
+ const int l = get_msb(n) + 1;
+ const int m = (1 << l) - n;
+ if (v < m) {
+ aom_write_literal(w, v, l - 1);
+ } else {
+ aom_write_literal(w, m + ((v - m) >> 1), l - 1);
+ aom_write_bit(w, (v - m) & 1);
+ }
+}
+
+int aom_count_primitive_quniform(uint16_t n, uint16_t v) {
+ if (n <= 1) return 0;
+ const int l = get_msb(n) + 1;
+ const int m = (1 << l) - n;
+ return v < m ? l - 1 : l;
+}
+
+// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
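+// For example, with n = 16 and k = 1 the symbols split into the groups {0, 1},
+// {2, 3} and {4, ..., 15}: the first two groups cost 2 and 3 bits respectively
+// (a unary-style prefix plus a 1-bit literal), and the last group falls back to
+// the quasi-uniform code over the remaining 12 values.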
+void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k,
+ uint16_t v) {
+ int i = 0;
+ int mk = 0;
+ while (1) {
+ int b = (i ? k + i - 1 : k);
+ int a = (1 << b);
+ if (n <= mk + 3 * a) {
+ aom_write_primitive_quniform(w, n - mk, v - mk);
+ break;
+ } else {
+ int t = (v >= mk + a);
+ aom_write_bit(w, t);
+ if (t) {
+ i = i + 1;
+ mk += a;
+ } else {
+ aom_write_literal(w, v - mk, b);
+ break;
+ }
+ }
+ }
+}
+
+int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v) {
+ int count = 0;
+ int i = 0;
+ int mk = 0;
+ while (1) {
+ int b = (i ? k + i - 1 : k);
+ int a = (1 << b);
+ if (n <= mk + 3 * a) {
+ count += aom_count_primitive_quniform(n - mk, v - mk);
+ break;
+ } else {
+ int t = (v >= mk + a);
+ count++;
+ if (t) {
+ i = i + 1;
+ mk += a;
+ } else {
+ count += b;
+ break;
+ }
+ }
+ }
+ return count;
+}
+
+// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
+// based on a reference ref also in [0, n-1].
+// Recenters the symbol around ref first and then uses a finite subexponential
+// code.
+void aom_write_primitive_refsubexpfin(aom_writer *w, uint16_t n, uint16_t k,
+ uint16_t ref, uint16_t v) {
+ aom_write_primitive_subexpfin(w, n, k, recenter_finite_nonneg(n, ref, v));
+}
+
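+// Codes a symbol v in [-(n-1), n-1] against a reference ref in the same range by
+// shifting both by n - 1 into [0, 2n - 2] and coding with alphabet size 2n - 1.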
+void aom_write_signed_primitive_refsubexpfin(aom_writer *w, uint16_t n,
+ uint16_t k, int16_t ref,
+ int16_t v) {
+ ref += n - 1;
+ v += n - 1;
+ const uint16_t scaled_n = (n << 1) - 1;
+ aom_write_primitive_refsubexpfin(w, scaled_n, k, ref, v);
+}
+
+int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref,
+ uint16_t v) {
+ return aom_count_primitive_subexpfin(n, k, recenter_finite_nonneg(n, ref, v));
+}
+
+int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref,
+ int16_t v) {
+ ref += n - 1;
+ v += n - 1;
+ const uint16_t scaled_n = (n << 1) - 1;
+ return aom_count_primitive_refsubexpfin(scaled_n, k, ref, v);
+}
diff --git a/third_party/aom/aom_dsp/binary_codes_writer.h b/third_party/aom/aom_dsp/binary_codes_writer.h
new file mode 100644
index 0000000000..5ec8662139
--- /dev/null
+++ b/third_party/aom/aom_dsp/binary_codes_writer.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BINARY_CODES_WRITER_H_
+#define AOM_AOM_DSP_BINARY_CODES_WRITER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <assert.h>
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/bitwriter.h"
+#include "aom_dsp/bitwriter_buffer.h"
+
+// Codes a symbol v in [-2^mag_bits, 2^mag_bits]
+// mag_bits is the number of bits for the magnitude. The alphabet is of size
+// 2 * 2^mag_bits + 1, symmetric around 0, where one bit is used to
+// indicate 0 or non-zero, mag_bits bits are used to indicate the magnitude
+// and 1 more bit for the sign if non-zero.
+void aom_write_primitive_symmetric(aom_writer *w, int16_t v,
+ unsigned int mag_bits);
+
+// Encodes a value v in [0, n-1] quasi-uniformly
+void aom_write_primitive_quniform(aom_writer *w, uint16_t n, uint16_t v);
+
+// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
+void aom_write_primitive_subexpfin(aom_writer *w, uint16_t n, uint16_t k,
+ uint16_t v);
+
+// Finite subexponential code that codes a symbol v in [0, n-1] with parameter k
+// based on a reference ref also in [0, n-1].
+void aom_write_primitive_refsubexpfin(aom_writer *w, uint16_t n, uint16_t k,
+ uint16_t ref, uint16_t v);
+
+// Finite subexponential code that codes a symbol v in [-(n-1), n-1] with
+// parameter k based on a reference ref also in [-(n-1), n-1].
+void aom_write_signed_primitive_refsubexpfin(aom_writer *w, uint16_t n,
+ uint16_t k, int16_t ref,
+ int16_t v);
+
+// Functions that count bits for the above primitives
+int aom_count_primitive_symmetric(int16_t v, unsigned int mag_bits);
+int aom_count_primitive_quniform(uint16_t n, uint16_t v);
+int aom_count_primitive_subexpfin(uint16_t n, uint16_t k, uint16_t v);
+int aom_count_primitive_refsubexpfin(uint16_t n, uint16_t k, uint16_t ref,
+ uint16_t v);
+int aom_count_signed_primitive_refsubexpfin(uint16_t n, uint16_t k, int16_t ref,
+ int16_t v);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_BINARY_CODES_WRITER_H_
diff --git a/third_party/aom/aom_dsp/bitreader.c b/third_party/aom/aom_dsp/bitreader.c
new file mode 100644
index 0000000000..4c70a91712
--- /dev/null
+++ b/third_party/aom/aom_dsp/bitreader.c
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/bitreader.h"
+
+int aom_reader_init(aom_reader *r, const uint8_t *buffer, size_t size) {
+ if (size && !buffer) {
+ return 1;
+ }
+ r->buffer_end = buffer + size;
+ r->buffer = buffer;
+ od_ec_dec_init(&r->ec, buffer, (uint32_t)size);
+#if CONFIG_ACCOUNTING
+ r->accounting = NULL;
+#endif
+ return 0;
+}
+
+const uint8_t *aom_reader_find_begin(aom_reader *r) { return r->buffer; }
+
+const uint8_t *aom_reader_find_end(aom_reader *r) { return r->buffer_end; }
+
+uint32_t aom_reader_tell(const aom_reader *r) { return od_ec_dec_tell(&r->ec); }
+
+uint32_t aom_reader_tell_frac(const aom_reader *r) {
+ return od_ec_dec_tell_frac(&r->ec);
+}
+
+int aom_reader_has_overflowed(const aom_reader *r) {
+ const uint32_t tell_bits = aom_reader_tell(r);
+ const uint32_t tell_bytes = (tell_bits + 7) >> 3;
+ return ((ptrdiff_t)tell_bytes > r->buffer_end - r->buffer);
+}
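+
+// Illustrative usage (a sketch, not taken from the decoder): initialize a reader
+// over a coded buffer, pull a raw bit, and check for over-read.
+//   aom_reader r;
+//   if (aom_reader_init(&r, buf, size)) { /* fail: NULL buffer, nonzero size */ }
+//   const int flag = aom_read_bit(&r, "example_flag");  // hypothetical label
+//   if (aom_reader_has_overflowed(&r)) { /* decoded past the provided data */ }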
diff --git a/third_party/aom/aom_dsp/bitreader.h b/third_party/aom/aom_dsp/bitreader.h
new file mode 100644
index 0000000000..29321f916e
--- /dev/null
+++ b/third_party/aom/aom_dsp/bitreader.h
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BITREADER_H_
+#define AOM_AOM_DSP_BITREADER_H_
+
+#include <assert.h>
+#include <limits.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aomdx.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/entdec.h"
+#include "aom_dsp/odintrin.h"
+#include "aom_dsp/prob.h"
+
+#if CONFIG_BITSTREAM_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG
+
+#if CONFIG_ACCOUNTING
+#include "av1/decoder/accounting.h"
+#define ACCT_STR_NAME acct_str
+#define ACCT_STR_PARAM , const char *ACCT_STR_NAME
+#define ACCT_STR_ARG(s) , s
+#else
+#define ACCT_STR_PARAM
+#define ACCT_STR_ARG(s)
+#endif
+
+#define aom_read(r, prob, ACCT_STR_NAME) \
+ aom_read_(r, prob ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_bit(r, ACCT_STR_NAME) \
+ aom_read_bit_(r ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_tree(r, tree, probs, ACCT_STR_NAME) \
+ aom_read_tree_(r, tree, probs ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_literal(r, bits, ACCT_STR_NAME) \
+ aom_read_literal_(r, bits ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME) \
+ aom_read_cdf_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
+#define aom_read_symbol(r, cdf, nsymbs, ACCT_STR_NAME) \
+ aom_read_symbol_(r, cdf, nsymbs ACCT_STR_ARG(ACCT_STR_NAME))
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct aom_reader {
+ const uint8_t *buffer;
+ const uint8_t *buffer_end;
+ od_ec_dec ec;
+#if CONFIG_ACCOUNTING
+ Accounting *accounting;
+#endif
+ uint8_t allow_update_cdf;
+};
+
+typedef struct aom_reader aom_reader;
+
+int aom_reader_init(aom_reader *r, const uint8_t *buffer, size_t size);
+
+const uint8_t *aom_reader_find_begin(aom_reader *r);
+
+const uint8_t *aom_reader_find_end(aom_reader *r);
+
+// Returns true if the bit reader has tried to decode more data from the buffer
+// than was actually provided.
+int aom_reader_has_overflowed(const aom_reader *r);
+
+// Returns the position in the bit reader in bits.
+uint32_t aom_reader_tell(const aom_reader *r);
+
+// Returns the position in the bit reader in 1/8th bits.
+uint32_t aom_reader_tell_frac(const aom_reader *r);
+
+#if CONFIG_ACCOUNTING
+static INLINE void aom_process_accounting(const aom_reader *r ACCT_STR_PARAM) {
+ if (r->accounting != NULL) {
+ uint32_t tell_frac;
+ tell_frac = aom_reader_tell_frac(r);
+ aom_accounting_record(r->accounting, ACCT_STR_NAME,
+ tell_frac - r->accounting->last_tell_frac);
+ r->accounting->last_tell_frac = tell_frac;
+ }
+}
+
+static INLINE void aom_update_symb_counts(const aom_reader *r, int is_binary) {
+ if (r->accounting != NULL) {
+ r->accounting->syms.num_multi_syms += !is_binary;
+ r->accounting->syms.num_binary_syms += !!is_binary;
+ }
+}
+#endif
+
+static INLINE int aom_read_(aom_reader *r, int prob ACCT_STR_PARAM) {
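+  // prob is an 8-bit probability where 128 represents one half; the expression
+  // below maps it into the Q15 domain used by od_ec (roughly 32768 - 128 * prob).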
+ int p = (0x7FFFFF - (prob << 15) + prob) >> 8;
+ int bit = od_ec_decode_bool_q15(&r->ec, p);
+
+#if CONFIG_BITSTREAM_DEBUG
+ {
+ int i;
+ int ref_bit, ref_nsymbs;
+ aom_cdf_prob ref_cdf[16];
+ const int queue_r = bitstream_queue_get_read();
+ const int frame_idx = aom_bitstream_queue_get_frame_read();
+ bitstream_queue_pop(&ref_bit, ref_cdf, &ref_nsymbs);
+ if (ref_nsymbs != 2) {
+ fprintf(stderr,
+ "\n *** [bit] nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs "
+ "%d queue_r %d\n",
+ frame_idx, 2, ref_nsymbs, queue_r);
+ assert(0);
+ }
+ if ((ref_nsymbs != 2) || (ref_cdf[0] != (aom_cdf_prob)p) ||
+ (ref_cdf[1] != 32767)) {
+ fprintf(stderr,
+ "\n *** [bit] cdf error, frame_idx_r %d cdf {%d, %d} ref_cdf {%d",
+ frame_idx, p, 32767, ref_cdf[0]);
+ for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
+ fprintf(stderr, "} queue_r %d\n", queue_r);
+ assert(0);
+ }
+ if (bit != ref_bit) {
+ fprintf(stderr,
+ "\n *** [bit] symb error, frame_idx_r %d symb %d ref_symb %d "
+ "queue_r %d\n",
+ frame_idx, bit, ref_bit, queue_r);
+ assert(0);
+ }
+ }
+#endif
+
+#if CONFIG_ACCOUNTING
+ if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
+ aom_update_symb_counts(r, 1);
+#endif
+ return bit;
+}
+
+static INLINE int aom_read_bit_(aom_reader *r ACCT_STR_PARAM) {
+ int ret;
+ ret = aom_read(r, 128, NULL); // aom_prob_half
+#if CONFIG_ACCOUNTING
+ if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
+#endif
+ return ret;
+}
+
+static INLINE int aom_read_literal_(aom_reader *r, int bits ACCT_STR_PARAM) {
+ int literal = 0, bit;
+
+ for (bit = bits - 1; bit >= 0; bit--) literal |= aom_read_bit(r, NULL) << bit;
+#if CONFIG_ACCOUNTING
+ if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
+#endif
+ return literal;
+}
+
+static INLINE int aom_read_cdf_(aom_reader *r, const aom_cdf_prob *cdf,
+ int nsymbs ACCT_STR_PARAM) {
+ int symb;
+ assert(cdf != NULL);
+ symb = od_ec_decode_cdf_q15(&r->ec, cdf, nsymbs);
+
+#if CONFIG_BITSTREAM_DEBUG
+ {
+ int i;
+ int cdf_error = 0;
+ int ref_symb, ref_nsymbs;
+ aom_cdf_prob ref_cdf[16];
+ const int queue_r = bitstream_queue_get_read();
+ const int frame_idx = aom_bitstream_queue_get_frame_read();
+ bitstream_queue_pop(&ref_symb, ref_cdf, &ref_nsymbs);
+ if (nsymbs != ref_nsymbs) {
+ fprintf(stderr,
+ "\n *** nsymbs error, frame_idx_r %d nsymbs %d ref_nsymbs %d "
+ "queue_r %d\n",
+ frame_idx, nsymbs, ref_nsymbs, queue_r);
+ cdf_error = 0;
+ assert(0);
+ } else {
+ for (i = 0; i < nsymbs; ++i)
+ if (cdf[i] != ref_cdf[i]) cdf_error = 1;
+ }
+ if (cdf_error) {
+ fprintf(stderr, "\n *** cdf error, frame_idx_r %d cdf {%d", frame_idx,
+ cdf[0]);
+ for (i = 1; i < nsymbs; ++i) fprintf(stderr, ", %d", cdf[i]);
+ fprintf(stderr, "} ref_cdf {%d", ref_cdf[0]);
+ for (i = 1; i < ref_nsymbs; ++i) fprintf(stderr, ", %d", ref_cdf[i]);
+ fprintf(stderr, "} queue_r %d\n", queue_r);
+ assert(0);
+ }
+ if (symb != ref_symb) {
+ fprintf(
+ stderr,
+ "\n *** symb error, frame_idx_r %d symb %d ref_symb %d queue_r %d\n",
+ frame_idx, symb, ref_symb, queue_r);
+ assert(0);
+ }
+ }
+#endif
+
+#if CONFIG_ACCOUNTING
+ if (ACCT_STR_NAME) aom_process_accounting(r, ACCT_STR_NAME);
+ aom_update_symb_counts(r, (nsymbs == 2));
+#endif
+ return symb;
+}
+
+static INLINE int aom_read_symbol_(aom_reader *r, aom_cdf_prob *cdf,
+ int nsymbs ACCT_STR_PARAM) {
+ int ret;
+ ret = aom_read_cdf(r, cdf, nsymbs, ACCT_STR_NAME);
+ if (r->allow_update_cdf) update_cdf(cdf, ret, nsymbs);
+ return ret;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_BITREADER_H_
diff --git a/third_party/aom/aom_dsp/bitreader_buffer.c b/third_party/aom/aom_dsp/bitreader_buffer.c
new file mode 100644
index 0000000000..d79feea6a3
--- /dev/null
+++ b/third_party/aom/aom_dsp/bitreader_buffer.c
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/bitreader_buffer.h"
+#include "aom_dsp/recenter.h"
+#include "aom_ports/bitops.h"
+
+size_t aom_rb_bytes_read(const struct aom_read_bit_buffer *rb) {
+ return (rb->bit_offset + 7) >> 3;
+}
+
+int aom_rb_read_bit(struct aom_read_bit_buffer *rb) {
+ const uint32_t off = rb->bit_offset;
+ const uint32_t p = off >> 3;
+ const int q = 7 - (int)(off & 0x7);
+ if (rb->bit_buffer + p < rb->bit_buffer_end) {
+ const int bit = (rb->bit_buffer[p] >> q) & 1;
+ rb->bit_offset = off + 1;
+ return bit;
+ } else {
+ if (rb->error_handler) rb->error_handler(rb->error_handler_data);
+ return 0;
+ }
+}
+
+int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits) {
+ assert(bits <= 31);
+ int value = 0, bit;
+ for (bit = bits - 1; bit >= 0; bit--) value |= aom_rb_read_bit(rb) << bit;
+ return value;
+}
+
+uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb,
+ int bits) {
+ assert(bits <= 32);
+ uint32_t value = 0;
+ int bit;
+ for (bit = bits - 1; bit >= 0; bit--)
+ value |= (uint32_t)aom_rb_read_bit(rb) << bit;
+ return value;
+}
+
+int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits) {
+ const int nbits = sizeof(unsigned) * 8 - bits - 1;
+ const unsigned value = (unsigned)aom_rb_read_literal(rb, bits + 1) << nbits;
+ return ((int)value) >> nbits;
+}
+
+uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb) {
+ int leading_zeros = 0;
+ while (leading_zeros < 32 && !aom_rb_read_bit(rb)) ++leading_zeros;
+ // Maximum 32 bits.
+ if (leading_zeros == 32) return UINT32_MAX;
+ const uint32_t base = (1u << leading_zeros) - 1;
+ const uint32_t value = aom_rb_read_literal(rb, leading_zeros);
+ return base + value;
+}
+
+static uint16_t aom_rb_read_primitive_quniform(struct aom_read_bit_buffer *rb,
+ uint16_t n) {
+ if (n <= 1) return 0;
+ const int l = get_msb(n) + 1;
+ const int m = (1 << l) - n;
+ const int v = aom_rb_read_literal(rb, l - 1);
+ return v < m ? v : (v << 1) - m + aom_rb_read_bit(rb);
+}
+
+static uint16_t aom_rb_read_primitive_subexpfin(struct aom_read_bit_buffer *rb,
+ uint16_t n, uint16_t k) {
+ int i = 0;
+ int mk = 0;
+
+ while (1) {
+ int b = (i ? k + i - 1 : k);
+ int a = (1 << b);
+
+ if (n <= mk + 3 * a) {
+ return aom_rb_read_primitive_quniform(rb, n - mk) + mk;
+ }
+
+ if (!aom_rb_read_bit(rb)) {
+ return aom_rb_read_literal(rb, b) + mk;
+ }
+
+ i = i + 1;
+ mk += a;
+ }
+
+ assert(0);
+ return 0;
+}
+
+static uint16_t aom_rb_read_primitive_refsubexpfin(
+ struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, uint16_t ref) {
+ return inv_recenter_finite_nonneg(n, ref,
+ aom_rb_read_primitive_subexpfin(rb, n, k));
+}
+
+int16_t aom_rb_read_signed_primitive_refsubexpfin(
+ struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref) {
+ ref += n - 1;
+ const uint16_t scaled_n = (n << 1) - 1;
+ return aom_rb_read_primitive_refsubexpfin(rb, scaled_n, k, ref) - n + 1;
+}
diff --git a/third_party/aom/aom_dsp/bitreader_buffer.h b/third_party/aom/aom_dsp/bitreader_buffer.h
new file mode 100644
index 0000000000..359fbe5194
--- /dev/null
+++ b/third_party/aom/aom_dsp/bitreader_buffer.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BITREADER_BUFFER_H_
+#define AOM_AOM_DSP_BITREADER_BUFFER_H_
+
+#include <limits.h>
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*aom_rb_error_handler)(void *data);
+
+struct aom_read_bit_buffer {
+ const uint8_t *bit_buffer;
+ const uint8_t *bit_buffer_end;
+ uint32_t bit_offset;
+
+ void *error_handler_data;
+ aom_rb_error_handler error_handler;
+};
+
+size_t aom_rb_bytes_read(const struct aom_read_bit_buffer *rb);
+
+int aom_rb_read_bit(struct aom_read_bit_buffer *rb);
+
+int aom_rb_read_literal(struct aom_read_bit_buffer *rb, int bits);
+
+uint32_t aom_rb_read_unsigned_literal(struct aom_read_bit_buffer *rb, int bits);
+
+int aom_rb_read_inv_signed_literal(struct aom_read_bit_buffer *rb, int bits);
+
+uint32_t aom_rb_read_uvlc(struct aom_read_bit_buffer *rb);
+
+int16_t aom_rb_read_signed_primitive_refsubexpfin(
+ struct aom_read_bit_buffer *rb, uint16_t n, uint16_t k, int16_t ref);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_BITREADER_BUFFER_H_
diff --git a/third_party/aom/aom_dsp/bitwriter.c b/third_party/aom/aom_dsp/bitwriter.c
new file mode 100644
index 0000000000..4c27bb1fc3
--- /dev/null
+++ b/third_party/aom/aom_dsp/bitwriter.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <string.h>
+#include "aom_dsp/bitwriter.h"
+
+void aom_start_encode(aom_writer *w, uint8_t *source) {
+ w->buffer = source;
+ w->pos = 0;
+ od_ec_enc_init(&w->ec, 62025);
+}
+
+int aom_stop_encode(aom_writer *w) {
+ int nb_bits;
+ uint32_t bytes;
+ unsigned char *data;
+ data = od_ec_enc_done(&w->ec, &bytes);
+ if (!data) {
+ od_ec_enc_clear(&w->ec);
+ return -1;
+ }
+ nb_bits = od_ec_enc_tell(&w->ec);
+ memcpy(w->buffer, data, bytes);
+ w->pos = bytes;
+ od_ec_enc_clear(&w->ec);
+ return nb_bits;
+}
+
+int aom_tell_size(aom_writer *w) {
+ const int nb_bits = od_ec_enc_tell(&w->ec);
+ return nb_bits;
+}
diff --git a/third_party/aom/aom_dsp/bitwriter.h b/third_party/aom/aom_dsp/bitwriter.h
new file mode 100644
index 0000000000..6aedd8ceb9
--- /dev/null
+++ b/third_party/aom/aom_dsp/bitwriter.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BITWRITER_H_
+#define AOM_AOM_DSP_BITWRITER_H_
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/entenc.h"
+#include "aom_dsp/prob.h"
+
+#if CONFIG_RD_DEBUG
+#include "av1/common/blockd.h"
+#include "av1/encoder/cost.h"
+#endif
+
+#if CONFIG_BITSTREAM_DEBUG
+#include "aom_util/debug_util.h"
+#endif // CONFIG_BITSTREAM_DEBUG
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct aom_writer {
+ unsigned int pos;
+ uint8_t *buffer;
+ od_ec_enc ec;
+ uint8_t allow_update_cdf;
+};
+
+typedef struct aom_writer aom_writer;
+
+typedef struct TOKEN_STATS {
+ int cost;
+#if CONFIG_RD_DEBUG
+ int txb_coeff_cost_map[TXB_COEFF_COST_MAP_SIZE][TXB_COEFF_COST_MAP_SIZE];
+#endif
+} TOKEN_STATS;
+
+static INLINE void init_token_stats(TOKEN_STATS *token_stats) {
+#if CONFIG_RD_DEBUG
+ int r, c;
+ for (r = 0; r < TXB_COEFF_COST_MAP_SIZE; ++r) {
+ for (c = 0; c < TXB_COEFF_COST_MAP_SIZE; ++c) {
+ token_stats->txb_coeff_cost_map[r][c] = 0;
+ }
+ }
+#endif
+ token_stats->cost = 0;
+}
+
+void aom_start_encode(aom_writer *w, uint8_t *buffer);
+
+// Returns a negative number on error. Caller must check the return value and
+// handle the error.
+int aom_stop_encode(aom_writer *w);
+
+int aom_tell_size(aom_writer *w);
+
+static INLINE void aom_write(aom_writer *w, int bit, int probability) {
+ int p = (0x7FFFFF - (probability << 15) + probability) >> 8;
+#if CONFIG_BITSTREAM_DEBUG
+ aom_cdf_prob cdf[2] = { (aom_cdf_prob)p, 32767 };
+ bitstream_queue_push(bit, cdf, 2);
+#endif
+
+ od_ec_encode_bool_q15(&w->ec, bit, p);
+}
+
+static INLINE void aom_write_bit(aom_writer *w, int bit) {
+ aom_write(w, bit, 128); // aom_prob_half
+}
+
+static INLINE void aom_write_literal(aom_writer *w, int data, int bits) {
+ int bit;
+
+ for (bit = bits - 1; bit >= 0; bit--) aom_write_bit(w, 1 & (data >> bit));
+}
+
+static INLINE void aom_write_cdf(aom_writer *w, int symb,
+ const aom_cdf_prob *cdf, int nsymbs) {
+#if CONFIG_BITSTREAM_DEBUG
+ bitstream_queue_push(symb, cdf, nsymbs);
+#endif
+
+ od_ec_encode_cdf_q15(&w->ec, symb, cdf, nsymbs);
+}
+
+static INLINE void aom_write_symbol(aom_writer *w, int symb, aom_cdf_prob *cdf,
+ int nsymbs) {
+ aom_write_cdf(w, symb, cdf, nsymbs);
+ if (w->allow_update_cdf) update_cdf(cdf, symb, nsymbs);
+}
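+
+// Illustrative usage (a sketch, not taken from the encoder): write a few raw
+// bits into a caller-provided buffer.
+//   uint8_t buf[1024];  // hypothetical buffer, large enough for the coded data
+//   aom_writer w;
+//   w.allow_update_cdf = 0;
+//   aom_start_encode(&w, buf);
+//   aom_write_bit(&w, 1);
+//   aom_write_literal(&w, 5, 3);
+//   const int nb_bits = aom_stop_encode(&w);  // negative on error; w.pos = bytes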
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_BITWRITER_H_
diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.c b/third_party/aom/aom_dsp/bitwriter_buffer.c
new file mode 100644
index 0000000000..7d0ab9486a
--- /dev/null
+++ b/third_party/aom/aom_dsp/bitwriter_buffer.c
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <limits.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/bitwriter_buffer.h"
+#include "aom_dsp/recenter.h"
+#include "aom_ports/bitops.h"
+
+int aom_wb_is_byte_aligned(const struct aom_write_bit_buffer *wb) {
+ return (wb->bit_offset % CHAR_BIT == 0);
+}
+
+uint32_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb) {
+ return wb->bit_offset / CHAR_BIT + (wb->bit_offset % CHAR_BIT > 0);
+}
+
+void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit) {
+ const int off = (int)wb->bit_offset;
+ const int p = off / CHAR_BIT;
+ const int q = CHAR_BIT - 1 - off % CHAR_BIT;
+ if (q == CHAR_BIT - 1) {
+ // Zero next char and write bit
+ wb->bit_buffer[p] = bit << q;
+ } else {
+ wb->bit_buffer[p] &= ~(1 << q);
+ wb->bit_buffer[p] |= bit << q;
+ }
+ wb->bit_offset = off + 1;
+}
+
+void aom_wb_overwrite_bit(struct aom_write_bit_buffer *wb, int bit) {
+  // Do not zero bytes but overwrite existing values
+ const int off = (int)wb->bit_offset;
+ const int p = off / CHAR_BIT;
+ const int q = CHAR_BIT - 1 - off % CHAR_BIT;
+ wb->bit_buffer[p] &= ~(1 << q);
+ wb->bit_buffer[p] |= bit << q;
+ wb->bit_offset = off + 1;
+}
+
+void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits) {
+ assert(bits <= 31);
+ int bit;
+ for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1);
+}
+
+void aom_wb_write_unsigned_literal(struct aom_write_bit_buffer *wb,
+ uint32_t data, int bits) {
+ assert(bits <= 32);
+ int bit;
+ for (bit = bits - 1; bit >= 0; bit--) aom_wb_write_bit(wb, (data >> bit) & 1);
+}
+
+void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data,
+ int bits) {
+ int bit;
+ for (bit = bits - 1; bit >= 0; bit--)
+ aom_wb_overwrite_bit(wb, (data >> bit) & 1);
+}
+
+void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data,
+ int bits) {
+ aom_wb_write_literal(wb, data, bits + 1);
+}
+
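+// Writes v with an unsigned variable-length code (Exp-Golomb style): a run of
+// leading zeros, a marker 1, then the remainder. For example, v = 3 is written
+// as the five bits 0 0 1 0 0, which the reader recovers as ((1 << 2) - 1) + 0.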
+void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v) {
+ int64_t shift_val = ++v;
+ int leading_zeroes = 1;
+
+ assert(shift_val > 0);
+
+ while (shift_val >>= 1) leading_zeroes += 2;
+
+ aom_wb_write_literal(wb, 0, leading_zeroes >> 1);
+ aom_wb_write_unsigned_literal(wb, v, (leading_zeroes + 1) >> 1);
+}
+
+static void wb_write_primitive_quniform(struct aom_write_bit_buffer *wb,
+ uint16_t n, uint16_t v) {
+ if (n <= 1) return;
+ const int l = get_msb(n) + 1;
+ const int m = (1 << l) - n;
+ if (v < m) {
+ aom_wb_write_literal(wb, v, l - 1);
+ } else {
+ aom_wb_write_literal(wb, m + ((v - m) >> 1), l - 1);
+ aom_wb_write_bit(wb, (v - m) & 1);
+ }
+}
+
+static void wb_write_primitive_subexpfin(struct aom_write_bit_buffer *wb,
+ uint16_t n, uint16_t k, uint16_t v) {
+ int i = 0;
+ int mk = 0;
+ while (1) {
+ int b = (i ? k + i - 1 : k);
+ int a = (1 << b);
+ if (n <= mk + 3 * a) {
+ wb_write_primitive_quniform(wb, n - mk, v - mk);
+ break;
+ } else {
+ int t = (v >= mk + a);
+ aom_wb_write_bit(wb, t);
+ if (t) {
+ i = i + 1;
+ mk += a;
+ } else {
+ aom_wb_write_literal(wb, v - mk, b);
+ break;
+ }
+ }
+ }
+}
+
+static void wb_write_primitive_refsubexpfin(struct aom_write_bit_buffer *wb,
+ uint16_t n, uint16_t k,
+ uint16_t ref, uint16_t v) {
+ wb_write_primitive_subexpfin(wb, n, k, recenter_finite_nonneg(n, ref, v));
+}
+
+void aom_wb_write_signed_primitive_refsubexpfin(struct aom_write_bit_buffer *wb,
+ uint16_t n, uint16_t k,
+ int16_t ref, int16_t v) {
+ ref += n - 1;
+ v += n - 1;
+ const uint16_t scaled_n = (n << 1) - 1;
+ wb_write_primitive_refsubexpfin(wb, scaled_n, k, ref, v);
+}
diff --git a/third_party/aom/aom_dsp/bitwriter_buffer.h b/third_party/aom/aom_dsp/bitwriter_buffer.h
new file mode 100644
index 0000000000..fd10e01bb7
--- /dev/null
+++ b/third_party/aom/aom_dsp/bitwriter_buffer.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BITWRITER_BUFFER_H_
+#define AOM_AOM_DSP_BITWRITER_BUFFER_H_
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct aom_write_bit_buffer {
+ uint8_t *bit_buffer;
+ uint32_t bit_offset;
+};
+
+int aom_wb_is_byte_aligned(const struct aom_write_bit_buffer *wb);
+
+uint32_t aom_wb_bytes_written(const struct aom_write_bit_buffer *wb);
+
+void aom_wb_write_bit(struct aom_write_bit_buffer *wb, int bit);
+
+void aom_wb_overwrite_bit(struct aom_write_bit_buffer *wb, int bit);
+
+void aom_wb_write_literal(struct aom_write_bit_buffer *wb, int data, int bits);
+
+void aom_wb_write_unsigned_literal(struct aom_write_bit_buffer *wb,
+ uint32_t data, int bits);
+
+void aom_wb_overwrite_literal(struct aom_write_bit_buffer *wb, int data,
+ int bits);
+
+void aom_wb_write_inv_signed_literal(struct aom_write_bit_buffer *wb, int data,
+ int bits);
+
+void aom_wb_write_uvlc(struct aom_write_bit_buffer *wb, uint32_t v);
+
+void aom_wb_write_signed_primitive_refsubexpfin(struct aom_write_bit_buffer *wb,
+ uint16_t n, uint16_t k,
+ int16_t ref, int16_t v);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_BITWRITER_BUFFER_H_
diff --git a/third_party/aom/aom_dsp/blend.h b/third_party/aom/aom_dsp/blend.h
new file mode 100644
index 0000000000..fd87dc1810
--- /dev/null
+++ b/third_party/aom/aom_dsp/blend.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BLEND_H_
+#define AOM_AOM_DSP_BLEND_H_
+
+#include "aom_ports/mem.h"
+
+// Various blending functions and macros.
+// See also the aom_blend_* functions in aom_dsp_rtcd.h
+
+// Alpha blending with alpha values from the range [0, 64], where 64
+// means use the first input and 0 means use the second input.
+
+#define AOM_BLEND_A64_ROUND_BITS 6
+#define AOM_BLEND_A64_MAX_ALPHA (1 << AOM_BLEND_A64_ROUND_BITS) // 64
+
+#define AOM_BLEND_A64(a, v0, v1) \
+ ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
+ AOM_BLEND_A64_ROUND_BITS)
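+// For example, AOM_BLEND_A64(64, v0, v1) == v0, AOM_BLEND_A64(0, v0, v1) == v1,
+// and AOM_BLEND_A64(32, v0, v1) is the rounded average (v0 + v1 + 1) >> 1.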
+
+// Alpha blending with alpha values from the range [0, 256], where 256
+// means use the first input and 0 means use the second input.
+#define AOM_BLEND_A256_ROUND_BITS 8
+#define AOM_BLEND_A256_MAX_ALPHA (1 << AOM_BLEND_A256_ROUND_BITS) // 256
+
+#define AOM_BLEND_A256(a, v0, v1) \
+ ROUND_POWER_OF_TWO((a) * (v0) + (AOM_BLEND_A256_MAX_ALPHA - (a)) * (v1), \
+ AOM_BLEND_A256_ROUND_BITS)
+
+// Blending by averaging.
+#define AOM_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1)
+
+#define DIFF_FACTOR_LOG2 4
+#define DIFF_FACTOR (1 << DIFF_FACTOR_LOG2)
+
+#endif // AOM_AOM_DSP_BLEND_H_
diff --git a/third_party/aom/aom_dsp/blend_a64_hmask.c b/third_party/aom/aom_dsp/blend_a64_hmask.c
new file mode 100644
index 0000000000..e9e38ef969
--- /dev/null
+++ b/third_party/aom/aom_dsp/blend_a64_hmask.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+void aom_blend_a64_hmask_c(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ int i, j;
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ dst[i * dst_stride + j] = AOM_BLEND_A64(
+ mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]);
+ }
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_blend_a64_hmask_c(uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8, uint32_t src0_stride,
+ const uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h, int bd) {
+ int i, j;
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+ const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+ (void)bd;
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ dst[i * dst_stride + j] = AOM_BLEND_A64(
+ mask[j], src0[i * src0_stride + j], src1[i * src1_stride + j]);
+ }
+ }
+}
+#endif
diff --git a/third_party/aom/aom_dsp/blend_a64_mask.c b/third_party/aom/aom_dsp/blend_a64_mask.c
new file mode 100644
index 0000000000..35017fd737
--- /dev/null
+++ b/third_party/aom/aom_dsp/blend_a64_mask.c
@@ -0,0 +1,349 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+// Blending with alpha mask. Mask values come from the range [0, 64],
+// as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can
+// be the same as dst, or dst can be different from both sources.
+
+// NOTE(rachelbarker): The input and output of aom_blend_a64_d16_mask_c() are
+// in a higher intermediate precision, and will later be rounded down to pixel
+// precision.
+// Thus, in order to avoid double-rounding, we want to use normal right shifts
+// within this function, not ROUND_POWER_OF_TWO.
+// This works because of the identity:
+// ROUND_POWER_OF_TWO(x >> y, z) == ROUND_POWER_OF_TWO(x, y+z)
+//
+// In contrast, the output of the non-d16 functions will not be further rounded,
+// so we *should* use ROUND_POWER_OF_TWO there.
+
+void aom_lowbd_blend_a64_d16_mask_c(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+ ConvolveParams *conv_params) {
+ int i, j;
+ const int bd = 8;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+
+ assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 4);
+ assert(w >= 4);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ if (subw == 0 && subh == 0) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ int32_t res;
+ const int m = mask[i * mask_stride + j];
+ res = ((m * (int32_t)src0[i * src0_stride + j] +
+ (AOM_BLEND_A64_MAX_ALPHA - m) *
+ (int32_t)src1[i * src1_stride + j]) >>
+ AOM_BLEND_A64_ROUND_BITS);
+ res -= round_offset;
+ dst[i * dst_stride + j] =
+ clip_pixel(ROUND_POWER_OF_TWO(res, round_bits));
+ }
+ }
+ } else if (subw == 1 && subh == 1) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ int32_t res;
+ const int m = ROUND_POWER_OF_TWO(
+ mask[(2 * i) * mask_stride + (2 * j)] +
+ mask[(2 * i + 1) * mask_stride + (2 * j)] +
+ mask[(2 * i) * mask_stride + (2 * j + 1)] +
+ mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
+ 2);
+ res = ((m * (int32_t)src0[i * src0_stride + j] +
+ (AOM_BLEND_A64_MAX_ALPHA - m) *
+ (int32_t)src1[i * src1_stride + j]) >>
+ AOM_BLEND_A64_ROUND_BITS);
+ res -= round_offset;
+ dst[i * dst_stride + j] =
+ clip_pixel(ROUND_POWER_OF_TWO(res, round_bits));
+ }
+ }
+ } else if (subw == 1 && subh == 0) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ int32_t res;
+ const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)],
+ mask[i * mask_stride + (2 * j + 1)]);
+ res = ((m * (int32_t)src0[i * src0_stride + j] +
+ (AOM_BLEND_A64_MAX_ALPHA - m) *
+ (int32_t)src1[i * src1_stride + j]) >>
+ AOM_BLEND_A64_ROUND_BITS);
+ res -= round_offset;
+ dst[i * dst_stride + j] =
+ clip_pixel(ROUND_POWER_OF_TWO(res, round_bits));
+ }
+ }
+ } else {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ int32_t res;
+ const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j],
+ mask[(2 * i + 1) * mask_stride + j]);
+ res = ((int32_t)(m * (int32_t)src0[i * src0_stride + j] +
+ (AOM_BLEND_A64_MAX_ALPHA - m) *
+ (int32_t)src1[i * src1_stride + j]) >>
+ AOM_BLEND_A64_ROUND_BITS);
+ res -= round_offset;
+ dst[i * dst_stride + j] =
+ clip_pixel(ROUND_POWER_OF_TWO(res, round_bits));
+ }
+ }
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_blend_a64_d16_mask_c(
+ uint8_t *dst_8, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+ ConvolveParams *conv_params, const int bd) {
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int round_offset = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ // excerpt from clip_pixel_highbd()
+ // set saturation_value to (1 << bd) - 1
+ unsigned int saturation_value;
+ switch (bd) {
+ case 8:
+ default: saturation_value = 255; break;
+ case 10: saturation_value = 1023; break;
+ case 12: saturation_value = 4095; break;
+ }
+
+ if (subw == 0 && subh == 0) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ int32_t res;
+ const int m = mask[j];
+ res = ((m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >>
+ AOM_BLEND_A64_ROUND_BITS);
+ res -= round_offset;
+ unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits));
+ dst[j] = AOMMIN(v, saturation_value);
+ }
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ }
+ } else if (subw == 1 && subh == 1) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ int32_t res;
+ const int m = ROUND_POWER_OF_TWO(
+ mask[2 * j] + mask[mask_stride + 2 * j] + mask[2 * j + 1] +
+ mask[mask_stride + 2 * j + 1],
+ 2);
+ res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >>
+ AOM_BLEND_A64_ROUND_BITS;
+ res -= round_offset;
+ unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits));
+ dst[j] = AOMMIN(v, saturation_value);
+ }
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ }
+ } else if (subw == 1 && subh == 0) {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ int32_t res;
+ const int m = AOM_BLEND_AVG(mask[2 * j], mask[2 * j + 1]);
+ res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >>
+ AOM_BLEND_A64_ROUND_BITS;
+ res -= round_offset;
+ unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits));
+ dst[j] = AOMMIN(v, saturation_value);
+ }
+ mask += mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ }
+ } else {
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ int32_t res;
+ const int m = AOM_BLEND_AVG(mask[j], mask[mask_stride + j]);
+ res = (m * src0[j] + (AOM_BLEND_A64_MAX_ALPHA - m) * src1[j]) >>
+ AOM_BLEND_A64_ROUND_BITS;
+ res -= round_offset;
+ unsigned int v = negative_to_zero(ROUND_POWER_OF_TWO(res, round_bits));
+ dst[j] = AOMMIN(v, saturation_value);
+ }
+ mask += 2 * mask_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ dst += dst_stride;
+ }
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+// Blending with alpha mask. Mask values come from the range [0, 64],
+// as described for AOM_BLEND_A64 in aom_dsp/blend.h. src0 or src1 can
+// be the same as dst, or dst can be different from both sources.
+
+void aom_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w,
+ int h, int subw, int subh) {
+ int i, j;
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ if (subw == 0 && subh == 0) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = mask[i * mask_stride + j];
+ dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ } else if (subw == 1 && subh == 1) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = ROUND_POWER_OF_TWO(
+ mask[(2 * i) * mask_stride + (2 * j)] +
+ mask[(2 * i + 1) * mask_stride + (2 * j)] +
+ mask[(2 * i) * mask_stride + (2 * j + 1)] +
+ mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
+ 2);
+ dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ } else if (subw == 1 && subh == 0) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)],
+ mask[i * mask_stride + (2 * j + 1)]);
+ dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ } else {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j],
+ mask[(2 * i + 1) * mask_stride + j]);
+ dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8, uint32_t src0_stride,
+ const uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int w, int h, int subw, int subh, int bd) {
+ int i, j;
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+ const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+ (void)bd;
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ if (subw == 0 && subh == 0) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = mask[i * mask_stride + j];
+ dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ } else if (subw == 1 && subh == 1) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = ROUND_POWER_OF_TWO(
+ mask[(2 * i) * mask_stride + (2 * j)] +
+ mask[(2 * i + 1) * mask_stride + (2 * j)] +
+ mask[(2 * i) * mask_stride + (2 * j + 1)] +
+ mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
+ 2);
+ dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ } else if (subw == 1 && subh == 0) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = AOM_BLEND_AVG(mask[i * mask_stride + (2 * j)],
+ mask[i * mask_stride + (2 * j + 1)]);
+ dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ } else {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = AOM_BLEND_AVG(mask[(2 * i) * mask_stride + j],
+ mask[(2 * i + 1) * mask_stride + j]);
+ dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
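[Review note, editorial sketch, not part of the upstream diff: a minimal illustration of the per-pixel arithmetic behind AOM_BLEND_A64, assuming the macro in aom_dsp/blend.h is the rounded 6-bit weighted average with AOM_BLEND_A64_MAX_ALPHA = 64; the helper name below is hypothetical.]

    #include <assert.h>
    #include <stdint.h>

    /* Mirrors the assumed AOM_BLEND_A64(m, v0, v1) arithmetic:
     * a weighted average with 6 fractional bits and rounding. */
    static uint8_t blend_a64_example(int m, uint8_t v0, uint8_t v1) {
      return (uint8_t)((m * v0 + (64 - m) * v1 + 32) >> 6);
    }

    static void blend_check(void) {
      /* m = 48 weights src0 at 48/64 and src1 at 16/64. */
      assert(blend_a64_example(48, 100, 200) == 125);
    }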
diff --git a/third_party/aom/aom_dsp/blend_a64_vmask.c b/third_party/aom/aom_dsp/blend_a64_vmask.c
new file mode 100644
index 0000000000..c938bb33af
--- /dev/null
+++ b/third_party/aom/aom_dsp/blend_a64_vmask.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+void aom_blend_a64_vmask_c(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ int i, j;
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ for (i = 0; i < h; ++i) {
+ const int m = mask[i];
+ for (j = 0; j < w; ++j) {
+ dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_blend_a64_vmask_c(uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8, uint32_t src0_stride,
+ const uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h, int bd) {
+ int i, j;
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+ const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+ (void)bd;
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ for (i = 0; i < h; ++i) {
+ const int m = mask[i];
+ for (j = 0; j < w; ++j) {
+ dst[i * dst_stride + j] = AOM_BLEND_A64(m, src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+}
+#endif
diff --git a/third_party/aom/aom_dsp/blk_sse_sum.c b/third_party/aom/aom_dsp/blk_sse_sum.c
new file mode 100644
index 0000000000..d76c3f87b9
--- /dev/null
+++ b/third_party/aom/aom_dsp/blk_sse_sum.c
@@ -0,0 +1,26 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_dsp_rtcd.h"
+
+void aom_get_blk_sse_sum_c(const int16_t *data, int stride, int bw, int bh,
+ int *x_sum, int64_t *x2_sum) {
+ *x_sum = 0;
+ *x2_sum = 0;
+ for (int i = 0; i < bh; ++i) {
+ for (int j = 0; j < bw; ++j) {
+ const int val = data[j];
+ *x_sum += val;
+ *x2_sum += val * val;
+ }
+ data += stride;
+ }
+}
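[Review note, editorial sketch, not part of the upstream diff: a small usage example for aom_get_blk_sse_sum_c(), computing the sum and sum of squares of a 2x2 block of residuals.]

    #include <assert.h>
    #include <stdint.h>
    #include "config/aom_dsp_rtcd.h"

    static void blk_sse_sum_example(void) {
      const int16_t data[4] = { 1, -2, 3, -4 }; /* 2x2 block, stride 2 */
      int sum;
      int64_t sse;
      aom_get_blk_sse_sum_c(data, /*stride=*/2, /*bw=*/2, /*bh=*/2, &sum, &sse);
      assert(sum == -2 && sse == 30); /* 1-2+3-4 = -2; 1+4+9+16 = 30 */
    }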
diff --git a/third_party/aom/aom_dsp/butteraugli.c b/third_party/aom/aom_dsp/butteraugli.c
new file mode 100644
index 0000000000..8d2a29f7a3
--- /dev/null
+++ b/third_party/aom/aom_dsp/butteraugli.c
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <jxl/butteraugli.h>
+
+#include "aom_dsp/butteraugli.h"
+#include "aom_mem/aom_mem.h"
+#include "third_party/libyuv/include/libyuv/convert_argb.h"
+
+int aom_calc_butteraugli(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *distorted, int bit_depth,
+ aom_matrix_coefficients_t matrix_coefficients,
+ aom_color_range_t color_range, float *dist_map) {
+ (void)bit_depth;
+ assert(bit_depth == 8);
+ const int width = source->y_crop_width;
+ const int height = source->y_crop_height;
+ const int ss_x = source->subsampling_x;
+ const int ss_y = source->subsampling_y;
+
+ const struct YuvConstants *yuv_constants;
+ if (matrix_coefficients == AOM_CICP_MC_BT_709) {
+ if (color_range == AOM_CR_FULL_RANGE) return 0;
+ yuv_constants = &kYuvH709Constants;
+ } else {
+ yuv_constants = color_range == AOM_CR_FULL_RANGE ? &kYuvJPEGConstants
+ : &kYuvI601Constants;
+ }
+
+ const int stride_argb = width * 4;
+ const size_t buffer_size = (size_t)height * stride_argb;
+ uint8_t *src_argb = (uint8_t *)aom_malloc(buffer_size);
+ uint8_t *distorted_argb = (uint8_t *)aom_malloc(buffer_size);
+ if (!src_argb || !distorted_argb) {
+ aom_free(src_argb);
+ aom_free(distorted_argb);
+ return 0;
+ }
+
+ if (ss_x == 1 && ss_y == 1) {
+ I420ToARGBMatrix(source->y_buffer, source->y_stride, source->u_buffer,
+ source->uv_stride, source->v_buffer, source->uv_stride,
+ src_argb, stride_argb, yuv_constants, width, height);
+ I420ToARGBMatrix(distorted->y_buffer, distorted->y_stride,
+ distorted->u_buffer, distorted->uv_stride,
+ distorted->v_buffer, distorted->uv_stride, distorted_argb,
+ stride_argb, yuv_constants, width, height);
+ } else if (ss_x == 1 && ss_y == 0) {
+ I422ToARGBMatrix(source->y_buffer, source->y_stride, source->u_buffer,
+ source->uv_stride, source->v_buffer, source->uv_stride,
+ src_argb, stride_argb, yuv_constants, width, height);
+ I422ToARGBMatrix(distorted->y_buffer, distorted->y_stride,
+ distorted->u_buffer, distorted->uv_stride,
+ distorted->v_buffer, distorted->uv_stride, distorted_argb,
+ stride_argb, yuv_constants, width, height);
+ } else if (ss_x == 0 && ss_y == 0) {
+ I444ToARGBMatrix(source->y_buffer, source->y_stride, source->u_buffer,
+ source->uv_stride, source->v_buffer, source->uv_stride,
+ src_argb, stride_argb, yuv_constants, width, height);
+ I444ToARGBMatrix(distorted->y_buffer, distorted->y_stride,
+ distorted->u_buffer, distorted->uv_stride,
+ distorted->v_buffer, distorted->uv_stride, distorted_argb,
+ stride_argb, yuv_constants, width, height);
+ } else {
+ aom_free(src_argb);
+ aom_free(distorted_argb);
+ return 0;
+ }
+
+ JxlPixelFormat pixel_format = { 4, JXL_TYPE_UINT8, JXL_NATIVE_ENDIAN, 0 };
+ JxlButteraugliApi *api = JxlButteraugliApiCreate(NULL);
+ JxlButteraugliApiSetHFAsymmetry(api, 0.8f);
+
+ JxlButteraugliResult *result = JxlButteraugliCompute(
+ api, width, height, &pixel_format, src_argb, buffer_size, &pixel_format,
+ distorted_argb, buffer_size);
+
+ const float *distmap = NULL;
+ uint32_t row_stride;
+ JxlButteraugliResultGetDistmap(result, &distmap, &row_stride);
+ if (distmap == NULL) {
+ JxlButteraugliApiDestroy(api);
+ JxlButteraugliResultDestroy(result);
+ aom_free(src_argb);
+ aom_free(distorted_argb);
+ return 0;
+ }
+
+ for (int j = 0; j < height; ++j) {
+ for (int i = 0; i < width; ++i) {
+ dist_map[j * width + i] = distmap[j * row_stride + i];
+ }
+ }
+
+ JxlButteraugliApiDestroy(api);
+ JxlButteraugliResultDestroy(result);
+ aom_free(src_argb);
+ aom_free(distorted_argb);
+ return 1;
+}
diff --git a/third_party/aom/aom_dsp/butteraugli.h b/third_party/aom/aom_dsp/butteraugli.h
new file mode 100644
index 0000000000..5304092ccb
--- /dev/null
+++ b/third_party/aom/aom_dsp/butteraugli.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_BUTTERAUGLI_H_
+#define AOM_AOM_DSP_BUTTERAUGLI_H_
+
+#include "aom_scale/yv12config.h"
+
+// Returns 1 on success and 0 on failure.
+int aom_calc_butteraugli(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *distorted, int bit_depth,
+ aom_matrix_coefficients_t matrix_coefficients,
+ aom_color_range_t color_range, float *dist_map);
+
+#endif // AOM_AOM_DSP_BUTTERAUGLI_H_
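[Review note, editorial sketch, not upstream code: one way to drive aom_calc_butteraugli(). It assumes two already-populated, same-sized 8-bit frames and studio-range BT.709 signalling; AOM_CR_STUDIO_RANGE is assumed to exist alongside the AOM_CR_FULL_RANGE value used in butteraugli.c. The function writes one float per luma pixel into dist_map.]

    #include <stdlib.h>
    #include "aom_dsp/butteraugli.h"

    static int butteraugli_example(const YV12_BUFFER_CONFIG *src,
                                   const YV12_BUFFER_CONFIG *dst) {
      const int w = src->y_crop_width;
      const int h = src->y_crop_height;
      float *dist_map = (float *)malloc(sizeof(*dist_map) * w * h);
      if (!dist_map) return 0;
      const int ok = aom_calc_butteraugli(src, dst, /*bit_depth=*/8,
                                          AOM_CICP_MC_BT_709,
                                          AOM_CR_STUDIO_RANGE, dist_map);
      /* ... aggregate dist_map into a single score here ... */
      free(dist_map);
      return ok;
    }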
diff --git a/third_party/aom/aom_dsp/entcode.c b/third_party/aom/aom_dsp/entcode.c
new file mode 100644
index 0000000000..aad96c6fc6
--- /dev/null
+++ b/third_party/aom/aom_dsp/entcode.c
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/entcode.h"
+
+/*Given the current total integer number of bits used and the current value of
+ rng, computes the fraction number of bits used to OD_BITRES precision.
+ This is used by od_ec_enc_tell_frac() and od_ec_dec_tell_frac().
+ nbits_total: The number of whole bits currently used, i.e., the value
+ returned by od_ec_enc_tell() or od_ec_dec_tell().
+ rng: The current value of rng from either the encoder or decoder state.
+ Return: The number of bits scaled by 2**OD_BITRES.
+ This will always be slightly larger than the exact value (e.g., all
+ rounding error is in the positive direction).*/
+uint32_t od_ec_tell_frac(uint32_t nbits_total, uint32_t rng) {
+ uint32_t nbits;
+ int l;
+ int i;
+ /*To handle the non-integral number of bits still left in the encoder/decoder
+ state, we compute the worst-case number of bits of val that must be
+ encoded to ensure that the value is inside the range for any possible
+ subsequent bits.
+ The computation here is independent of val itself (the decoder does not
+ even track that value), even though the real number of bits used after
+ od_ec_enc_done() may be 1 smaller if rng is a power of two and the
+ corresponding trailing bits of val are all zeros.
+ If we did try to track that special case, then coding a value with a
+ probability of 1/(1 << n) might sometimes appear to use more than n bits.
+ This may help explain the surprising result that a newly initialized
+ encoder or decoder claims to have used 1 bit.*/
+ nbits = nbits_total << OD_BITRES;
+ l = 0;
+ for (i = OD_BITRES; i-- > 0;) {
+ int b;
+ rng = rng * rng >> 15;
+ b = (int)(rng >> 16);
+ l = l << 1 | b;
+ rng >>= b;
+ }
+ return nbits - l;
+}
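[Review note, editorial: for the special case rng == 0x8000 the refinement loop above extracts only zero bits (rng * rng >> 15 stays at 0x8000), so the result is exactly the whole-bit count scaled by 2**OD_BITRES. A minimal check:]

    #include <assert.h>
    #include "aom_dsp/entcode.h"

    static void tell_frac_example(void) {
      assert(od_ec_tell_frac(/*nbits_total=*/10, /*rng=*/0x8000) ==
             (10u << OD_BITRES));
    }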
diff --git a/third_party/aom/aom_dsp/entcode.h b/third_party/aom/aom_dsp/entcode.h
new file mode 100644
index 0000000000..526ca598d3
--- /dev/null
+++ b/third_party/aom/aom_dsp/entcode.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_ENTCODE_H_
+#define AOM_AOM_DSP_ENTCODE_H_
+
+#include <limits.h>
+#include <stddef.h>
+#include "aom_dsp/odintrin.h"
+#include "aom_dsp/prob.h"
+
+#define EC_PROB_SHIFT 6
+#define EC_MIN_PROB 4 // must be <= (1<<EC_PROB_SHIFT)/16
+
+/*OPT: od_ec_window must be at least 32 bits, but if you have fast arithmetic
+ on a larger type, you can speed up the decoder by using it here.*/
+typedef uint32_t od_ec_window;
+
+/*The size in bits of od_ec_window.*/
+#define OD_EC_WINDOW_SIZE ((int)sizeof(od_ec_window) * CHAR_BIT)
+
+/*The resolution of fractional-precision bit usage measurements, i.e.,
+ 3 => 1/8th bits.*/
+#define OD_BITRES (3)
+
+#define OD_ICDF AOM_ICDF
+
+/*See entcode.c for further documentation.*/
+
+OD_WARN_UNUSED_RESULT uint32_t od_ec_tell_frac(uint32_t nbits_total,
+ uint32_t rng);
+
+#endif // AOM_AOM_DSP_ENTCODE_H_
diff --git a/third_party/aom/aom_dsp/entdec.c b/third_party/aom/aom_dsp/entdec.c
new file mode 100644
index 0000000000..5bbcddae08
--- /dev/null
+++ b/third_party/aom/aom_dsp/entdec.c
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include "aom_dsp/entdec.h"
+#include "aom_dsp/prob.h"
+
+/*A range decoder.
+ This is an entropy decoder based upon \cite{Mar79}, which is itself a
+ rediscovery of the FIFO arithmetic code introduced by \cite{Pas76}.
+ It is very similar to arithmetic encoding, except that encoding is done with
+ digits in any base, instead of with bits, and so it is faster when using
+ larger bases (i.e., a byte).
+ The author claims an average waste of $\frac{1}{2}\log_b(2b)$ bits, where $b$
+ is the base, longer than the theoretical optimum, but to my knowledge there
+ is no published justification for this claim.
+ This only seems true when using near-infinite precision arithmetic so that
+ the process is carried out with no rounding errors.
+
+ An excellent description of implementation details is available at
+ http://www.arturocampos.com/ac_range.html
+ A recent work \cite{MNW98} which proposes several changes to arithmetic
+ encoding for efficiency actually re-discovers many of the principles
+ behind range encoding, and presents a good theoretical analysis of them.
+
+ End of stream is handled by writing out the smallest number of bits that
+ ensures that the stream will be correctly decoded regardless of the value of
+ any subsequent bits.
+ od_ec_dec_tell() can be used to determine how many bits were needed to decode
+ all the symbols thus far; other data can be packed in the remaining bits of
+ the input buffer.
+ @PHDTHESIS{Pas76,
+ author="Richard Clark Pasco",
+ title="Source coding algorithms for fast data compression",
+ school="Dept. of Electrical Engineering, Stanford University",
+ address="Stanford, CA",
+ month=May,
+ year=1976,
+ URL="http://www.richpasco.org/scaffdc.pdf"
+ }
+ @INPROCEEDINGS{Mar79,
+ author="Martin, G.N.N.",
+ title="Range encoding: an algorithm for removing redundancy from a digitised
+ message",
+ booktitle="Video & Data Recording Conference",
+ year=1979,
+ address="Southampton",
+ month=Jul,
+ URL="http://www.compressconsult.com/rangecoder/rngcod.pdf.gz"
+ }
+ @ARTICLE{MNW98,
+ author="Alistair Moffat and Radford Neal and Ian H. Witten",
+ title="Arithmetic Coding Revisited",
+ journal="{ACM} Transactions on Information Systems",
+ year=1998,
+ volume=16,
+ number=3,
+ pages="256--294",
+ month=Jul,
+ URL="http://researchcommons.waikato.ac.nz/bitstream/handle/10289/78/content.pdf"
+ }*/
+
+/*This is meant to be a large, positive constant that can still be efficiently
+ loaded as an immediate (on platforms like ARM, for example).
+ Even relatively modest values like 100 would work fine.*/
+#define OD_EC_LOTS_OF_BITS (0x4000)
+
+/*The return value of od_ec_dec_tell does not change across an od_ec_dec_refill
+ call.*/
+static void od_ec_dec_refill(od_ec_dec *dec) {
+ int s;
+ od_ec_window dif;
+ int16_t cnt;
+ const unsigned char *bptr;
+ const unsigned char *end;
+ dif = dec->dif;
+ cnt = dec->cnt;
+ bptr = dec->bptr;
+ end = dec->end;
+ s = OD_EC_WINDOW_SIZE - 9 - (cnt + 15);
+ for (; s >= 0 && bptr < end; s -= 8, bptr++) {
+ /*Each time a byte is inserted into the window (dif), bptr advances and cnt
+ is incremented by 8, so the total number of consumed bits (the return
+ value of od_ec_dec_tell) does not change.*/
+ assert(s <= OD_EC_WINDOW_SIZE - 8);
+ dif ^= (od_ec_window)bptr[0] << s;
+ cnt += 8;
+ }
+ if (bptr >= end) {
+ /*We've reached the end of the buffer. It is perfectly valid for us to need
+ to fill the window with additional bits past the end of the buffer (and
+ this happens in normal operation). These bits should all just be taken
+ as zero. But we cannot increment bptr past 'end' (this is undefined
+ behavior), so we start to increment dec->tell_offs. We also don't want
+ to keep testing bptr against 'end', so we set cnt to OD_EC_LOTS_OF_BITS
+ and adjust dec->tell_offs so that the total number of unconsumed bits in
+ the window (dec->cnt - dec->tell_offs) does not change. This effectively
+ puts lots of zero bits into the window, and means we won't try to refill
+ it from the buffer for a very long time (at which point we'll put lots
+ of zero bits into the window again).*/
+ dec->tell_offs += OD_EC_LOTS_OF_BITS - cnt;
+ cnt = OD_EC_LOTS_OF_BITS;
+ }
+ dec->dif = dif;
+ dec->cnt = cnt;
+ dec->bptr = bptr;
+}
+
+/*Takes updated dif and range values, renormalizes them so that
+ 32768 <= rng < 65536 (reading more bytes from the stream into dif if
+ necessary), and stores them back in the decoder context.
+ dif: The new value of dif.
+ rng: The new value of the range.
+ ret: The value to return.
+ Return: ret.
+ This allows the compiler to jump to this function via a tail-call.*/
+static int od_ec_dec_normalize(od_ec_dec *dec, od_ec_window dif, unsigned rng,
+ int ret) {
+ int d;
+ assert(rng <= 65535U);
+ /*The number of leading zeros in the 16-bit binary representation of rng.*/
+ d = 16 - OD_ILOG_NZ(rng);
+ /*d bits in dec->dif are consumed.*/
+ dec->cnt -= d;
+ /*This is equivalent to shifting in 1's instead of 0's.*/
+ dec->dif = ((dif + 1) << d) - 1;
+ dec->rng = rng << d;
+ if (dec->cnt < 0) od_ec_dec_refill(dec);
+ return ret;
+}
+
+/*Initializes the decoder.
+ buf: The input buffer to use.
+ storage: The size in bytes of the input buffer.*/
+void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf,
+ uint32_t storage) {
+ dec->buf = buf;
+ dec->tell_offs = 10 - (OD_EC_WINDOW_SIZE - 8);
+ dec->end = buf + storage;
+ dec->bptr = buf;
+ dec->dif = ((od_ec_window)1 << (OD_EC_WINDOW_SIZE - 1)) - 1;
+ dec->rng = 0x8000;
+ dec->cnt = -15;
+ od_ec_dec_refill(dec);
+}
+
+/*Decode a single binary value.
+ f: The probability that the bit is one, scaled by 32768.
+ Return: The value decoded (0 or 1).*/
+int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f) {
+ od_ec_window dif;
+ od_ec_window vw;
+ unsigned r;
+ unsigned r_new;
+ unsigned v;
+ int ret;
+ assert(0 < f);
+ assert(f < 32768U);
+ dif = dec->dif;
+ r = dec->rng;
+ assert(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
+ assert(32768U <= r);
+ v = ((r >> 8) * (uint32_t)(f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT));
+ v += EC_MIN_PROB;
+ vw = (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
+ ret = 1;
+ r_new = v;
+ if (dif >= vw) {
+ r_new = r - v;
+ dif -= vw;
+ ret = 0;
+ }
+ return od_ec_dec_normalize(dec, dif, r_new, ret);
+}
+
+/*Decodes a symbol given an inverse cumulative distribution function (CDF)
+ table in Q15.
+ icdf: CDF_PROB_TOP minus the CDF, such that symbol s falls in the range
+ [s > 0 ? (CDF_PROB_TOP - icdf[s - 1]) : 0, CDF_PROB_TOP - icdf[s]).
+ The values must be monotonically non-increasing, and icdf[nsyms - 1]
+ must be 0.
+ nsyms: The number of symbols in the alphabet.
+ This should be at most 16.
+ Return: The decoded symbol s.*/
+int od_ec_decode_cdf_q15(od_ec_dec *dec, const uint16_t *icdf, int nsyms) {
+ od_ec_window dif;
+ unsigned r;
+ unsigned c;
+ unsigned u;
+ unsigned v;
+ int ret;
+ (void)nsyms;
+ dif = dec->dif;
+ r = dec->rng;
+ const int N = nsyms - 1;
+
+ assert(dif >> (OD_EC_WINDOW_SIZE - 16) < r);
+ assert(icdf[nsyms - 1] == OD_ICDF(CDF_PROB_TOP));
+ assert(32768U <= r);
+ assert(7 - EC_PROB_SHIFT >= 0);
+ c = (unsigned)(dif >> (OD_EC_WINDOW_SIZE - 16));
+ v = r;
+ ret = -1;
+ do {
+ u = v;
+ v = ((r >> 8) * (uint32_t)(icdf[++ret] >> EC_PROB_SHIFT) >>
+ (7 - EC_PROB_SHIFT));
+ v += EC_MIN_PROB * (N - ret);
+ } while (c < v);
+ assert(v < u);
+ assert(u <= r);
+ r = u - v;
+ dif -= (od_ec_window)v << (OD_EC_WINDOW_SIZE - 16);
+ return od_ec_dec_normalize(dec, dif, r, ret);
+}
+
+/*Returns the number of bits "used" by the decoded symbols so far.
+ This same number can be computed in either the encoder or the decoder, and is
+ suitable for making coding decisions.
+ Return: The number of bits.
+ This will always be slightly larger than the exact value (e.g., all
+ rounding error is in the positive direction).*/
+int od_ec_dec_tell(const od_ec_dec *dec) {
+ /*There is a window of bits stored in dec->dif. The difference
+ (dec->bptr - dec->buf) tells us how many bytes have been read into this
+ window. The difference (dec->cnt - dec->tell_offs) tells us how many of
+ the bits in that window remain unconsumed.*/
+ return (int)((dec->bptr - dec->buf) * 8 - dec->cnt + dec->tell_offs);
+}
+
+/*Returns the number of bits "used" by the decoded symbols so far.
+ This same number can be computed in either the encoder or the decoder, and is
+ suitable for making coding decisions.
+ Return: The number of bits scaled by 2**OD_BITRES.
+ This will always be slightly larger than the exact value (e.g., all
+ rounding error is in the positive direction).*/
+uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec) {
+ return od_ec_tell_frac(od_ec_dec_tell(dec), dec->rng);
+}
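[Review note, editorial: the split computed in od_ec_decode_bool_q15() (and mirrored by the encoder) scales the Q15 probability into the current range and then adds the EC_MIN_PROB floor. For a full range and p = 0.5 the two partitions are therefore almost, but not exactly, equal; the snippet below just replays that arithmetic with the constants from entcode.h.]

    #include <assert.h>

    static void bool_split_example(void) {
      const unsigned r = 0x8000;  /* current range */
      const unsigned f = 16384;   /* P(bit == 1) = 0.5 in Q15 */
      const int ec_prob_shift = 6, ec_min_prob = 4; /* values from entcode.h */
      const unsigned v =
          ((r >> 8) * (f >> ec_prob_shift) >> (7 - ec_prob_shift)) + ec_min_prob;
      assert(v == 16388); /* 128 * 256 >> 1 == 16384, plus the 4-unit floor */
    }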
diff --git a/third_party/aom/aom_dsp/entdec.h b/third_party/aom/aom_dsp/entdec.h
new file mode 100644
index 0000000000..c746167775
--- /dev/null
+++ b/third_party/aom/aom_dsp/entdec.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_ENTDEC_H_
+#define AOM_AOM_DSP_ENTDEC_H_
+#include <limits.h>
+#include "aom_dsp/entcode.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct od_ec_dec od_ec_dec;
+
+#if defined(OD_ACCOUNTING) && OD_ACCOUNTING
+#define OD_ACC_STR , char *acc_str
+#define od_ec_dec_bits(dec, ftb, str) od_ec_dec_bits_(dec, ftb, str)
+#else
+#define OD_ACC_STR
+#define od_ec_dec_bits(dec, ftb, str) od_ec_dec_bits_(dec, ftb)
+#endif
+
+/*The entropy decoder context.*/
+struct od_ec_dec {
+ /*The start of the current input buffer.*/
+ const unsigned char *buf;
+ /*An offset used to keep track of tell after reaching the end of the stream.
+ This is constant throughout most of the decoding process, but becomes
+ important once we hit the end of the buffer and stop incrementing bptr
+ (and instead pretend cnt has lots of bits).*/
+ int32_t tell_offs;
+ /*The end of the current input buffer.*/
+ const unsigned char *end;
+ /*The read pointer for the entropy-coded bits.*/
+ const unsigned char *bptr;
+ /*The difference between the high end of the current range, (low + rng), and
+ the coded value, minus 1.
+ This stores up to OD_EC_WINDOW_SIZE bits of that difference, but the
+ decoder only uses the top 16 bits of the window to decode the next symbol.
+ As we shift up during renormalization, if we don't have enough bits left in
+ the window to fill the top 16, we'll read in more bits of the coded
+ value.*/
+ od_ec_window dif;
+ /*The number of values in the current range.*/
+ uint16_t rng;
+ /*The number of bits of data in the current value.*/
+ int16_t cnt;
+};
+
+/*See entdec.c for further documentation.*/
+
+void od_ec_dec_init(od_ec_dec *dec, const unsigned char *buf, uint32_t storage)
+ OD_ARG_NONNULL(1) OD_ARG_NONNULL(2);
+
+OD_WARN_UNUSED_RESULT int od_ec_decode_bool_q15(od_ec_dec *dec, unsigned f)
+ OD_ARG_NONNULL(1);
+OD_WARN_UNUSED_RESULT int od_ec_decode_cdf_q15(od_ec_dec *dec,
+ const uint16_t *cdf, int nsyms)
+ OD_ARG_NONNULL(1) OD_ARG_NONNULL(2);
+
+OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_bits_(od_ec_dec *dec, unsigned ftb)
+ OD_ARG_NONNULL(1);
+
+OD_WARN_UNUSED_RESULT int od_ec_dec_tell(const od_ec_dec *dec)
+ OD_ARG_NONNULL(1);
+OD_WARN_UNUSED_RESULT uint32_t od_ec_dec_tell_frac(const od_ec_dec *dec)
+ OD_ARG_NONNULL(1);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_ENTDEC_H_
diff --git a/third_party/aom/aom_dsp/entenc.c b/third_party/aom/aom_dsp/entenc.c
new file mode 100644
index 0000000000..591e0ad214
--- /dev/null
+++ b/third_party/aom/aom_dsp/entenc.c
@@ -0,0 +1,374 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <assert.h>
+#include "aom_dsp/entenc.h"
+#include "aom_dsp/prob.h"
+
+#if OD_MEASURE_EC_OVERHEAD
+#if !defined(M_LOG2E)
+#define M_LOG2E (1.4426950408889634073599246810019)
+#endif
+#define OD_LOG2(x) (M_LOG2E * log(x))
+#endif // OD_MEASURE_EC_OVERHEAD
+
+/*A range encoder.
+ See entdec.c and the references for implementation details \cite{Mar79,MNW98}.
+
+ @INPROCEEDINGS{Mar79,
+ author="Martin, G.N.N.",
+ title="Range encoding: an algorithm for removing redundancy from a digitised
+ message",
+ booktitle="Video \& Data Recording Conference",
+ year=1979,
+ address="Southampton",
+ month=Jul,
+ URL="http://www.compressconsult.com/rangecoder/rngcod.pdf.gz"
+ }
+ @ARTICLE{MNW98,
+ author="Alistair Moffat and Radford Neal and Ian H. Witten",
+ title="Arithmetic Coding Revisited",
+ journal="{ACM} Transactions on Information Systems",
+ year=1998,
+ volume=16,
+ number=3,
+ pages="256--294",
+ month=Jul,
+ URL="http://researchcommons.waikato.ac.nz/bitstream/handle/10289/78/content.pdf"
+ }*/
+
+/*Takes updated low and range values, renormalizes them so that
+ 32768 <= rng < 65536 (flushing bytes from low to the output buffer if
+ necessary), and stores them back in the encoder context.
+ low: The new value of low.
+ rng: The new value of the range.*/
+static void od_ec_enc_normalize(od_ec_enc *enc, od_ec_enc_window low,
+ unsigned rng) {
+ int d;
+ int c;
+ int s;
+ if (enc->error) return;
+ c = enc->cnt;
+ assert(rng <= 65535U);
+ /*The number of leading zeros in the 16-bit binary representation of rng.*/
+ d = 16 - OD_ILOG_NZ(rng);
+ s = c + d;
+
+ /* We flush every time "low" cannot safely and efficiently accommodate any
+ more data. Overall, c must not exceed 63 at the time of byte flush out. To
+ facilitate this, "s" cannot exceed 56 bits because we have to keep 1 byte
+ for carry. Also, we need to subtract 16 because we want to keep room for
+ the next symbol's worth of "d" bits (max 15). An alternate condition would be if
+ (e < d), where e = number of leading zeros in "low", indicating there is
+ not enough room to accommodate "rng" worth of "d" bits in "low". However,
+ this approach needs additional computations: (i) compute "e", (ii) push
+ the leading 0x00's as a special case.
+ */
+ if (s >= 40) { // 56 - 16
+ unsigned char *out = enc->buf;
+ uint32_t storage = enc->storage;
+ uint32_t offs = enc->offs;
+ if (offs + 8 > storage) {
+ storage = 2 * storage + 8;
+ out = (unsigned char *)realloc(out, sizeof(*out) * storage);
+ if (out == NULL) {
+ enc->error = -1;
+ return;
+ }
+ enc->buf = out;
+ enc->storage = storage;
+ }
+ // Need to add 1 byte here since enc->cnt always counts 1 byte less
+ // (enc->cnt = -9) to ensure correct operation
+ uint8_t num_bytes_ready = (s >> 3) + 1;
+
+ // Update "c" to contain the number of non-ready bits in "low". Since "low"
+ // has 64-bit capacity, we need to add the (64 - 40) cushion bits and take
+ // off the number of ready bits.
+ c += 24 - (num_bytes_ready << 3);
+
+ // Prepare "output" and update "low"
+ uint64_t output = low >> c;
+ low = low & (((uint64_t)1 << c) - 1);
+
+ // Prepare data and carry mask
+ uint64_t mask = (uint64_t)1 << (num_bytes_ready << 3);
+ uint64_t carry = output & mask;
+
+ mask = mask - 0x01;
+ output = output & mask;
+
+ // Write data in a single operation
+ write_enc_data_to_out_buf(out, offs, output, carry, &enc->offs,
+ num_bytes_ready);
+
+ // Update state of the encoder: enc->cnt to contain the number of residual
+ // bits
+ s = c + d - 24;
+ }
+ enc->low = low << d;
+ enc->rng = rng << d;
+ enc->cnt = s;
+}
+
+/*Initializes the encoder.
+ size: The initial size of the buffer, in bytes.*/
+void od_ec_enc_init(od_ec_enc *enc, uint32_t size) {
+ od_ec_enc_reset(enc);
+ enc->buf = (unsigned char *)malloc(sizeof(*enc->buf) * size);
+ enc->storage = size;
+ if (size > 0 && enc->buf == NULL) {
+ enc->storage = 0;
+ enc->error = -1;
+ }
+}
+
+/*Reinitializes the encoder.*/
+void od_ec_enc_reset(od_ec_enc *enc) {
+ enc->offs = 0;
+ enc->low = 0;
+ enc->rng = 0x8000;
+ /*This is initialized to -9 so that it crosses zero after we've accumulated
+ one byte + one carry bit.*/
+ enc->cnt = -9;
+ enc->error = 0;
+#if OD_MEASURE_EC_OVERHEAD
+ enc->entropy = 0;
+ enc->nb_symbols = 0;
+#endif
+}
+
+/*Frees the buffers used by the encoder.*/
+void od_ec_enc_clear(od_ec_enc *enc) { free(enc->buf); }
+
+/*Encodes a symbol given its frequency in Q15.
+ fl: CDF_PROB_TOP minus the cumulative frequency of all symbols that come
+ before the one to be encoded.
+ fh: CDF_PROB_TOP minus the cumulative frequency of all symbols up to and
+ including the one to be encoded.*/
+static void od_ec_encode_q15(od_ec_enc *enc, unsigned fl, unsigned fh, int s,
+ int nsyms) {
+ od_ec_enc_window l;
+ unsigned r;
+ unsigned u;
+ unsigned v;
+ l = enc->low;
+ r = enc->rng;
+ assert(32768U <= r);
+ assert(fh <= fl);
+ assert(fl <= 32768U);
+ assert(7 - EC_PROB_SHIFT >= 0);
+ const int N = nsyms - 1;
+ if (fl < CDF_PROB_TOP) {
+ u = ((r >> 8) * (uint32_t)(fl >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)) +
+ EC_MIN_PROB * (N - (s - 1));
+ v = ((r >> 8) * (uint32_t)(fh >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)) +
+ EC_MIN_PROB * (N - (s + 0));
+ l += r - u;
+ r = u - v;
+ } else {
+ r -= ((r >> 8) * (uint32_t)(fh >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT)) +
+ EC_MIN_PROB * (N - (s + 0));
+ }
+ od_ec_enc_normalize(enc, l, r);
+#if OD_MEASURE_EC_OVERHEAD
+ enc->entropy -= OD_LOG2((double)(OD_ICDF(fh) - OD_ICDF(fl)) / CDF_PROB_TOP);
+ enc->nb_symbols++;
+#endif
+}
+
+/*Encode a single binary value.
+ val: The value to encode (0 or 1).
+ f: The probability that the val is one, scaled by 32768.*/
+void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f) {
+ od_ec_enc_window l;
+ unsigned r;
+ unsigned v;
+ assert(0 < f);
+ assert(f < 32768U);
+ l = enc->low;
+ r = enc->rng;
+ assert(32768U <= r);
+ v = ((r >> 8) * (uint32_t)(f >> EC_PROB_SHIFT) >> (7 - EC_PROB_SHIFT));
+ v += EC_MIN_PROB;
+ if (val) l += r - v;
+ r = val ? v : r - v;
+ od_ec_enc_normalize(enc, l, r);
+#if OD_MEASURE_EC_OVERHEAD
+ enc->entropy -= OD_LOG2((double)(val ? f : (32768 - f)) / 32768.);
+ enc->nb_symbols++;
+#endif
+}
+
+/*Encodes a symbol given a cumulative distribution function (CDF) table in Q15.
+ s: The index of the symbol to encode.
+ icdf: 32768 minus the CDF, such that symbol s falls in the range
+ [s > 0 ? (32768 - icdf[s - 1]) : 0, 32768 - icdf[s]).
+ The values must be monotonically decreasing, and icdf[nsyms - 1] must
+ be 0.
+ nsyms: The number of symbols in the alphabet.
+ This should be at most 16.*/
+void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *icdf,
+ int nsyms) {
+ (void)nsyms;
+ assert(s >= 0);
+ assert(s < nsyms);
+ assert(icdf[nsyms - 1] == OD_ICDF(CDF_PROB_TOP));
+ od_ec_encode_q15(enc, s > 0 ? icdf[s - 1] : OD_ICDF(0), icdf[s], s, nsyms);
+}
+
+/*Overwrites a few bits at the very start of an existing stream, after they
+ have already been encoded.
+ This makes it possible to have a few flags up front, where it is easy for
+ decoders to access them without parsing the whole stream, even if their
+ values are not determined until late in the encoding process, without having
+ to buffer all the intermediate symbols in the encoder.
+ In order for this to work, at least nbits bits must have already been encoded
+ using probabilities that are an exact power of two.
+ The encoder can verify the number of encoded bits is sufficient, but cannot
+ check this latter condition.
+ val: The bits to encode (in the least nbits significant bits).
+ They will be decoded in order from most-significant to least.
+ nbits: The number of bits to overwrite.
+ This must be no more than 8.*/
+void od_ec_enc_patch_initial_bits(od_ec_enc *enc, unsigned val, int nbits) {
+ int shift;
+ unsigned mask;
+ assert(nbits >= 0);
+ assert(nbits <= 8);
+ assert(val < 1U << nbits);
+ shift = 8 - nbits;
+ mask = ((1U << nbits) - 1) << shift;
+ if (enc->offs > 0) {
+ /*The first byte has been finalized.*/
+ enc->buf[0] = (unsigned char)((enc->buf[0] & ~mask) | val << shift);
+ } else if (9 + enc->cnt + (enc->rng == 0x8000) > nbits) {
+ /*The first byte has yet to be output.*/
+ enc->low = (enc->low & ~((od_ec_enc_window)mask << (16 + enc->cnt))) |
+ (od_ec_enc_window)val << (16 + enc->cnt + shift);
+ } else {
+ /*The encoder hasn't even encoded nbits of data yet.*/
+ enc->error = -1;
+ }
+}
+
+#if OD_MEASURE_EC_OVERHEAD
+#include <stdio.h>
+#endif
+
+/*Indicates that there are no more symbols to encode.
+ All remaining output bytes are flushed to the output buffer.
+ od_ec_enc_reset() should be called before using the encoder again.
+ bytes: Returns the size of the encoded data in the returned buffer.
+ Return: A pointer to the start of the final buffer, or NULL if there was an
+ encoding error.*/
+unsigned char *od_ec_enc_done(od_ec_enc *enc, uint32_t *nbytes) {
+ unsigned char *out;
+ uint32_t storage;
+ uint32_t offs;
+ od_ec_enc_window m;
+ od_ec_enc_window e;
+ od_ec_enc_window l;
+ int c;
+ int s;
+ if (enc->error) return NULL;
+#if OD_MEASURE_EC_OVERHEAD
+ {
+ uint32_t tell;
+ /* Don't count the 1 bit we lose to raw bits as overhead. */
+ tell = od_ec_enc_tell(enc) - 1;
+ fprintf(stderr, "overhead: %f%%\n",
+ 100 * (tell - enc->entropy) / enc->entropy);
+ fprintf(stderr, "efficiency: %f bits/symbol\n",
+ (double)tell / enc->nb_symbols);
+ }
+#endif
+
+ l = enc->low;
+ c = enc->cnt;
+ s = 10;
+ m = 0x3FFF;
+ e = ((l + m) & ~m) | (m + 1);
+ s += c;
+ offs = enc->offs;
+
+ /*Make sure there's enough room for the entropy-coded bits.*/
+ out = enc->buf;
+ storage = enc->storage;
+ const int s_bits = (s + 7) >> 3;
+ int b = OD_MAXI(s_bits, 0);
+ if (offs + b > storage) {
+ storage = offs + b;
+ out = (unsigned char *)realloc(out, sizeof(*out) * storage);
+ if (out == NULL) {
+ enc->error = -1;
+ return NULL;
+ }
+ enc->buf = out;
+ enc->storage = storage;
+ }
+
+ /*We output the minimum number of bits that ensures that the symbols encoded
+ thus far will be decoded correctly regardless of the bits that follow.*/
+ if (s > 0) {
+ uint64_t n;
+ n = ((uint64_t)1 << (c + 16)) - 1;
+ do {
+ assert(offs < storage);
+ uint16_t val = (uint16_t)(e >> (c + 16));
+ out[offs] = (unsigned char)(val & 0x00FF);
+ if (val & 0x0100) {
+ assert(offs > 0);
+ propagate_carry_bwd(out, offs - 1);
+ }
+ offs++;
+
+ e &= n;
+ s -= 8;
+ c -= 8;
+ n >>= 8;
+ } while (s > 0);
+ }
+ *nbytes = offs;
+
+ return out;
+}
+
+/*Returns the number of bits "used" by the encoded symbols so far.
+ This same number can be computed in either the encoder or the decoder, and is
+ suitable for making coding decisions.
+ Warning: The value returned by this function can decrease compared to an
+ earlier call, even after encoding more data, if there is an encoding error
+ (i.e., a failure to allocate enough space for the output buffer).
+ Return: The number of bits.
+ This will always be slightly larger than the exact value (e.g., all
+ rounding error is in the positive direction).*/
+int od_ec_enc_tell(const od_ec_enc *enc) {
+ /*The 10 here counteracts the offset of -9 baked into cnt, and adds 1 extra
+ bit, which we reserve for terminating the stream.*/
+ return (enc->cnt + 10) + enc->offs * 8;
+}
+
+/*Returns the number of bits "used" by the encoded symbols so far.
+ This same number can be computed in either the encoder or the decoder, and is
+ suitable for making coding decisions.
+ Warning: The value returned by this function can decrease compared to an
+ earlier call, even after encoding more data, if there is an encoding error
+ (i.e., a failure to allocate enough space for the output buffer).
+ Return: The number of bits scaled by 2**OD_BITRES.
+ This will always be slightly larger than the exact value (e.g., all
+ rounding error is in the positive direction).*/
+uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc) {
+ return od_ec_tell_frac(od_ec_enc_tell(enc), enc->rng);
+}
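[Review note, editorial: on a freshly initialized encoder, cnt is -9 and offs is 0, so od_ec_enc_tell() reports (-9 + 10) + 0 * 8 = 1 bit, matching the remark in entdec.c that a newly initialized coder claims to have used 1 bit. A minimal check:]

    #include <assert.h>
    #include "aom_dsp/entenc.h"

    static void enc_tell_example(void) {
      od_ec_enc enc;
      od_ec_enc_init(&enc, 1000);
      assert(od_ec_enc_tell(&enc) == 1);
      od_ec_enc_clear(&enc);
    }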
diff --git a/third_party/aom/aom_dsp/entenc.h b/third_party/aom/aom_dsp/entenc.h
new file mode 100644
index 0000000000..1a38affb4f
--- /dev/null
+++ b/third_party/aom/aom_dsp/entenc.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_ENTENC_H_
+#define AOM_AOM_DSP_ENTENC_H_
+#include <stddef.h>
+#include "aom_dsp/entcode.h"
+#include "aom_util/endian_inl.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef uint64_t od_ec_enc_window;
+
+typedef struct od_ec_enc od_ec_enc;
+
+#define OD_MEASURE_EC_OVERHEAD (0)
+
+/*The entropy encoder context.*/
+struct od_ec_enc {
+ /*Buffered output.
+ This contains only the raw bits until the final call to od_ec_enc_done(),
+ where all the arithmetic-coded data gets prepended to it.*/
+ unsigned char *buf;
+ /*The size of the buffer.*/
+ uint32_t storage;
+ /*The offset at which the next entropy-coded byte will be written.*/
+ uint32_t offs;
+ /*The low end of the current range.*/
+ od_ec_enc_window low;
+ /*The number of values in the current range.*/
+ uint16_t rng;
+ /*The number of bits of data in the current value.*/
+ int16_t cnt;
+ /*Nonzero if an error occurred.*/
+ int error;
+#if OD_MEASURE_EC_OVERHEAD
+ double entropy;
+ int nb_symbols;
+#endif
+};
+
+/*See entenc.c for further documentation.*/
+
+void od_ec_enc_init(od_ec_enc *enc, uint32_t size) OD_ARG_NONNULL(1);
+void od_ec_enc_reset(od_ec_enc *enc) OD_ARG_NONNULL(1);
+void od_ec_enc_clear(od_ec_enc *enc) OD_ARG_NONNULL(1);
+
+void od_ec_encode_bool_q15(od_ec_enc *enc, int val, unsigned f_q15)
+ OD_ARG_NONNULL(1);
+void od_ec_encode_cdf_q15(od_ec_enc *enc, int s, const uint16_t *cdf, int nsyms)
+ OD_ARG_NONNULL(1) OD_ARG_NONNULL(3);
+
+void od_ec_enc_bits(od_ec_enc *enc, uint32_t fl, unsigned ftb)
+ OD_ARG_NONNULL(1);
+
+void od_ec_enc_patch_initial_bits(od_ec_enc *enc, unsigned val, int nbits)
+ OD_ARG_NONNULL(1);
+OD_WARN_UNUSED_RESULT unsigned char *od_ec_enc_done(od_ec_enc *enc,
+ uint32_t *nbytes)
+ OD_ARG_NONNULL(1) OD_ARG_NONNULL(2);
+
+OD_WARN_UNUSED_RESULT int od_ec_enc_tell(const od_ec_enc *enc)
+ OD_ARG_NONNULL(1);
+OD_WARN_UNUSED_RESULT uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc)
+ OD_ARG_NONNULL(1);
+
+// buf is the frame bitbuffer; offs is the byte position at which the carry is added
+static AOM_INLINE void propagate_carry_bwd(unsigned char *buf, uint32_t offs) {
+ uint16_t sum, carry = 1;
+ do {
+ sum = (uint16_t)buf[offs] + 1;
+ buf[offs--] = (unsigned char)sum;
+ carry = sum >> 8;
+ } while (carry);
+}
+
+// Convert to big-endian byte order and write the data to the output buffer,
+// propagating the carry bit backwards if one is present
+static AOM_INLINE void write_enc_data_to_out_buf(unsigned char *out,
+ uint32_t offs, uint64_t output,
+ uint64_t carry,
+ uint32_t *enc_offs,
+ uint8_t num_bytes_ready) {
+ const uint64_t reg = HToBE64(output << ((8 - num_bytes_ready) << 3));
+ memcpy(&out[offs], &reg, 8);
+ // Propagate the carry backwards if one exists
+ if (carry) {
+ assert(offs > 0);
+ propagate_carry_bwd(out, offs - 1);
+ }
+ *enc_offs = offs + num_bytes_ready;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_ENTENC_H_
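[Review note, editorial sketch, not upstream code: a self-contained round trip tying the encoder and decoder halves together. Symbols encoded with od_ec_encode_bool_q15() decode back with od_ec_decode_bool_q15() when the same probabilities are used, and od_ec_enc_clear() frees the buffer returned by od_ec_enc_done().]

    #include <assert.h>
    #include <stdint.h>
    #include "aom_dsp/entdec.h"
    #include "aom_dsp/entenc.h"

    static void ec_roundtrip_example(void) {
      static const int bits[4] = { 1, 0, 0, 1 };
      const unsigned p_q15 = 12000; /* P(bit == 1), scaled by 32768 */

      od_ec_enc enc;
      od_ec_enc_init(&enc, 64);
      for (int i = 0; i < 4; i++) od_ec_encode_bool_q15(&enc, bits[i], p_q15);

      uint32_t nbytes;
      unsigned char *buf = od_ec_enc_done(&enc, &nbytes);
      if (buf == NULL) {
        od_ec_enc_clear(&enc);
        return;
      }

      od_ec_dec dec;
      od_ec_dec_init(&dec, buf, nbytes);
      for (int i = 0; i < 4; i++) {
        const int bit = od_ec_decode_bool_q15(&dec, p_q15);
        assert(bit == bits[i]);
        (void)bit;
      }

      od_ec_enc_clear(&enc); /* also releases buf */
    }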
diff --git a/third_party/aom/aom_dsp/fastssim.c b/third_party/aom/aom_dsp/fastssim.c
new file mode 100644
index 0000000000..0ef0590e89
--- /dev/null
+++ b/third_party/aom/aom_dsp/fastssim.c
@@ -0,0 +1,488 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *
+ * This code was originally written by: Nathan E. Egge, at the Daala
+ * project.
+ */
+#include <assert.h>
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/ssim.h"
+
+typedef struct fs_level fs_level;
+typedef struct fs_ctx fs_ctx;
+
+#define SSIM_C1 (255 * 255 * 0.01 * 0.01)
+#define SSIM_C2 (255 * 255 * 0.03 * 0.03)
+#define SSIM_C1_10 (1023 * 1023 * 0.01 * 0.01)
+#define SSIM_C1_12 (4095 * 4095 * 0.01 * 0.01)
+#define SSIM_C2_10 (1023 * 1023 * 0.03 * 0.03)
+#define SSIM_C2_12 (4095 * 4095 * 0.03 * 0.03)
+#define MAX_SSIM_DB 100.0
+
+#define FS_MINI(_a, _b) ((_a) < (_b) ? (_a) : (_b))
+#define FS_MAXI(_a, _b) ((_a) > (_b) ? (_a) : (_b))
+
+struct fs_level {
+ uint32_t *im1;
+ uint32_t *im2;
+ double *ssim;
+ int w;
+ int h;
+};
+
+struct fs_ctx {
+ fs_level *level;
+ int nlevels;
+ unsigned *col_buf;
+};
+
+static int fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
+ unsigned char *data;
+ size_t data_size;
+ int lw;
+ int lh;
+ int l;
+ lw = (_w + 1) >> 1;
+ lh = (_h + 1) >> 1;
+ data_size =
+ _nlevels * sizeof(fs_level) + 2 * (lw + 8) * 8 * sizeof(*_ctx->col_buf);
+ for (l = 0; l < _nlevels; l++) {
+ size_t im_size;
+ size_t level_size;
+ im_size = lw * (size_t)lh;
+ level_size = 2 * im_size * sizeof(*_ctx->level[l].im1);
+ level_size += sizeof(*_ctx->level[l].ssim) - 1;
+ level_size /= sizeof(*_ctx->level[l].ssim);
+ level_size += im_size;
+ level_size *= sizeof(*_ctx->level[l].ssim);
+ data_size += level_size;
+ lw = (lw + 1) >> 1;
+ lh = (lh + 1) >> 1;
+ }
+ data = (unsigned char *)malloc(data_size);
+ if (!data) return -1;
+ _ctx->level = (fs_level *)data;
+ _ctx->nlevels = _nlevels;
+ data += _nlevels * sizeof(*_ctx->level);
+ lw = (_w + 1) >> 1;
+ lh = (_h + 1) >> 1;
+ for (l = 0; l < _nlevels; l++) {
+ size_t im_size;
+ size_t level_size;
+ _ctx->level[l].w = lw;
+ _ctx->level[l].h = lh;
+ im_size = lw * (size_t)lh;
+ level_size = 2 * im_size * sizeof(*_ctx->level[l].im1);
+ level_size += sizeof(*_ctx->level[l].ssim) - 1;
+ level_size /= sizeof(*_ctx->level[l].ssim);
+ level_size *= sizeof(*_ctx->level[l].ssim);
+ _ctx->level[l].im1 = (uint32_t *)data;
+ _ctx->level[l].im2 = _ctx->level[l].im1 + im_size;
+ data += level_size;
+ _ctx->level[l].ssim = (double *)data;
+ data += im_size * sizeof(*_ctx->level[l].ssim);
+ lw = (lw + 1) >> 1;
+ lh = (lh + 1) >> 1;
+ }
+ _ctx->col_buf = (unsigned *)data;
+ return 0;
+}
+
+static void fs_ctx_clear(fs_ctx *_ctx) { free(_ctx->level); }
+
+static void fs_downsample_level(fs_ctx *_ctx, int _l) {
+ const uint32_t *src1;
+ const uint32_t *src2;
+ uint32_t *dst1;
+ uint32_t *dst2;
+ int w2;
+ int h2;
+ int w;
+ int h;
+ int i;
+ int j;
+ w = _ctx->level[_l].w;
+ h = _ctx->level[_l].h;
+ dst1 = _ctx->level[_l].im1;
+ dst2 = _ctx->level[_l].im2;
+ w2 = _ctx->level[_l - 1].w;
+ h2 = _ctx->level[_l - 1].h;
+ src1 = _ctx->level[_l - 1].im1;
+ src2 = _ctx->level[_l - 1].im2;
+ for (j = 0; j < h; j++) {
+ int j0offs;
+ int j1offs;
+ j0offs = 2 * j * w2;
+ j1offs = FS_MINI(2 * j + 1, h2) * w2;
+ for (i = 0; i < w; i++) {
+ int i0;
+ int i1;
+ i0 = 2 * i;
+ i1 = FS_MINI(i0 + 1, w2);
+ dst1[j * w + i] = src1[j0offs + i0] + src1[j0offs + i1] +
+ src1[j1offs + i0] + src1[j1offs + i1];
+ dst2[j * w + i] = src2[j0offs + i0] + src2[j0offs + i1] +
+ src2[j1offs + i0] + src2[j1offs + i1];
+ }
+ }
+}
+
+static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1,
+ int _s1ystride, const uint8_t *_src2,
+ int _s2ystride, int _w, int _h, uint32_t shift,
+ int buf_is_hbd) {
+ uint32_t *dst1;
+ uint32_t *dst2;
+ int w;
+ int h;
+ int i;
+ int j;
+ w = _ctx->level[0].w;
+ h = _ctx->level[0].h;
+ dst1 = _ctx->level[0].im1;
+ dst2 = _ctx->level[0].im2;
+ for (j = 0; j < h; j++) {
+ int j0;
+ int j1;
+ j0 = 2 * j;
+ j1 = FS_MINI(j0 + 1, _h);
+ for (i = 0; i < w; i++) {
+ int i0;
+ int i1;
+ i0 = 2 * i;
+ i1 = FS_MINI(i0 + 1, _w);
+ if (!buf_is_hbd) {
+ dst1[j * w + i] =
+ _src1[j0 * _s1ystride + i0] + _src1[j0 * _s1ystride + i1] +
+ _src1[j1 * _s1ystride + i0] + _src1[j1 * _s1ystride + i1];
+ dst2[j * w + i] =
+ _src2[j0 * _s2ystride + i0] + _src2[j0 * _s2ystride + i1] +
+ _src2[j1 * _s2ystride + i0] + _src2[j1 * _s2ystride + i1];
+ } else {
+ uint16_t *src1s = CONVERT_TO_SHORTPTR(_src1);
+ uint16_t *src2s = CONVERT_TO_SHORTPTR(_src2);
+ dst1[j * w + i] = (src1s[j0 * _s1ystride + i0] >> shift) +
+ (src1s[j0 * _s1ystride + i1] >> shift) +
+ (src1s[j1 * _s1ystride + i0] >> shift) +
+ (src1s[j1 * _s1ystride + i1] >> shift);
+ dst2[j * w + i] = (src2s[j0 * _s2ystride + i0] >> shift) +
+ (src2s[j0 * _s2ystride + i1] >> shift) +
+ (src2s[j1 * _s2ystride + i0] >> shift) +
+ (src2s[j1 * _s2ystride + i1] >> shift);
+ }
+ }
+ }
+}
+
+static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) {
+ unsigned *col_sums_x;
+ unsigned *col_sums_y;
+ uint32_t *im1;
+ uint32_t *im2;
+ double *ssim;
+ double c1;
+ int w;
+ int h;
+ int j0offs;
+ int j1offs;
+ int i;
+ int j;
+ double ssim_c1 = SSIM_C1;
+
+ if (bit_depth == 10) ssim_c1 = SSIM_C1_10;
+ if (bit_depth == 12) ssim_c1 = SSIM_C1_12;
+
+ w = _ctx->level[_l].w;
+ h = _ctx->level[_l].h;
+ col_sums_x = _ctx->col_buf;
+ col_sums_y = col_sums_x + w;
+ im1 = _ctx->level[_l].im1;
+ im2 = _ctx->level[_l].im2;
+ for (i = 0; i < w; i++) col_sums_x[i] = 5 * im1[i];
+ for (i = 0; i < w; i++) col_sums_y[i] = 5 * im2[i];
+ for (j = 1; j < 4; j++) {
+ j1offs = FS_MINI(j, h - 1) * w;
+ for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i];
+ for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i];
+ }
+ ssim = _ctx->level[_l].ssim;
+ c1 = (double)(ssim_c1 * 4096 * (1 << 4 * _l));
+ for (j = 0; j < h; j++) {
+ unsigned mux;
+ unsigned muy;
+ int i0;
+ int i1;
+ mux = 5 * col_sums_x[0];
+ muy = 5 * col_sums_y[0];
+ for (i = 1; i < 4; i++) {
+ i1 = FS_MINI(i, w - 1);
+ mux += col_sums_x[i1];
+ muy += col_sums_y[i1];
+ }
+ for (i = 0; i < w; i++) {
+ ssim[j * w + i] *= (2 * mux * (double)muy + c1) /
+ (mux * (double)mux + muy * (double)muy + c1);
+ if (i + 1 < w) {
+ i0 = FS_MAXI(0, i - 4);
+ i1 = FS_MINI(i + 4, w - 1);
+ mux += col_sums_x[i1] - col_sums_x[i0];
+ muy += col_sums_y[i1] - col_sums_y[i0];
+ }
+ }
+ if (j + 1 < h) {
+ j0offs = FS_MAXI(0, j - 4) * w;
+ for (i = 0; i < w; i++) col_sums_x[i] -= im1[j0offs + i];
+ for (i = 0; i < w; i++) col_sums_y[i] -= im2[j0offs + i];
+ j1offs = FS_MINI(j + 4, h - 1) * w;
+ for (i = 0; i < w; i++) col_sums_x[i] += im1[j1offs + i];
+ for (i = 0; i < w; i++) col_sums_y[i] += im2[j1offs + i];
+ }
+ }
+}
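[Review note, editorial: the multiply above is the standard SSIM luminance comparison applied to the windowed sums mux and muy, with c1 absorbing the window and level scaling (the 4096 * (1 << 4 * _l) factor):]

    \[ l(x, y) = \frac{2\mu_x\mu_y + C_1}{\mu_x^2 + \mu_y^2 + C_1} \]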
+
+#define FS_COL_SET(_col, _joffs, _ioffs) \
+ do { \
+ unsigned gx; \
+ unsigned gy; \
+ gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ col_sums_gx2[(_col)] = gx * (double)gx; \
+ col_sums_gy2[(_col)] = gy * (double)gy; \
+ col_sums_gxgy[(_col)] = gx * (double)gy; \
+ } while (0)
+
+#define FS_COL_ADD(_col, _joffs, _ioffs) \
+ do { \
+ unsigned gx; \
+ unsigned gy; \
+ gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ col_sums_gx2[(_col)] += gx * (double)gx; \
+ col_sums_gy2[(_col)] += gy * (double)gy; \
+ col_sums_gxgy[(_col)] += gx * (double)gy; \
+ } while (0)
+
+#define FS_COL_SUB(_col, _joffs, _ioffs) \
+ do { \
+ unsigned gx; \
+ unsigned gy; \
+ gx = gx_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ gy = gy_buf[((j + (_joffs)) & 7) * stride + i + (_ioffs)]; \
+ col_sums_gx2[(_col)] -= gx * (double)gx; \
+ col_sums_gy2[(_col)] -= gy * (double)gy; \
+ col_sums_gxgy[(_col)] -= gx * (double)gy; \
+ } while (0)
+
+#define FS_COL_COPY(_col1, _col2) \
+ do { \
+ col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)]; \
+ col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)]; \
+ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)]; \
+ } while (0)
+
+#define FS_COL_HALVE(_col1, _col2) \
+ do { \
+ col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 0.5; \
+ col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 0.5; \
+ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 0.5; \
+ } while (0)
+
+#define FS_COL_DOUBLE(_col1, _col2) \
+ do { \
+ col_sums_gx2[(_col1)] = col_sums_gx2[(_col2)] * 2; \
+ col_sums_gy2[(_col1)] = col_sums_gy2[(_col2)] * 2; \
+ col_sums_gxgy[(_col1)] = col_sums_gxgy[(_col2)] * 2; \
+ } while (0)
+
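+/*Computes the structure/contrast term of SSIM at level _l from the gradient
+ energies of the two images, using windowed sums maintained with the
+ FS_COL_* column-sum macros above.*/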
+static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) {
+ uint32_t *im1;
+ uint32_t *im2;
+ unsigned *gx_buf;
+ unsigned *gy_buf;
+ double *ssim;
+ double col_sums_gx2[8];
+ double col_sums_gy2[8];
+ double col_sums_gxgy[8];
+ double c2;
+ int stride;
+ int w;
+ int h;
+ int i;
+ int j;
+ double ssim_c2 = SSIM_C2;
+ if (bit_depth == 10) ssim_c2 = SSIM_C2_10;
+ if (bit_depth == 12) ssim_c2 = SSIM_C2_12;
+
+ w = _ctx->level[_l].w;
+ h = _ctx->level[_l].h;
+ im1 = _ctx->level[_l].im1;
+ im2 = _ctx->level[_l].im2;
+ ssim = _ctx->level[_l].ssim;
+ gx_buf = _ctx->col_buf;
+ stride = w + 8;
+ gy_buf = gx_buf + 8 * stride;
+ memset(gx_buf, 0, 2 * 8 * stride * sizeof(*gx_buf));
+ c2 = ssim_c2 * (1 << 4 * _l) * 16 * 104;
+ for (j = 0; j < h + 4; j++) {
+ if (j < h - 1) {
+ for (i = 0; i < w - 1; i++) {
+ unsigned g1;
+ unsigned g2;
+ unsigned gx;
+ unsigned gy;
+ g1 = abs((int)im1[(j + 1) * w + i + 1] - (int)im1[j * w + i]);
+ g2 = abs((int)im1[(j + 1) * w + i] - (int)im1[j * w + i + 1]);
+ gx = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2);
+ g1 = abs((int)im2[(j + 1) * w + i + 1] - (int)im2[j * w + i]);
+ g2 = abs((int)im2[(j + 1) * w + i] - (int)im2[j * w + i + 1]);
+ gy = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2);
+ gx_buf[(j & 7) * stride + i + 4] = gx;
+ gy_buf[(j & 7) * stride + i + 4] = gy;
+ }
+ } else {
+ memset(gx_buf + (j & 7) * stride, 0, stride * sizeof(*gx_buf));
+ memset(gy_buf + (j & 7) * stride, 0, stride * sizeof(*gy_buf));
+ }
+ if (j >= 4) {
+ int k;
+ col_sums_gx2[3] = col_sums_gx2[2] = col_sums_gx2[1] = col_sums_gx2[0] = 0;
+ col_sums_gy2[3] = col_sums_gy2[2] = col_sums_gy2[1] = col_sums_gy2[0] = 0;
+ col_sums_gxgy[3] = col_sums_gxgy[2] = col_sums_gxgy[1] =
+ col_sums_gxgy[0] = 0;
+ for (i = 4; i < 8; i++) {
+ FS_COL_SET(i, -1, 0);
+ FS_COL_ADD(i, 0, 0);
+ for (k = 1; k < 8 - i; k++) {
+ FS_COL_DOUBLE(i, i);
+ FS_COL_ADD(i, -k - 1, 0);
+ FS_COL_ADD(i, k, 0);
+ }
+ }
+ for (i = 0; i < w; i++) {
+ double mugx2;
+ double mugy2;
+ double mugxgy;
+ mugx2 = col_sums_gx2[0];
+ for (k = 1; k < 8; k++) mugx2 += col_sums_gx2[k];
+ mugy2 = col_sums_gy2[0];
+ for (k = 1; k < 8; k++) mugy2 += col_sums_gy2[k];
+ mugxgy = col_sums_gxgy[0];
+ for (k = 1; k < 8; k++) mugxgy += col_sums_gxgy[k];
+ ssim[(j - 4) * w + i] = (2 * mugxgy + c2) / (mugx2 + mugy2 + c2);
+ if (i + 1 < w) {
+ FS_COL_SET(0, -1, 1);
+ FS_COL_ADD(0, 0, 1);
+ FS_COL_SUB(2, -3, 2);
+ FS_COL_SUB(2, 2, 2);
+ FS_COL_HALVE(1, 2);
+ FS_COL_SUB(3, -4, 3);
+ FS_COL_SUB(3, 3, 3);
+ FS_COL_HALVE(2, 3);
+ FS_COL_COPY(3, 4);
+ FS_COL_DOUBLE(4, 5);
+ FS_COL_ADD(4, -4, 5);
+ FS_COL_ADD(4, 3, 5);
+ FS_COL_DOUBLE(5, 6);
+ FS_COL_ADD(5, -3, 6);
+ FS_COL_ADD(5, 2, 6);
+ FS_COL_DOUBLE(6, 7);
+ FS_COL_ADD(6, -2, 7);
+ FS_COL_ADD(6, 1, 7);
+ FS_COL_SET(7, -1, 8);
+ FS_COL_ADD(7, 0, 8);
+ }
+ }
+ }
+ }
+}
+
+#define FS_NLEVELS (4)
+
+/*These weights were derived from the default weights found in Wang's original
+ Matlab implementation: {0.0448, 0.2856, 0.3001, 0.2363, 0.1333}.
+ We drop the finest scale (0.0448) and renormalize the rest to sum to 1.*/
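+/*For example, renormalizing the second of the original weights gives
+ 0.2856 / (0.2856 + 0.3001 + 0.2363 + 0.1333) ~= 0.29896, which matches the
+ first entry below up to rounding; the remaining entries follow suit.*/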
+
+static const double FS_WEIGHTS[FS_NLEVELS] = {
+ 0.2989654541015625, 0.3141326904296875, 0.2473602294921875, 0.1395416259765625
+};
+
+static double fs_average(fs_ctx *_ctx, int _l) {
+ double *ssim;
+ double ret;
+ int w;
+ int h;
+ int i;
+ int j;
+ w = _ctx->level[_l].w;
+ h = _ctx->level[_l].h;
+ ssim = _ctx->level[_l].ssim;
+ ret = 0;
+ for (j = 0; j < h; j++)
+ for (i = 0; i < w; i++) ret += ssim[j * w + i];
+ return pow(ret / (w * h), FS_WEIGHTS[_l]);
+}
+
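+/*Converts an SSIM value into a dB figure relative to _weight; e.g. with
+ _ssim = 0.99 and _weight = 1.0 this gives 10 * log10(1 / 0.01) = 20 dB.*/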
+static double convert_ssim_db(double _ssim, double _weight) {
+ assert(_weight >= _ssim);
+ if ((_weight - _ssim) < 1e-10) return MAX_SSIM_DB;
+ return 10 * (log10(_weight) - log10(_weight - _ssim));
+}
+
+static double calc_ssim(const uint8_t *_src, int _systride, const uint8_t *_dst,
+ int _dystride, int _w, int _h, uint32_t _bd,
+ uint32_t _shift, int buf_is_hbd) {
+ fs_ctx ctx;
+ double ret;
+ int l;
+ ret = 1;
+ if (fs_ctx_init(&ctx, _w, _h, FS_NLEVELS)) return 99.0;
+ fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _shift,
+ buf_is_hbd);
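+ // The structure term is accumulated at every level; the luminance term is
+ // applied only at the coarsest level.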
+ for (l = 0; l < FS_NLEVELS - 1; l++) {
+ fs_calc_structure(&ctx, l, _bd);
+ ret *= fs_average(&ctx, l);
+ fs_downsample_level(&ctx, l + 1);
+ }
+ fs_calc_structure(&ctx, l, _bd);
+ fs_apply_luminance(&ctx, l, _bd);
+ ret *= fs_average(&ctx, l);
+ fs_ctx_clear(&ctx);
+ return ret;
+}
+
+double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *ssim_y,
+ double *ssim_u, double *ssim_v, uint32_t bd,
+ uint32_t in_bd) {
+ double ssimv;
+ uint32_t bd_shift = 0;
+ assert(bd >= in_bd);
+ assert(source->flags == dest->flags);
+ int buf_is_hbd = source->flags & YV12_FLAG_HIGHBITDEPTH;
+ bd_shift = bd - in_bd;
+
+ *ssim_y = calc_ssim(source->y_buffer, source->y_stride, dest->y_buffer,
+ dest->y_stride, source->y_crop_width,
+ source->y_crop_height, in_bd, bd_shift, buf_is_hbd);
+ *ssim_u = calc_ssim(source->u_buffer, source->uv_stride, dest->u_buffer,
+ dest->uv_stride, source->uv_crop_width,
+ source->uv_crop_height, in_bd, bd_shift, buf_is_hbd);
+ *ssim_v = calc_ssim(source->v_buffer, source->uv_stride, dest->v_buffer,
+ dest->uv_stride, source->uv_crop_width,
+ source->uv_crop_height, in_bd, bd_shift, buf_is_hbd);
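+ // Combine the per-plane scores with a fixed 0.8/0.1/0.1 Y/U/V weighting.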
+ ssimv = (*ssim_y) * .8 + .1 * ((*ssim_u) + (*ssim_v));
+ return convert_ssim_db(ssimv, 1.0);
+}
diff --git a/third_party/aom/aom_dsp/fft.c b/third_party/aom/aom_dsp/fft.c
new file mode 100644
index 0000000000..a44dbf77b1
--- /dev/null
+++ b/third_party/aom/aom_dsp/fft.c
@@ -0,0 +1,220 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/fft_common.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void simple_transpose(const float *A, float *B, int n) {
+ for (int y = 0; y < n; y++) {
+ for (int x = 0; x < n; x++) {
+ B[y * n + x] = A[x * n + y];
+ }
+ }
+}
+
+// The 1d transform is real to complex and packs the complex results in
+// a way to take advantage of conjugate symmetry (e.g., the n/2 + 1 real
+// components, followed by the n/2 - 1 imaginary components). After the
+// transform is done on the rows, the first n/2 + 1 columns are real, and
+// the remaining are the imaginary components. After the transform on the
+// columns, the region [0, n/2]x[0, n/2] contains the real part of the fft of
+// the real columns. The real part of the 2d fft also includes the imaginary
+// part of the transformed imaginary columns. This function assembles
+// the correct outputs while putting the real and imaginary components
+// next to each other.
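+// As an illustration, with n = 4 each transformed row arrives packed as
+// [r_0, r_1, r_2, i_1] (see aom_fft_1d_func_t in fft_common.h).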
+static INLINE void unpack_2d_output(const float *col_fft, float *output,
+ int n) {
+ for (int y = 0; y <= n / 2; ++y) {
+ const int y2 = y + n / 2;
+ const int y_extra = y2 > n / 2 && y2 < n;
+
+ for (int x = 0; x <= n / 2; ++x) {
+ const int x2 = x + n / 2;
+ const int x_extra = x2 > n / 2 && x2 < n;
+ output[2 * (y * n + x)] =
+ col_fft[y * n + x] - (x_extra && y_extra ? col_fft[y2 * n + x2] : 0);
+ output[2 * (y * n + x) + 1] = (y_extra ? col_fft[y2 * n + x] : 0) +
+ (x_extra ? col_fft[y * n + x2] : 0);
+ if (y_extra) {
+ output[2 * ((n - y) * n + x)] =
+ col_fft[y * n + x] +
+ (x_extra && y_extra ? col_fft[y2 * n + x2] : 0);
+ output[2 * ((n - y) * n + x) + 1] =
+ -(y_extra ? col_fft[y2 * n + x] : 0) +
+ (x_extra ? col_fft[y * n + x2] : 0);
+ }
+ }
+ }
+}
+
+void aom_fft_2d_gen(const float *input, float *temp, float *output, int n,
+ aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose,
+ aom_fft_unpack_func_t unpack, int vec_size) {
+ for (int x = 0; x < n; x += vec_size) {
+ tform(input + x, output + x, n);
+ }
+ transpose(output, temp, n);
+
+ for (int x = 0; x < n; x += vec_size) {
+ tform(temp + x, output + x, n);
+ }
+ transpose(output, temp, n);
+
+ unpack(temp, output, n);
+}
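+
+// A minimal usage sketch (illustrative only): for the 8x8 wrapper below, the
+// caller provides an 8*8 input, an 8*8 scratch buffer and a 2*8*8 output that
+// receives interleaved real/imaginary pairs:
+//   float in[64], tmp[64], out[128];
+//   aom_fft8x8_float_c(in, tmp, out);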
+
+static INLINE void store_float(float *output, float input) { *output = input; }
+static INLINE float add_float(float a, float b) { return a + b; }
+static INLINE float sub_float(float a, float b) { return a - b; }
+static INLINE float mul_float(float a, float b) { return a * b; }
+
+GEN_FFT_2(void, float, float, float, *, store_float)
+GEN_FFT_4(void, float, float, float, *, store_float, (float), add_float,
+ sub_float)
+GEN_FFT_8(void, float, float, float, *, store_float, (float), add_float,
+ sub_float, mul_float)
+GEN_FFT_16(void, float, float, float, *, store_float, (float), add_float,
+ sub_float, mul_float)
+GEN_FFT_32(void, float, float, float, *, store_float, (float), add_float,
+ sub_float, mul_float)
+
+void aom_fft2x2_float_c(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, simple_transpose,
+ unpack_2d_output, 1);
+}
+
+void aom_fft4x4_float_c(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, simple_transpose,
+ unpack_2d_output, 1);
+}
+
+void aom_fft8x8_float_c(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, simple_transpose,
+ unpack_2d_output, 1);
+}
+
+void aom_fft16x16_float_c(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_float, simple_transpose,
+ unpack_2d_output, 1);
+}
+
+void aom_fft32x32_float_c(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_float, simple_transpose,
+ unpack_2d_output, 1);
+}
+
+void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n,
+ aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi,
+ aom_fft_1d_func_t ifft_multi,
+ aom_fft_transpose_func_t transpose, int vec_size) {
+ // Column 0 and n/2 have conjugate symmetry, so we can directly do the ifft
+ // and get real outputs.
+ for (int y = 0; y <= n / 2; ++y) {
+ output[y * n] = input[2 * y * n];
+ output[y * n + 1] = input[2 * (y * n + n / 2)];
+ }
+ for (int y = n / 2 + 1; y < n; ++y) {
+ output[y * n] = input[2 * (y - n / 2) * n + 1];
+ output[y * n + 1] = input[2 * ((y - n / 2) * n + n / 2) + 1];
+ }
+
+ for (int i = 0; i < 2; i += vec_size) {
+ ifft_multi(output + i, temp + i, n);
+ }
+
+ // For the other columns, since we don't have a full ifft for complex
+ // inputs, we split them into their real and imaginary parts.
+ // Pack the real components, then the imaginary components.
+ for (int y = 0; y < n; ++y) {
+ for (int x = 1; x < n / 2; ++x) {
+ output[y * n + (x + 1)] = input[2 * (y * n + x)];
+ }
+ for (int x = 1; x < n / 2; ++x) {
+ output[y * n + (x + n / 2)] = input[2 * (y * n + x) + 1];
+ }
+ }
+ for (int y = 2; y < vec_size; y++) {
+ fft_single(output + y, temp + y, n);
+ }
+ // This is the part that can be sped up with SIMD
+ for (int y = AOMMAX(2, vec_size); y < n; y += vec_size) {
+ fft_multi(output + y, temp + y, n);
+ }
+
+ // Put the 0th and (n/2)th results in the correct place.
+ for (int x = 0; x < n; ++x) {
+ output[x] = temp[x * n];
+ output[(n / 2) * n + x] = temp[x * n + 1];
+ }
+ // This rearranges and transposes.
+ for (int y = 1; y < n / 2; ++y) {
+ // Fill in the real columns
+ for (int x = 0; x <= n / 2; ++x) {
+ output[x + y * n] =
+ temp[(y + 1) + x * n] +
+ ((x > 0 && x < n / 2) ? temp[(y + n / 2) + (x + n / 2) * n] : 0);
+ }
+ for (int x = n / 2 + 1; x < n; ++x) {
+ output[x + y * n] = temp[(y + 1) + (n - x) * n] -
+ temp[(y + n / 2) + ((n - x) + n / 2) * n];
+ }
+ // Fill in the imag columns
+ for (int x = 0; x <= n / 2; ++x) {
+ output[x + (y + n / 2) * n] =
+ temp[(y + n / 2) + x * n] -
+ ((x > 0 && x < n / 2) ? temp[(y + 1) + (x + n / 2) * n] : 0);
+ }
+ for (int x = n / 2 + 1; x < n; ++x) {
+ output[x + (y + n / 2) * n] = temp[(y + 1) + ((n - x) + n / 2) * n] +
+ temp[(y + n / 2) + (n - x) * n];
+ }
+ }
+ for (int y = 0; y < n; y += vec_size) {
+ ifft_multi(output + y, temp + y, n);
+ }
+ transpose(temp, output, n);
+}
+
+GEN_IFFT_2(void, float, float, float, *, store_float)
+GEN_IFFT_4(void, float, float, float, *, store_float, (float), add_float,
+ sub_float)
+GEN_IFFT_8(void, float, float, float, *, store_float, (float), add_float,
+ sub_float, mul_float)
+GEN_IFFT_16(void, float, float, float, *, store_float, (float), add_float,
+ sub_float, mul_float)
+GEN_IFFT_32(void, float, float, float, *, store_float, (float), add_float,
+ sub_float, mul_float)
+
+void aom_ifft2x2_float_c(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 2, aom_fft1d_2_float, aom_fft1d_2_float,
+ aom_ifft1d_2_float, simple_transpose, 1);
+}
+
+void aom_ifft4x4_float_c(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_float,
+ aom_ifft1d_4_float, simple_transpose, 1);
+}
+
+void aom_ifft8x8_float_c(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_float,
+ aom_ifft1d_8_float, simple_transpose, 1);
+}
+
+void aom_ifft16x16_float_c(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
+ aom_fft1d_16_float, aom_ifft1d_16_float, simple_transpose, 1);
+}
+
+void aom_ifft32x32_float_c(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
+ aom_fft1d_32_float, aom_ifft1d_32_float, simple_transpose, 1);
+}
diff --git a/third_party/aom/aom_dsp/fft_common.h b/third_party/aom/aom_dsp/fft_common.h
new file mode 100644
index 0000000000..3de1a045ee
--- /dev/null
+++ b/third_party/aom/aom_dsp/fft_common.h
@@ -0,0 +1,1056 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_FFT_COMMON_H_
+#define AOM_AOM_DSP_FFT_COMMON_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief A function pointer for computing 1d fft and ifft.
+ *
+ * The function will point to an implementation for a specific transform size,
+ * and may perform the transforms using vectorized instructions.
+ *
+ * For a non-vectorized forward transform of size n, the input and output
+ * buffers will be size n. The output takes advantage of conjugate symmetry and
+ * packs the results as: [r_0, r_1, ..., r_{n/2}, i_1, ..., i_{n/2-1}], where
+ * (r_{j}, i_{j}) is the complex output for index j.
+ *
+ * An inverse transform will assume that the complex "input" is packed
+ * similarly. Its output will be real.
+ *
+ * Non-vectorized transforms (e.g., on a single row) would use a stride of 1.
+ *
+ * Vectorized implementations are parallelized along the columns so that the fft
+ * can be performed on multiple columns at a time. In such cases the data block
+ * for input and output is typically square (n x n) and the stride will
+ * correspond to the spacing between rows. At minimum, the input size must be
+ * n x simd_vector_length.
+ *
+ * \param[in] input Input buffer. See above for size restrictions.
+ * \param[out] output Output buffer. See above for size restrictions.
+ * \param[in] stride The spacing in number of elements between rows
+ * (or elements)
+ */
+typedef void (*aom_fft_1d_func_t)(const float *input, float *output,
+ int stride);
+
+// Declare some of the forward non-vectorized transforms which are used in some
+// of the vectorized implementations
+void aom_fft1d_2_float(const float *input, float *output, int stride);
+void aom_fft1d_4_float(const float *input, float *output, int stride);
+void aom_fft1d_8_float(const float *input, float *output, int stride);
+void aom_fft1d_16_float(const float *input, float *output, int stride);
+void aom_fft1d_32_float(const float *input, float *output, int stride);
+void aom_ifft1d_2_float(const float *input, float *output, int stride);
+void aom_ifft1d_4_float(const float *input, float *output, int stride);
+void aom_ifft1d_8_float(const float *input, float *output, int stride);
+void aom_ifft1d_16_float(const float *input, float *output, int stride);
+void aom_ifft1d_32_float(const float *input, float *output, int stride);
+
+/*!\brief Function pointer for transposing a matrix of floats.
+ *
+ * \param[in] input Input buffer (size n x n)
+ * \param[out] output Output buffer (size n x n)
+ * \param[in] n Extent of one dimension of the square matrix.
+ */
+typedef void (*aom_fft_transpose_func_t)(const float *input, float *output,
+ int n);
+
+/*!\brief Function pointer for re-arranging intermediate 2d transform results.
+ *
+ * After re-arrangement, the real and imaginary components will be packed
+ * tightly next to each other.
+ *
+ * \param[in] input Input buffer (size n x n)
+ * \param[out] output Output buffer (size 2 x n x n)
+ * \param[in] n Extent of one dimension of the square matrix.
+ */
+typedef void (*aom_fft_unpack_func_t)(const float *input, float *output, int n);
+
+/*!\brief Performs a 2d fft with the given functions.
+ *
+ * This generator function allows for multiple different implementations of 2d
+ * fft with different vector operations, without having to redefine the main
+ * body multiple times.
+ *
+ * \param[in] input Input buffer to run the transform on (size n x n)
+ * \param[out] temp Working buffer for computing the transform (size n x n)
+ * \param[out] output Output buffer (size 2 x n x n)
+ * \param[in] n Extent of one dimension of the square (n x n) transform
+ * \param[in] tform Forward transform function
+ * \param[in] transpose Transpose function (for n x n matrix)
+ * \param[in] unpack Unpack function used to massage outputs to correct form
+ * \param[in] vec_size Vector size (the transform is done vec_size units at
+ * a time)
+ */
+void aom_fft_2d_gen(const float *input, float *temp, float *output, int n,
+ aom_fft_1d_func_t tform, aom_fft_transpose_func_t transpose,
+ aom_fft_unpack_func_t unpack, int vec_size);
+
+/*!\brief Perform a 2d inverse fft with the given helper functions
+ *
+ * \param[in] input Input buffer to run the transform on (size 2 x n x n)
+ * \param[out] temp Working buffer for computations (size 2 x n x n)
+ * \param[out] output Output buffer (size n x n)
+ * \param[in] n Extent of one dimension of the square (n x n) transform
+ * \param[in] fft_single Forward transform function (non vectorized)
+ * \param[in] fft_multi Forward transform function (vectorized)
+ * \param[in] ifft_multi Inverse transform function (vectorized)
+ * \param[in] transpose Transpose function (for n x n matrix)
+ * \param[in] vec_size Vector size (the transform is done vec_size
+ * units at a time)
+ */
+void aom_ifft_2d_gen(const float *input, float *temp, float *output, int n,
+ aom_fft_1d_func_t fft_single, aom_fft_1d_func_t fft_multi,
+ aom_fft_1d_func_t ifft_multi,
+ aom_fft_transpose_func_t transpose, int vec_size);
+#ifdef __cplusplus
+}
+#endif
+
+// The macros below define 1D fft/ifft for different data types and for
+// different simd vector intrinsic types.
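+// For example, fft.c instantiates them for plain floats with vec_size 1, while
+// SIMD backends instantiate them with a vector type and matching intrinsic
+// load/store/set1/add/sub/mul wrappers, along the lines of (placeholder names,
+// not real intrinsics):
+//   GEN_FFT_8(void, my_vec, float, my_vec_t, my_load, my_store, my_set1,
+//             my_add, my_sub, my_mul)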
+
+#define GEN_FFT_2(ret, suffix, T, T_VEC, load, store) \
+ ret aom_fft1d_2_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ store(output + 0 * stride, i0 + i1); \
+ store(output + 1 * stride, i0 - i1); \
+ }
+
+#define GEN_FFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \
+ ret aom_fft1d_4_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC w0 = add(i0, i2); \
+ const T_VEC w1 = sub(i0, i2); \
+ const T_VEC w2 = add(i1, i3); \
+ const T_VEC w3 = sub(i1, i3); \
+ store(output + 0 * stride, add(w0, w2)); \
+ store(output + 1 * stride, w1); \
+ store(output + 2 * stride, sub(w0, w2)); \
+ store(output + 3 * stride, sub(kWeight0, w3)); \
+ }
+
+#define GEN_FFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, mul) \
+ ret aom_fft1d_8_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC kWeight2 = constant(0.707107f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC i4 = load(input + 4 * stride); \
+ const T_VEC i5 = load(input + 5 * stride); \
+ const T_VEC i6 = load(input + 6 * stride); \
+ const T_VEC i7 = load(input + 7 * stride); \
+ const T_VEC w0 = add(i0, i4); \
+ const T_VEC w1 = sub(i0, i4); \
+ const T_VEC w2 = add(i2, i6); \
+ const T_VEC w3 = sub(i2, i6); \
+ const T_VEC w4 = add(w0, w2); \
+ const T_VEC w5 = sub(w0, w2); \
+ const T_VEC w7 = add(i1, i5); \
+ const T_VEC w8 = sub(i1, i5); \
+ const T_VEC w9 = add(i3, i7); \
+ const T_VEC w10 = sub(i3, i7); \
+ const T_VEC w11 = add(w7, w9); \
+ const T_VEC w12 = sub(w7, w9); \
+ store(output + 0 * stride, add(w4, w11)); \
+ store(output + 1 * stride, add(w1, mul(kWeight2, sub(w8, w10)))); \
+ store(output + 2 * stride, w5); \
+ store(output + 3 * stride, sub(w1, mul(kWeight2, sub(w8, w10)))); \
+ store(output + 4 * stride, sub(w4, w11)); \
+ store(output + 5 * stride, \
+ sub(sub(kWeight0, w3), mul(kWeight2, add(w10, w8)))); \
+ store(output + 6 * stride, sub(kWeight0, w12)); \
+ store(output + 7 * stride, sub(w3, mul(kWeight2, add(w10, w8)))); \
+ }
+
+#define GEN_FFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
+ mul) \
+ ret aom_fft1d_16_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC kWeight2 = constant(0.707107f); \
+ const T_VEC kWeight3 = constant(0.92388f); \
+ const T_VEC kWeight4 = constant(0.382683f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC i4 = load(input + 4 * stride); \
+ const T_VEC i5 = load(input + 5 * stride); \
+ const T_VEC i6 = load(input + 6 * stride); \
+ const T_VEC i7 = load(input + 7 * stride); \
+ const T_VEC i8 = load(input + 8 * stride); \
+ const T_VEC i9 = load(input + 9 * stride); \
+ const T_VEC i10 = load(input + 10 * stride); \
+ const T_VEC i11 = load(input + 11 * stride); \
+ const T_VEC i12 = load(input + 12 * stride); \
+ const T_VEC i13 = load(input + 13 * stride); \
+ const T_VEC i14 = load(input + 14 * stride); \
+ const T_VEC i15 = load(input + 15 * stride); \
+ const T_VEC w0 = add(i0, i8); \
+ const T_VEC w1 = sub(i0, i8); \
+ const T_VEC w2 = add(i4, i12); \
+ const T_VEC w3 = sub(i4, i12); \
+ const T_VEC w4 = add(w0, w2); \
+ const T_VEC w5 = sub(w0, w2); \
+ const T_VEC w7 = add(i2, i10); \
+ const T_VEC w8 = sub(i2, i10); \
+ const T_VEC w9 = add(i6, i14); \
+ const T_VEC w10 = sub(i6, i14); \
+ const T_VEC w11 = add(w7, w9); \
+ const T_VEC w12 = sub(w7, w9); \
+ const T_VEC w14 = add(w4, w11); \
+ const T_VEC w15 = sub(w4, w11); \
+ const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))), \
+ sub(sub(kWeight0, w3), \
+ mul(kWeight2, add(w10, w8))) }; \
+ const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))), \
+ sub(w3, mul(kWeight2, add(w10, w8))) }; \
+ const T_VEC w19 = add(i1, i9); \
+ const T_VEC w20 = sub(i1, i9); \
+ const T_VEC w21 = add(i5, i13); \
+ const T_VEC w22 = sub(i5, i13); \
+ const T_VEC w23 = add(w19, w21); \
+ const T_VEC w24 = sub(w19, w21); \
+ const T_VEC w26 = add(i3, i11); \
+ const T_VEC w27 = sub(i3, i11); \
+ const T_VEC w28 = add(i7, i15); \
+ const T_VEC w29 = sub(i7, i15); \
+ const T_VEC w30 = add(w26, w28); \
+ const T_VEC w31 = sub(w26, w28); \
+ const T_VEC w33 = add(w23, w30); \
+ const T_VEC w34 = sub(w23, w30); \
+ const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))), \
+ sub(sub(kWeight0, w22), \
+ mul(kWeight2, add(w29, w27))) }; \
+ const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))), \
+ sub(w22, mul(kWeight2, add(w29, w27))) }; \
+ store(output + 0 * stride, add(w14, w33)); \
+ store(output + 1 * stride, \
+ add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1])))); \
+ store(output + 2 * stride, add(w5, mul(kWeight2, sub(w24, w31)))); \
+ store(output + 3 * stride, \
+ add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1])))); \
+ store(output + 4 * stride, w15); \
+ store(output + 5 * stride, \
+ add(w18[0], sub(sub(kWeight0, mul(kWeight4, w37[0])), \
+ mul(kWeight3, w37[1])))); \
+ store(output + 6 * stride, sub(w5, mul(kWeight2, sub(w24, w31)))); \
+ store(output + 7 * stride, \
+ add(w16[0], sub(sub(kWeight0, mul(kWeight3, w35[0])), \
+ mul(kWeight4, w35[1])))); \
+ store(output + 8 * stride, sub(w14, w33)); \
+ store(output + 9 * stride, \
+ add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0])))); \
+ store(output + 10 * stride, \
+ sub(sub(kWeight0, w12), mul(kWeight2, add(w31, w24)))); \
+ store(output + 11 * stride, \
+ add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0])))); \
+ store(output + 12 * stride, sub(kWeight0, w34)); \
+ store(output + 13 * stride, \
+ sub(sub(kWeight0, w18[1]), \
+ sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1])))); \
+ store(output + 14 * stride, sub(w12, mul(kWeight2, add(w31, w24)))); \
+ store(output + 15 * stride, \
+ sub(sub(kWeight0, w16[1]), \
+ sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1])))); \
+ }
+
+#define GEN_FFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
+ mul) \
+ ret aom_fft1d_32_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC kWeight2 = constant(0.707107f); \
+ const T_VEC kWeight3 = constant(0.92388f); \
+ const T_VEC kWeight4 = constant(0.382683f); \
+ const T_VEC kWeight5 = constant(0.980785f); \
+ const T_VEC kWeight6 = constant(0.19509f); \
+ const T_VEC kWeight7 = constant(0.83147f); \
+ const T_VEC kWeight8 = constant(0.55557f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC i4 = load(input + 4 * stride); \
+ const T_VEC i5 = load(input + 5 * stride); \
+ const T_VEC i6 = load(input + 6 * stride); \
+ const T_VEC i7 = load(input + 7 * stride); \
+ const T_VEC i8 = load(input + 8 * stride); \
+ const T_VEC i9 = load(input + 9 * stride); \
+ const T_VEC i10 = load(input + 10 * stride); \
+ const T_VEC i11 = load(input + 11 * stride); \
+ const T_VEC i12 = load(input + 12 * stride); \
+ const T_VEC i13 = load(input + 13 * stride); \
+ const T_VEC i14 = load(input + 14 * stride); \
+ const T_VEC i15 = load(input + 15 * stride); \
+ const T_VEC i16 = load(input + 16 * stride); \
+ const T_VEC i17 = load(input + 17 * stride); \
+ const T_VEC i18 = load(input + 18 * stride); \
+ const T_VEC i19 = load(input + 19 * stride); \
+ const T_VEC i20 = load(input + 20 * stride); \
+ const T_VEC i21 = load(input + 21 * stride); \
+ const T_VEC i22 = load(input + 22 * stride); \
+ const T_VEC i23 = load(input + 23 * stride); \
+ const T_VEC i24 = load(input + 24 * stride); \
+ const T_VEC i25 = load(input + 25 * stride); \
+ const T_VEC i26 = load(input + 26 * stride); \
+ const T_VEC i27 = load(input + 27 * stride); \
+ const T_VEC i28 = load(input + 28 * stride); \
+ const T_VEC i29 = load(input + 29 * stride); \
+ const T_VEC i30 = load(input + 30 * stride); \
+ const T_VEC i31 = load(input + 31 * stride); \
+ const T_VEC w0 = add(i0, i16); \
+ const T_VEC w1 = sub(i0, i16); \
+ const T_VEC w2 = add(i8, i24); \
+ const T_VEC w3 = sub(i8, i24); \
+ const T_VEC w4 = add(w0, w2); \
+ const T_VEC w5 = sub(w0, w2); \
+ const T_VEC w7 = add(i4, i20); \
+ const T_VEC w8 = sub(i4, i20); \
+ const T_VEC w9 = add(i12, i28); \
+ const T_VEC w10 = sub(i12, i28); \
+ const T_VEC w11 = add(w7, w9); \
+ const T_VEC w12 = sub(w7, w9); \
+ const T_VEC w14 = add(w4, w11); \
+ const T_VEC w15 = sub(w4, w11); \
+ const T_VEC w16[2] = { add(w1, mul(kWeight2, sub(w8, w10))), \
+ sub(sub(kWeight0, w3), \
+ mul(kWeight2, add(w10, w8))) }; \
+ const T_VEC w18[2] = { sub(w1, mul(kWeight2, sub(w8, w10))), \
+ sub(w3, mul(kWeight2, add(w10, w8))) }; \
+ const T_VEC w19 = add(i2, i18); \
+ const T_VEC w20 = sub(i2, i18); \
+ const T_VEC w21 = add(i10, i26); \
+ const T_VEC w22 = sub(i10, i26); \
+ const T_VEC w23 = add(w19, w21); \
+ const T_VEC w24 = sub(w19, w21); \
+ const T_VEC w26 = add(i6, i22); \
+ const T_VEC w27 = sub(i6, i22); \
+ const T_VEC w28 = add(i14, i30); \
+ const T_VEC w29 = sub(i14, i30); \
+ const T_VEC w30 = add(w26, w28); \
+ const T_VEC w31 = sub(w26, w28); \
+ const T_VEC w33 = add(w23, w30); \
+ const T_VEC w34 = sub(w23, w30); \
+ const T_VEC w35[2] = { add(w20, mul(kWeight2, sub(w27, w29))), \
+ sub(sub(kWeight0, w22), \
+ mul(kWeight2, add(w29, w27))) }; \
+ const T_VEC w37[2] = { sub(w20, mul(kWeight2, sub(w27, w29))), \
+ sub(w22, mul(kWeight2, add(w29, w27))) }; \
+ const T_VEC w38 = add(w14, w33); \
+ const T_VEC w39 = sub(w14, w33); \
+ const T_VEC w40[2] = { \
+ add(w16[0], add(mul(kWeight3, w35[0]), mul(kWeight4, w35[1]))), \
+ add(w16[1], sub(mul(kWeight3, w35[1]), mul(kWeight4, w35[0]))) \
+ }; \
+ const T_VEC w41[2] = { add(w5, mul(kWeight2, sub(w24, w31))), \
+ sub(sub(kWeight0, w12), \
+ mul(kWeight2, add(w31, w24))) }; \
+ const T_VEC w42[2] = { \
+ add(w18[0], add(mul(kWeight4, w37[0]), mul(kWeight3, w37[1]))), \
+ add(w18[1], sub(mul(kWeight4, w37[1]), mul(kWeight3, w37[0]))) \
+ }; \
+ const T_VEC w44[2] = { \
+ add(w18[0], \
+ sub(sub(kWeight0, mul(kWeight4, w37[0])), mul(kWeight3, w37[1]))), \
+ sub(sub(kWeight0, w18[1]), \
+ sub(mul(kWeight3, w37[0]), mul(kWeight4, w37[1]))) \
+ }; \
+ const T_VEC w45[2] = { sub(w5, mul(kWeight2, sub(w24, w31))), \
+ sub(w12, mul(kWeight2, add(w31, w24))) }; \
+ const T_VEC w46[2] = { \
+ add(w16[0], \
+ sub(sub(kWeight0, mul(kWeight3, w35[0])), mul(kWeight4, w35[1]))), \
+ sub(sub(kWeight0, w16[1]), \
+ sub(mul(kWeight4, w35[0]), mul(kWeight3, w35[1]))) \
+ }; \
+ const T_VEC w47 = add(i1, i17); \
+ const T_VEC w48 = sub(i1, i17); \
+ const T_VEC w49 = add(i9, i25); \
+ const T_VEC w50 = sub(i9, i25); \
+ const T_VEC w51 = add(w47, w49); \
+ const T_VEC w52 = sub(w47, w49); \
+ const T_VEC w54 = add(i5, i21); \
+ const T_VEC w55 = sub(i5, i21); \
+ const T_VEC w56 = add(i13, i29); \
+ const T_VEC w57 = sub(i13, i29); \
+ const T_VEC w58 = add(w54, w56); \
+ const T_VEC w59 = sub(w54, w56); \
+ const T_VEC w61 = add(w51, w58); \
+ const T_VEC w62 = sub(w51, w58); \
+ const T_VEC w63[2] = { add(w48, mul(kWeight2, sub(w55, w57))), \
+ sub(sub(kWeight0, w50), \
+ mul(kWeight2, add(w57, w55))) }; \
+ const T_VEC w65[2] = { sub(w48, mul(kWeight2, sub(w55, w57))), \
+ sub(w50, mul(kWeight2, add(w57, w55))) }; \
+ const T_VEC w66 = add(i3, i19); \
+ const T_VEC w67 = sub(i3, i19); \
+ const T_VEC w68 = add(i11, i27); \
+ const T_VEC w69 = sub(i11, i27); \
+ const T_VEC w70 = add(w66, w68); \
+ const T_VEC w71 = sub(w66, w68); \
+ const T_VEC w73 = add(i7, i23); \
+ const T_VEC w74 = sub(i7, i23); \
+ const T_VEC w75 = add(i15, i31); \
+ const T_VEC w76 = sub(i15, i31); \
+ const T_VEC w77 = add(w73, w75); \
+ const T_VEC w78 = sub(w73, w75); \
+ const T_VEC w80 = add(w70, w77); \
+ const T_VEC w81 = sub(w70, w77); \
+ const T_VEC w82[2] = { add(w67, mul(kWeight2, sub(w74, w76))), \
+ sub(sub(kWeight0, w69), \
+ mul(kWeight2, add(w76, w74))) }; \
+ const T_VEC w84[2] = { sub(w67, mul(kWeight2, sub(w74, w76))), \
+ sub(w69, mul(kWeight2, add(w76, w74))) }; \
+ const T_VEC w85 = add(w61, w80); \
+ const T_VEC w86 = sub(w61, w80); \
+ const T_VEC w87[2] = { \
+ add(w63[0], add(mul(kWeight3, w82[0]), mul(kWeight4, w82[1]))), \
+ add(w63[1], sub(mul(kWeight3, w82[1]), mul(kWeight4, w82[0]))) \
+ }; \
+ const T_VEC w88[2] = { add(w52, mul(kWeight2, sub(w71, w78))), \
+ sub(sub(kWeight0, w59), \
+ mul(kWeight2, add(w78, w71))) }; \
+ const T_VEC w89[2] = { \
+ add(w65[0], add(mul(kWeight4, w84[0]), mul(kWeight3, w84[1]))), \
+ add(w65[1], sub(mul(kWeight4, w84[1]), mul(kWeight3, w84[0]))) \
+ }; \
+ const T_VEC w91[2] = { \
+ add(w65[0], \
+ sub(sub(kWeight0, mul(kWeight4, w84[0])), mul(kWeight3, w84[1]))), \
+ sub(sub(kWeight0, w65[1]), \
+ sub(mul(kWeight3, w84[0]), mul(kWeight4, w84[1]))) \
+ }; \
+ const T_VEC w92[2] = { sub(w52, mul(kWeight2, sub(w71, w78))), \
+ sub(w59, mul(kWeight2, add(w78, w71))) }; \
+ const T_VEC w93[2] = { \
+ add(w63[0], \
+ sub(sub(kWeight0, mul(kWeight3, w82[0])), mul(kWeight4, w82[1]))), \
+ sub(sub(kWeight0, w63[1]), \
+ sub(mul(kWeight4, w82[0]), mul(kWeight3, w82[1]))) \
+ }; \
+ store(output + 0 * stride, add(w38, w85)); \
+ store(output + 1 * stride, \
+ add(w40[0], add(mul(kWeight5, w87[0]), mul(kWeight6, w87[1])))); \
+ store(output + 2 * stride, \
+ add(w41[0], add(mul(kWeight3, w88[0]), mul(kWeight4, w88[1])))); \
+ store(output + 3 * stride, \
+ add(w42[0], add(mul(kWeight7, w89[0]), mul(kWeight8, w89[1])))); \
+ store(output + 4 * stride, add(w15, mul(kWeight2, sub(w62, w81)))); \
+ store(output + 5 * stride, \
+ add(w44[0], add(mul(kWeight8, w91[0]), mul(kWeight7, w91[1])))); \
+ store(output + 6 * stride, \
+ add(w45[0], add(mul(kWeight4, w92[0]), mul(kWeight3, w92[1])))); \
+ store(output + 7 * stride, \
+ add(w46[0], add(mul(kWeight6, w93[0]), mul(kWeight5, w93[1])))); \
+ store(output + 8 * stride, w39); \
+ store(output + 9 * stride, \
+ add(w46[0], sub(sub(kWeight0, mul(kWeight6, w93[0])), \
+ mul(kWeight5, w93[1])))); \
+ store(output + 10 * stride, \
+ add(w45[0], sub(sub(kWeight0, mul(kWeight4, w92[0])), \
+ mul(kWeight3, w92[1])))); \
+ store(output + 11 * stride, \
+ add(w44[0], sub(sub(kWeight0, mul(kWeight8, w91[0])), \
+ mul(kWeight7, w91[1])))); \
+ store(output + 12 * stride, sub(w15, mul(kWeight2, sub(w62, w81)))); \
+ store(output + 13 * stride, \
+ add(w42[0], sub(sub(kWeight0, mul(kWeight7, w89[0])), \
+ mul(kWeight8, w89[1])))); \
+ store(output + 14 * stride, \
+ add(w41[0], sub(sub(kWeight0, mul(kWeight3, w88[0])), \
+ mul(kWeight4, w88[1])))); \
+ store(output + 15 * stride, \
+ add(w40[0], sub(sub(kWeight0, mul(kWeight5, w87[0])), \
+ mul(kWeight6, w87[1])))); \
+ store(output + 16 * stride, sub(w38, w85)); \
+ store(output + 17 * stride, \
+ add(w40[1], sub(mul(kWeight5, w87[1]), mul(kWeight6, w87[0])))); \
+ store(output + 18 * stride, \
+ add(w41[1], sub(mul(kWeight3, w88[1]), mul(kWeight4, w88[0])))); \
+ store(output + 19 * stride, \
+ add(w42[1], sub(mul(kWeight7, w89[1]), mul(kWeight8, w89[0])))); \
+ store(output + 20 * stride, \
+ sub(sub(kWeight0, w34), mul(kWeight2, add(w81, w62)))); \
+ store(output + 21 * stride, \
+ add(w44[1], sub(mul(kWeight8, w91[1]), mul(kWeight7, w91[0])))); \
+ store(output + 22 * stride, \
+ add(w45[1], sub(mul(kWeight4, w92[1]), mul(kWeight3, w92[0])))); \
+ store(output + 23 * stride, \
+ add(w46[1], sub(mul(kWeight6, w93[1]), mul(kWeight5, w93[0])))); \
+ store(output + 24 * stride, sub(kWeight0, w86)); \
+ store(output + 25 * stride, \
+ sub(sub(kWeight0, w46[1]), \
+ sub(mul(kWeight5, w93[0]), mul(kWeight6, w93[1])))); \
+ store(output + 26 * stride, \
+ sub(sub(kWeight0, w45[1]), \
+ sub(mul(kWeight3, w92[0]), mul(kWeight4, w92[1])))); \
+ store(output + 27 * stride, \
+ sub(sub(kWeight0, w44[1]), \
+ sub(mul(kWeight7, w91[0]), mul(kWeight8, w91[1])))); \
+ store(output + 28 * stride, sub(w34, mul(kWeight2, add(w81, w62)))); \
+ store(output + 29 * stride, \
+ sub(sub(kWeight0, w42[1]), \
+ sub(mul(kWeight8, w89[0]), mul(kWeight7, w89[1])))); \
+ store(output + 30 * stride, \
+ sub(sub(kWeight0, w41[1]), \
+ sub(mul(kWeight4, w88[0]), mul(kWeight3, w88[1])))); \
+ store(output + 31 * stride, \
+ sub(sub(kWeight0, w40[1]), \
+ sub(mul(kWeight6, w87[0]), mul(kWeight5, w87[1])))); \
+ }
+
+#define GEN_IFFT_2(ret, suffix, T, T_VEC, load, store) \
+ ret aom_ifft1d_2_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ store(output + 0 * stride, i0 + i1); \
+ store(output + 1 * stride, i0 - i1); \
+ }
+
+#define GEN_IFFT_4(ret, suffix, T, T_VEC, load, store, constant, add, sub) \
+ ret aom_ifft1d_4_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC w2 = add(i0, i2); \
+ const T_VEC w3 = sub(i0, i2); \
+ const T_VEC w4[2] = { add(i1, i1), sub(i3, i3) }; \
+ const T_VEC w5[2] = { sub(i1, i1), sub(sub(kWeight0, i3), i3) }; \
+ store(output + 0 * stride, add(w2, w4[0])); \
+ store(output + 1 * stride, add(w3, w5[1])); \
+ store(output + 2 * stride, sub(w2, w4[0])); \
+ store(output + 3 * stride, sub(w3, w5[1])); \
+ }
+
+#define GEN_IFFT_8(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
+ mul) \
+ ret aom_ifft1d_8_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC kWeight2 = constant(0.707107f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC i4 = load(input + 4 * stride); \
+ const T_VEC i5 = load(input + 5 * stride); \
+ const T_VEC i6 = load(input + 6 * stride); \
+ const T_VEC i7 = load(input + 7 * stride); \
+ const T_VEC w6 = add(i0, i4); \
+ const T_VEC w7 = sub(i0, i4); \
+ const T_VEC w8[2] = { add(i2, i2), sub(i6, i6) }; \
+ const T_VEC w9[2] = { sub(i2, i2), sub(sub(kWeight0, i6), i6) }; \
+ const T_VEC w10[2] = { add(w6, w8[0]), w8[1] }; \
+ const T_VEC w11[2] = { sub(w6, w8[0]), sub(kWeight0, w8[1]) }; \
+ const T_VEC w12[2] = { add(w7, w9[1]), sub(kWeight0, w9[0]) }; \
+ const T_VEC w13[2] = { sub(w7, w9[1]), w9[0] }; \
+ const T_VEC w14[2] = { add(i1, i3), sub(i7, i5) }; \
+ const T_VEC w15[2] = { sub(i1, i3), sub(sub(kWeight0, i5), i7) }; \
+ const T_VEC w16[2] = { add(i3, i1), sub(i5, i7) }; \
+ const T_VEC w17[2] = { sub(i3, i1), sub(sub(kWeight0, i7), i5) }; \
+ const T_VEC w18[2] = { add(w14[0], w16[0]), add(w14[1], w16[1]) }; \
+ const T_VEC w19[2] = { sub(w14[0], w16[0]), sub(w14[1], w16[1]) }; \
+ const T_VEC w20[2] = { add(w15[0], w17[1]), sub(w15[1], w17[0]) }; \
+ const T_VEC w21[2] = { sub(w15[0], w17[1]), add(w15[1], w17[0]) }; \
+ store(output + 0 * stride, add(w10[0], w18[0])); \
+ store(output + 1 * stride, \
+ add(w12[0], mul(kWeight2, add(w20[0], w20[1])))); \
+ store(output + 2 * stride, add(w11[0], w19[1])); \
+ store(output + 3 * stride, \
+ sub(w13[0], mul(kWeight2, sub(w21[0], w21[1])))); \
+ store(output + 4 * stride, sub(w10[0], w18[0])); \
+ store(output + 5 * stride, \
+ add(w12[0], sub(sub(kWeight0, mul(kWeight2, w20[0])), \
+ mul(kWeight2, w20[1])))); \
+ store(output + 6 * stride, sub(w11[0], w19[1])); \
+ store(output + 7 * stride, \
+ add(w13[0], mul(kWeight2, sub(w21[0], w21[1])))); \
+ }
+
+#define GEN_IFFT_16(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
+ mul) \
+ ret aom_ifft1d_16_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC kWeight2 = constant(0.707107f); \
+ const T_VEC kWeight3 = constant(0.92388f); \
+ const T_VEC kWeight4 = constant(0.382683f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC i4 = load(input + 4 * stride); \
+ const T_VEC i5 = load(input + 5 * stride); \
+ const T_VEC i6 = load(input + 6 * stride); \
+ const T_VEC i7 = load(input + 7 * stride); \
+ const T_VEC i8 = load(input + 8 * stride); \
+ const T_VEC i9 = load(input + 9 * stride); \
+ const T_VEC i10 = load(input + 10 * stride); \
+ const T_VEC i11 = load(input + 11 * stride); \
+ const T_VEC i12 = load(input + 12 * stride); \
+ const T_VEC i13 = load(input + 13 * stride); \
+ const T_VEC i14 = load(input + 14 * stride); \
+ const T_VEC i15 = load(input + 15 * stride); \
+ const T_VEC w14 = add(i0, i8); \
+ const T_VEC w15 = sub(i0, i8); \
+ const T_VEC w16[2] = { add(i4, i4), sub(i12, i12) }; \
+ const T_VEC w17[2] = { sub(i4, i4), sub(sub(kWeight0, i12), i12) }; \
+ const T_VEC w18[2] = { add(w14, w16[0]), w16[1] }; \
+ const T_VEC w19[2] = { sub(w14, w16[0]), sub(kWeight0, w16[1]) }; \
+ const T_VEC w20[2] = { add(w15, w17[1]), sub(kWeight0, w17[0]) }; \
+ const T_VEC w21[2] = { sub(w15, w17[1]), w17[0] }; \
+ const T_VEC w22[2] = { add(i2, i6), sub(i14, i10) }; \
+ const T_VEC w23[2] = { sub(i2, i6), sub(sub(kWeight0, i10), i14) }; \
+ const T_VEC w24[2] = { add(i6, i2), sub(i10, i14) }; \
+ const T_VEC w25[2] = { sub(i6, i2), sub(sub(kWeight0, i14), i10) }; \
+ const T_VEC w26[2] = { add(w22[0], w24[0]), add(w22[1], w24[1]) }; \
+ const T_VEC w27[2] = { sub(w22[0], w24[0]), sub(w22[1], w24[1]) }; \
+ const T_VEC w28[2] = { add(w23[0], w25[1]), sub(w23[1], w25[0]) }; \
+ const T_VEC w29[2] = { sub(w23[0], w25[1]), add(w23[1], w25[0]) }; \
+ const T_VEC w30[2] = { add(w18[0], w26[0]), add(w18[1], w26[1]) }; \
+ const T_VEC w31[2] = { sub(w18[0], w26[0]), sub(w18[1], w26[1]) }; \
+ const T_VEC w32[2] = { add(w20[0], mul(kWeight2, add(w28[0], w28[1]))), \
+ add(w20[1], mul(kWeight2, sub(w28[1], w28[0]))) }; \
+ const T_VEC w33[2] = { add(w20[0], \
+ sub(sub(kWeight0, mul(kWeight2, w28[0])), \
+ mul(kWeight2, w28[1]))), \
+ add(w20[1], mul(kWeight2, sub(w28[0], w28[1]))) }; \
+ const T_VEC w34[2] = { add(w19[0], w27[1]), sub(w19[1], w27[0]) }; \
+ const T_VEC w35[2] = { sub(w19[0], w27[1]), add(w19[1], w27[0]) }; \
+ const T_VEC w36[2] = { sub(w21[0], mul(kWeight2, sub(w29[0], w29[1]))), \
+ sub(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \
+ const T_VEC w37[2] = { add(w21[0], mul(kWeight2, sub(w29[0], w29[1]))), \
+ add(w21[1], mul(kWeight2, add(w29[1], w29[0]))) }; \
+ const T_VEC w38[2] = { add(i1, i7), sub(i15, i9) }; \
+ const T_VEC w39[2] = { sub(i1, i7), sub(sub(kWeight0, i9), i15) }; \
+ const T_VEC w40[2] = { add(i5, i3), sub(i11, i13) }; \
+ const T_VEC w41[2] = { sub(i5, i3), sub(sub(kWeight0, i13), i11) }; \
+ const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) }; \
+ const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) }; \
+ const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) }; \
+ const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) }; \
+ const T_VEC w46[2] = { add(i3, i5), sub(i13, i11) }; \
+ const T_VEC w47[2] = { sub(i3, i5), sub(sub(kWeight0, i11), i13) }; \
+ const T_VEC w48[2] = { add(i7, i1), sub(i9, i15) }; \
+ const T_VEC w49[2] = { sub(i7, i1), sub(sub(kWeight0, i15), i9) }; \
+ const T_VEC w50[2] = { add(w46[0], w48[0]), add(w46[1], w48[1]) }; \
+ const T_VEC w51[2] = { sub(w46[0], w48[0]), sub(w46[1], w48[1]) }; \
+ const T_VEC w52[2] = { add(w47[0], w49[1]), sub(w47[1], w49[0]) }; \
+ const T_VEC w53[2] = { sub(w47[0], w49[1]), add(w47[1], w49[0]) }; \
+ const T_VEC w54[2] = { add(w42[0], w50[0]), add(w42[1], w50[1]) }; \
+ const T_VEC w55[2] = { sub(w42[0], w50[0]), sub(w42[1], w50[1]) }; \
+ const T_VEC w56[2] = { add(w44[0], mul(kWeight2, add(w52[0], w52[1]))), \
+ add(w44[1], mul(kWeight2, sub(w52[1], w52[0]))) }; \
+ const T_VEC w57[2] = { add(w44[0], \
+ sub(sub(kWeight0, mul(kWeight2, w52[0])), \
+ mul(kWeight2, w52[1]))), \
+ add(w44[1], mul(kWeight2, sub(w52[0], w52[1]))) }; \
+ const T_VEC w58[2] = { add(w43[0], w51[1]), sub(w43[1], w51[0]) }; \
+ const T_VEC w59[2] = { sub(w43[0], w51[1]), add(w43[1], w51[0]) }; \
+ const T_VEC w60[2] = { sub(w45[0], mul(kWeight2, sub(w53[0], w53[1]))), \
+ sub(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \
+ const T_VEC w61[2] = { add(w45[0], mul(kWeight2, sub(w53[0], w53[1]))), \
+ add(w45[1], mul(kWeight2, add(w53[1], w53[0]))) }; \
+ store(output + 0 * stride, add(w30[0], w54[0])); \
+ store(output + 1 * stride, \
+ add(w32[0], add(mul(kWeight3, w56[0]), mul(kWeight4, w56[1])))); \
+ store(output + 2 * stride, \
+ add(w34[0], mul(kWeight2, add(w58[0], w58[1])))); \
+ store(output + 3 * stride, \
+ add(w36[0], add(mul(kWeight4, w60[0]), mul(kWeight3, w60[1])))); \
+ store(output + 4 * stride, add(w31[0], w55[1])); \
+ store(output + 5 * stride, \
+ sub(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1])))); \
+ store(output + 6 * stride, \
+ sub(w35[0], mul(kWeight2, sub(w59[0], w59[1])))); \
+ store(output + 7 * stride, \
+ sub(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1])))); \
+ store(output + 8 * stride, sub(w30[0], w54[0])); \
+ store(output + 9 * stride, \
+ add(w32[0], sub(sub(kWeight0, mul(kWeight3, w56[0])), \
+ mul(kWeight4, w56[1])))); \
+ store(output + 10 * stride, \
+ add(w34[0], sub(sub(kWeight0, mul(kWeight2, w58[0])), \
+ mul(kWeight2, w58[1])))); \
+ store(output + 11 * stride, \
+ add(w36[0], sub(sub(kWeight0, mul(kWeight4, w60[0])), \
+ mul(kWeight3, w60[1])))); \
+ store(output + 12 * stride, sub(w31[0], w55[1])); \
+ store(output + 13 * stride, \
+ add(w33[0], sub(mul(kWeight4, w57[0]), mul(kWeight3, w57[1])))); \
+ store(output + 14 * stride, \
+ add(w35[0], mul(kWeight2, sub(w59[0], w59[1])))); \
+ store(output + 15 * stride, \
+ add(w37[0], sub(mul(kWeight3, w61[0]), mul(kWeight4, w61[1])))); \
+ }
+#define GEN_IFFT_32(ret, suffix, T, T_VEC, load, store, constant, add, sub, \
+ mul) \
+ ret aom_ifft1d_32_##suffix(const T *input, T *output, int stride) { \
+ const T_VEC kWeight0 = constant(0.0f); \
+ const T_VEC kWeight2 = constant(0.707107f); \
+ const T_VEC kWeight3 = constant(0.92388f); \
+ const T_VEC kWeight4 = constant(0.382683f); \
+ const T_VEC kWeight5 = constant(0.980785f); \
+ const T_VEC kWeight6 = constant(0.19509f); \
+ const T_VEC kWeight7 = constant(0.83147f); \
+ const T_VEC kWeight8 = constant(0.55557f); \
+ const T_VEC i0 = load(input + 0 * stride); \
+ const T_VEC i1 = load(input + 1 * stride); \
+ const T_VEC i2 = load(input + 2 * stride); \
+ const T_VEC i3 = load(input + 3 * stride); \
+ const T_VEC i4 = load(input + 4 * stride); \
+ const T_VEC i5 = load(input + 5 * stride); \
+ const T_VEC i6 = load(input + 6 * stride); \
+ const T_VEC i7 = load(input + 7 * stride); \
+ const T_VEC i8 = load(input + 8 * stride); \
+ const T_VEC i9 = load(input + 9 * stride); \
+ const T_VEC i10 = load(input + 10 * stride); \
+ const T_VEC i11 = load(input + 11 * stride); \
+ const T_VEC i12 = load(input + 12 * stride); \
+ const T_VEC i13 = load(input + 13 * stride); \
+ const T_VEC i14 = load(input + 14 * stride); \
+ const T_VEC i15 = load(input + 15 * stride); \
+ const T_VEC i16 = load(input + 16 * stride); \
+ const T_VEC i17 = load(input + 17 * stride); \
+ const T_VEC i18 = load(input + 18 * stride); \
+ const T_VEC i19 = load(input + 19 * stride); \
+ const T_VEC i20 = load(input + 20 * stride); \
+ const T_VEC i21 = load(input + 21 * stride); \
+ const T_VEC i22 = load(input + 22 * stride); \
+ const T_VEC i23 = load(input + 23 * stride); \
+ const T_VEC i24 = load(input + 24 * stride); \
+ const T_VEC i25 = load(input + 25 * stride); \
+ const T_VEC i26 = load(input + 26 * stride); \
+ const T_VEC i27 = load(input + 27 * stride); \
+ const T_VEC i28 = load(input + 28 * stride); \
+ const T_VEC i29 = load(input + 29 * stride); \
+ const T_VEC i30 = load(input + 30 * stride); \
+ const T_VEC i31 = load(input + 31 * stride); \
+ const T_VEC w30 = add(i0, i16); \
+ const T_VEC w31 = sub(i0, i16); \
+ const T_VEC w32[2] = { add(i8, i8), sub(i24, i24) }; \
+ const T_VEC w33[2] = { sub(i8, i8), sub(sub(kWeight0, i24), i24) }; \
+ const T_VEC w34[2] = { add(w30, w32[0]), w32[1] }; \
+ const T_VEC w35[2] = { sub(w30, w32[0]), sub(kWeight0, w32[1]) }; \
+ const T_VEC w36[2] = { add(w31, w33[1]), sub(kWeight0, w33[0]) }; \
+ const T_VEC w37[2] = { sub(w31, w33[1]), w33[0] }; \
+ const T_VEC w38[2] = { add(i4, i12), sub(i28, i20) }; \
+ const T_VEC w39[2] = { sub(i4, i12), sub(sub(kWeight0, i20), i28) }; \
+ const T_VEC w40[2] = { add(i12, i4), sub(i20, i28) }; \
+ const T_VEC w41[2] = { sub(i12, i4), sub(sub(kWeight0, i28), i20) }; \
+ const T_VEC w42[2] = { add(w38[0], w40[0]), add(w38[1], w40[1]) }; \
+ const T_VEC w43[2] = { sub(w38[0], w40[0]), sub(w38[1], w40[1]) }; \
+ const T_VEC w44[2] = { add(w39[0], w41[1]), sub(w39[1], w41[0]) }; \
+ const T_VEC w45[2] = { sub(w39[0], w41[1]), add(w39[1], w41[0]) }; \
+ const T_VEC w46[2] = { add(w34[0], w42[0]), add(w34[1], w42[1]) }; \
+ const T_VEC w47[2] = { sub(w34[0], w42[0]), sub(w34[1], w42[1]) }; \
+ const T_VEC w48[2] = { add(w36[0], mul(kWeight2, add(w44[0], w44[1]))), \
+ add(w36[1], mul(kWeight2, sub(w44[1], w44[0]))) }; \
+ const T_VEC w49[2] = { add(w36[0], \
+ sub(sub(kWeight0, mul(kWeight2, w44[0])), \
+ mul(kWeight2, w44[1]))), \
+ add(w36[1], mul(kWeight2, sub(w44[0], w44[1]))) }; \
+ const T_VEC w50[2] = { add(w35[0], w43[1]), sub(w35[1], w43[0]) }; \
+ const T_VEC w51[2] = { sub(w35[0], w43[1]), add(w35[1], w43[0]) }; \
+ const T_VEC w52[2] = { sub(w37[0], mul(kWeight2, sub(w45[0], w45[1]))), \
+ sub(w37[1], mul(kWeight2, add(w45[1], w45[0]))) }; \
+ const T_VEC w53[2] = { add(w37[0], mul(kWeight2, sub(w45[0], w45[1]))), \
+ add(w37[1], mul(kWeight2, add(w45[1], w45[0]))) }; \
+ const T_VEC w54[2] = { add(i2, i14), sub(i30, i18) }; \
+ const T_VEC w55[2] = { sub(i2, i14), sub(sub(kWeight0, i18), i30) }; \
+ const T_VEC w56[2] = { add(i10, i6), sub(i22, i26) }; \
+ const T_VEC w57[2] = { sub(i10, i6), sub(sub(kWeight0, i26), i22) }; \
+ const T_VEC w58[2] = { add(w54[0], w56[0]), add(w54[1], w56[1]) }; \
+ const T_VEC w59[2] = { sub(w54[0], w56[0]), sub(w54[1], w56[1]) }; \
+ const T_VEC w60[2] = { add(w55[0], w57[1]), sub(w55[1], w57[0]) }; \
+ const T_VEC w61[2] = { sub(w55[0], w57[1]), add(w55[1], w57[0]) }; \
+ const T_VEC w62[2] = { add(i6, i10), sub(i26, i22) }; \
+ const T_VEC w63[2] = { sub(i6, i10), sub(sub(kWeight0, i22), i26) }; \
+ const T_VEC w64[2] = { add(i14, i2), sub(i18, i30) }; \
+ const T_VEC w65[2] = { sub(i14, i2), sub(sub(kWeight0, i30), i18) }; \
+ const T_VEC w66[2] = { add(w62[0], w64[0]), add(w62[1], w64[1]) }; \
+ const T_VEC w67[2] = { sub(w62[0], w64[0]), sub(w62[1], w64[1]) }; \
+ const T_VEC w68[2] = { add(w63[0], w65[1]), sub(w63[1], w65[0]) }; \
+ const T_VEC w69[2] = { sub(w63[0], w65[1]), add(w63[1], w65[0]) }; \
+ const T_VEC w70[2] = { add(w58[0], w66[0]), add(w58[1], w66[1]) }; \
+ const T_VEC w71[2] = { sub(w58[0], w66[0]), sub(w58[1], w66[1]) }; \
+ const T_VEC w72[2] = { add(w60[0], mul(kWeight2, add(w68[0], w68[1]))), \
+ add(w60[1], mul(kWeight2, sub(w68[1], w68[0]))) }; \
+ const T_VEC w73[2] = { add(w60[0], \
+ sub(sub(kWeight0, mul(kWeight2, w68[0])), \
+ mul(kWeight2, w68[1]))), \
+ add(w60[1], mul(kWeight2, sub(w68[0], w68[1]))) }; \
+ const T_VEC w74[2] = { add(w59[0], w67[1]), sub(w59[1], w67[0]) }; \
+ const T_VEC w75[2] = { sub(w59[0], w67[1]), add(w59[1], w67[0]) }; \
+ const T_VEC w76[2] = { sub(w61[0], mul(kWeight2, sub(w69[0], w69[1]))), \
+ sub(w61[1], mul(kWeight2, add(w69[1], w69[0]))) }; \
+ const T_VEC w77[2] = { add(w61[0], mul(kWeight2, sub(w69[0], w69[1]))), \
+ add(w61[1], mul(kWeight2, add(w69[1], w69[0]))) }; \
+ const T_VEC w78[2] = { add(w46[0], w70[0]), add(w46[1], w70[1]) }; \
+ const T_VEC w79[2] = { sub(w46[0], w70[0]), sub(w46[1], w70[1]) }; \
+ const T_VEC w80[2] = { \
+ add(w48[0], add(mul(kWeight3, w72[0]), mul(kWeight4, w72[1]))), \
+ add(w48[1], sub(mul(kWeight3, w72[1]), mul(kWeight4, w72[0]))) \
+ }; \
+ const T_VEC w81[2] = { \
+ add(w48[0], \
+ sub(sub(kWeight0, mul(kWeight3, w72[0])), mul(kWeight4, w72[1]))), \
+ add(w48[1], sub(mul(kWeight4, w72[0]), mul(kWeight3, w72[1]))) \
+ }; \
+ const T_VEC w82[2] = { add(w50[0], mul(kWeight2, add(w74[0], w74[1]))), \
+ add(w50[1], mul(kWeight2, sub(w74[1], w74[0]))) }; \
+ const T_VEC w83[2] = { add(w50[0], \
+ sub(sub(kWeight0, mul(kWeight2, w74[0])), \
+ mul(kWeight2, w74[1]))), \
+ add(w50[1], mul(kWeight2, sub(w74[0], w74[1]))) }; \
+ const T_VEC w84[2] = { \
+ add(w52[0], add(mul(kWeight4, w76[0]), mul(kWeight3, w76[1]))), \
+ add(w52[1], sub(mul(kWeight4, w76[1]), mul(kWeight3, w76[0]))) \
+ }; \
+ const T_VEC w85[2] = { \
+ add(w52[0], \
+ sub(sub(kWeight0, mul(kWeight4, w76[0])), mul(kWeight3, w76[1]))), \
+ add(w52[1], sub(mul(kWeight3, w76[0]), mul(kWeight4, w76[1]))) \
+ }; \
+ const T_VEC w86[2] = { add(w47[0], w71[1]), sub(w47[1], w71[0]) }; \
+ const T_VEC w87[2] = { sub(w47[0], w71[1]), add(w47[1], w71[0]) }; \
+ const T_VEC w88[2] = { \
+ sub(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))), \
+ add(w49[1], \
+ sub(sub(kWeight0, mul(kWeight4, w73[1])), mul(kWeight3, w73[0]))) \
+ }; \
+ const T_VEC w89[2] = { \
+ add(w49[0], sub(mul(kWeight4, w73[0]), mul(kWeight3, w73[1]))), \
+ add(w49[1], add(mul(kWeight4, w73[1]), mul(kWeight3, w73[0]))) \
+ }; \
+ const T_VEC w90[2] = { sub(w51[0], mul(kWeight2, sub(w75[0], w75[1]))), \
+ sub(w51[1], mul(kWeight2, add(w75[1], w75[0]))) }; \
+ const T_VEC w91[2] = { add(w51[0], mul(kWeight2, sub(w75[0], w75[1]))), \
+ add(w51[1], mul(kWeight2, add(w75[1], w75[0]))) }; \
+ const T_VEC w92[2] = { \
+ sub(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))), \
+ add(w53[1], \
+ sub(sub(kWeight0, mul(kWeight3, w77[1])), mul(kWeight4, w77[0]))) \
+ }; \
+ const T_VEC w93[2] = { \
+ add(w53[0], sub(mul(kWeight3, w77[0]), mul(kWeight4, w77[1]))), \
+ add(w53[1], add(mul(kWeight3, w77[1]), mul(kWeight4, w77[0]))) \
+ }; \
+ const T_VEC w94[2] = { add(i1, i15), sub(i31, i17) }; \
+ const T_VEC w95[2] = { sub(i1, i15), sub(sub(kWeight0, i17), i31) }; \
+ const T_VEC w96[2] = { add(i9, i7), sub(i23, i25) }; \
+ const T_VEC w97[2] = { sub(i9, i7), sub(sub(kWeight0, i25), i23) }; \
+ const T_VEC w98[2] = { add(w94[0], w96[0]), add(w94[1], w96[1]) }; \
+ const T_VEC w99[2] = { sub(w94[0], w96[0]), sub(w94[1], w96[1]) }; \
+ const T_VEC w100[2] = { add(w95[0], w97[1]), sub(w95[1], w97[0]) }; \
+ const T_VEC w101[2] = { sub(w95[0], w97[1]), add(w95[1], w97[0]) }; \
+ const T_VEC w102[2] = { add(i5, i11), sub(i27, i21) }; \
+ const T_VEC w103[2] = { sub(i5, i11), sub(sub(kWeight0, i21), i27) }; \
+ const T_VEC w104[2] = { add(i13, i3), sub(i19, i29) }; \
+ const T_VEC w105[2] = { sub(i13, i3), sub(sub(kWeight0, i29), i19) }; \
+ const T_VEC w106[2] = { add(w102[0], w104[0]), add(w102[1], w104[1]) }; \
+ const T_VEC w107[2] = { sub(w102[0], w104[0]), sub(w102[1], w104[1]) }; \
+ const T_VEC w108[2] = { add(w103[0], w105[1]), sub(w103[1], w105[0]) }; \
+ const T_VEC w109[2] = { sub(w103[0], w105[1]), add(w103[1], w105[0]) }; \
+ const T_VEC w110[2] = { add(w98[0], w106[0]), add(w98[1], w106[1]) }; \
+ const T_VEC w111[2] = { sub(w98[0], w106[0]), sub(w98[1], w106[1]) }; \
+ const T_VEC w112[2] = { \
+ add(w100[0], mul(kWeight2, add(w108[0], w108[1]))), \
+ add(w100[1], mul(kWeight2, sub(w108[1], w108[0]))) \
+ }; \
+ const T_VEC w113[2] = { \
+ add(w100[0], \
+ sub(sub(kWeight0, mul(kWeight2, w108[0])), mul(kWeight2, w108[1]))), \
+ add(w100[1], mul(kWeight2, sub(w108[0], w108[1]))) \
+ }; \
+ const T_VEC w114[2] = { add(w99[0], w107[1]), sub(w99[1], w107[0]) }; \
+ const T_VEC w115[2] = { sub(w99[0], w107[1]), add(w99[1], w107[0]) }; \
+ const T_VEC w116[2] = { \
+ sub(w101[0], mul(kWeight2, sub(w109[0], w109[1]))), \
+ sub(w101[1], mul(kWeight2, add(w109[1], w109[0]))) \
+ }; \
+ const T_VEC w117[2] = { \
+ add(w101[0], mul(kWeight2, sub(w109[0], w109[1]))), \
+ add(w101[1], mul(kWeight2, add(w109[1], w109[0]))) \
+ }; \
+ const T_VEC w118[2] = { add(i3, i13), sub(i29, i19) }; \
+ const T_VEC w119[2] = { sub(i3, i13), sub(sub(kWeight0, i19), i29) }; \
+ const T_VEC w120[2] = { add(i11, i5), sub(i21, i27) }; \
+ const T_VEC w121[2] = { sub(i11, i5), sub(sub(kWeight0, i27), i21) }; \
+ const T_VEC w122[2] = { add(w118[0], w120[0]), add(w118[1], w120[1]) }; \
+ const T_VEC w123[2] = { sub(w118[0], w120[0]), sub(w118[1], w120[1]) }; \
+ const T_VEC w124[2] = { add(w119[0], w121[1]), sub(w119[1], w121[0]) }; \
+ const T_VEC w125[2] = { sub(w119[0], w121[1]), add(w119[1], w121[0]) }; \
+ const T_VEC w126[2] = { add(i7, i9), sub(i25, i23) }; \
+ const T_VEC w127[2] = { sub(i7, i9), sub(sub(kWeight0, i23), i25) }; \
+ const T_VEC w128[2] = { add(i15, i1), sub(i17, i31) }; \
+ const T_VEC w129[2] = { sub(i15, i1), sub(sub(kWeight0, i31), i17) }; \
+ const T_VEC w130[2] = { add(w126[0], w128[0]), add(w126[1], w128[1]) }; \
+ const T_VEC w131[2] = { sub(w126[0], w128[0]), sub(w126[1], w128[1]) }; \
+ const T_VEC w132[2] = { add(w127[0], w129[1]), sub(w127[1], w129[0]) }; \
+ const T_VEC w133[2] = { sub(w127[0], w129[1]), add(w127[1], w129[0]) }; \
+ const T_VEC w134[2] = { add(w122[0], w130[0]), add(w122[1], w130[1]) }; \
+ const T_VEC w135[2] = { sub(w122[0], w130[0]), sub(w122[1], w130[1]) }; \
+ const T_VEC w136[2] = { \
+ add(w124[0], mul(kWeight2, add(w132[0], w132[1]))), \
+ add(w124[1], mul(kWeight2, sub(w132[1], w132[0]))) \
+ }; \
+ const T_VEC w137[2] = { \
+ add(w124[0], \
+ sub(sub(kWeight0, mul(kWeight2, w132[0])), mul(kWeight2, w132[1]))), \
+ add(w124[1], mul(kWeight2, sub(w132[0], w132[1]))) \
+ }; \
+ const T_VEC w138[2] = { add(w123[0], w131[1]), sub(w123[1], w131[0]) }; \
+ const T_VEC w139[2] = { sub(w123[0], w131[1]), add(w123[1], w131[0]) }; \
+ const T_VEC w140[2] = { \
+ sub(w125[0], mul(kWeight2, sub(w133[0], w133[1]))), \
+ sub(w125[1], mul(kWeight2, add(w133[1], w133[0]))) \
+ }; \
+ const T_VEC w141[2] = { \
+ add(w125[0], mul(kWeight2, sub(w133[0], w133[1]))), \
+ add(w125[1], mul(kWeight2, add(w133[1], w133[0]))) \
+ }; \
+ const T_VEC w142[2] = { add(w110[0], w134[0]), add(w110[1], w134[1]) }; \
+ const T_VEC w143[2] = { sub(w110[0], w134[0]), sub(w110[1], w134[1]) }; \
+ const T_VEC w144[2] = { \
+ add(w112[0], add(mul(kWeight3, w136[0]), mul(kWeight4, w136[1]))), \
+ add(w112[1], sub(mul(kWeight3, w136[1]), mul(kWeight4, w136[0]))) \
+ }; \
+ const T_VEC w145[2] = { \
+ add(w112[0], \
+ sub(sub(kWeight0, mul(kWeight3, w136[0])), mul(kWeight4, w136[1]))), \
+ add(w112[1], sub(mul(kWeight4, w136[0]), mul(kWeight3, w136[1]))) \
+ }; \
+ const T_VEC w146[2] = { \
+ add(w114[0], mul(kWeight2, add(w138[0], w138[1]))), \
+ add(w114[1], mul(kWeight2, sub(w138[1], w138[0]))) \
+ }; \
+ const T_VEC w147[2] = { \
+ add(w114[0], \
+ sub(sub(kWeight0, mul(kWeight2, w138[0])), mul(kWeight2, w138[1]))), \
+ add(w114[1], mul(kWeight2, sub(w138[0], w138[1]))) \
+ }; \
+ const T_VEC w148[2] = { \
+ add(w116[0], add(mul(kWeight4, w140[0]), mul(kWeight3, w140[1]))), \
+ add(w116[1], sub(mul(kWeight4, w140[1]), mul(kWeight3, w140[0]))) \
+ }; \
+ const T_VEC w149[2] = { \
+ add(w116[0], \
+ sub(sub(kWeight0, mul(kWeight4, w140[0])), mul(kWeight3, w140[1]))), \
+ add(w116[1], sub(mul(kWeight3, w140[0]), mul(kWeight4, w140[1]))) \
+ }; \
+ const T_VEC w150[2] = { add(w111[0], w135[1]), sub(w111[1], w135[0]) }; \
+ const T_VEC w151[2] = { sub(w111[0], w135[1]), add(w111[1], w135[0]) }; \
+ const T_VEC w152[2] = { \
+ sub(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))), \
+ add(w113[1], \
+ sub(sub(kWeight0, mul(kWeight4, w137[1])), mul(kWeight3, w137[0]))) \
+ }; \
+ const T_VEC w153[2] = { \
+ add(w113[0], sub(mul(kWeight4, w137[0]), mul(kWeight3, w137[1]))), \
+ add(w113[1], add(mul(kWeight4, w137[1]), mul(kWeight3, w137[0]))) \
+ }; \
+ const T_VEC w154[2] = { \
+ sub(w115[0], mul(kWeight2, sub(w139[0], w139[1]))), \
+ sub(w115[1], mul(kWeight2, add(w139[1], w139[0]))) \
+ }; \
+ const T_VEC w155[2] = { \
+ add(w115[0], mul(kWeight2, sub(w139[0], w139[1]))), \
+ add(w115[1], mul(kWeight2, add(w139[1], w139[0]))) \
+ }; \
+ const T_VEC w156[2] = { \
+ sub(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))), \
+ add(w117[1], \
+ sub(sub(kWeight0, mul(kWeight3, w141[1])), mul(kWeight4, w141[0]))) \
+ }; \
+ const T_VEC w157[2] = { \
+ add(w117[0], sub(mul(kWeight3, w141[0]), mul(kWeight4, w141[1]))), \
+ add(w117[1], add(mul(kWeight3, w141[1]), mul(kWeight4, w141[0]))) \
+ }; \
+ store(output + 0 * stride, add(w78[0], w142[0])); \
+ store(output + 1 * stride, \
+ add(w80[0], add(mul(kWeight5, w144[0]), mul(kWeight6, w144[1])))); \
+ store(output + 2 * stride, \
+ add(w82[0], add(mul(kWeight3, w146[0]), mul(kWeight4, w146[1])))); \
+ store(output + 3 * stride, \
+ add(w84[0], add(mul(kWeight7, w148[0]), mul(kWeight8, w148[1])))); \
+ store(output + 4 * stride, \
+ add(w86[0], mul(kWeight2, add(w150[0], w150[1])))); \
+ store(output + 5 * stride, \
+ add(w88[0], add(mul(kWeight8, w152[0]), mul(kWeight7, w152[1])))); \
+ store(output + 6 * stride, \
+ add(w90[0], add(mul(kWeight4, w154[0]), mul(kWeight3, w154[1])))); \
+ store(output + 7 * stride, \
+ add(w92[0], add(mul(kWeight6, w156[0]), mul(kWeight5, w156[1])))); \
+ store(output + 8 * stride, add(w79[0], w143[1])); \
+ store(output + 9 * stride, \
+ sub(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1])))); \
+ store(output + 10 * stride, \
+ sub(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1])))); \
+ store(output + 11 * stride, \
+ sub(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1])))); \
+ store(output + 12 * stride, \
+ sub(w87[0], mul(kWeight2, sub(w151[0], w151[1])))); \
+ store(output + 13 * stride, \
+ sub(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1])))); \
+ store(output + 14 * stride, \
+ sub(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1])))); \
+ store(output + 15 * stride, \
+ sub(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \
+ store(output + 16 * stride, sub(w78[0], w142[0])); \
+ store(output + 17 * stride, \
+ add(w80[0], sub(sub(kWeight0, mul(kWeight5, w144[0])), \
+ mul(kWeight6, w144[1])))); \
+ store(output + 18 * stride, \
+ add(w82[0], sub(sub(kWeight0, mul(kWeight3, w146[0])), \
+ mul(kWeight4, w146[1])))); \
+ store(output + 19 * stride, \
+ add(w84[0], sub(sub(kWeight0, mul(kWeight7, w148[0])), \
+ mul(kWeight8, w148[1])))); \
+ store(output + 20 * stride, \
+ add(w86[0], sub(sub(kWeight0, mul(kWeight2, w150[0])), \
+ mul(kWeight2, w150[1])))); \
+ store(output + 21 * stride, \
+ add(w88[0], sub(sub(kWeight0, mul(kWeight8, w152[0])), \
+ mul(kWeight7, w152[1])))); \
+ store(output + 22 * stride, \
+ add(w90[0], sub(sub(kWeight0, mul(kWeight4, w154[0])), \
+ mul(kWeight3, w154[1])))); \
+ store(output + 23 * stride, \
+ add(w92[0], sub(sub(kWeight0, mul(kWeight6, w156[0])), \
+ mul(kWeight5, w156[1])))); \
+ store(output + 24 * stride, sub(w79[0], w143[1])); \
+ store(output + 25 * stride, \
+ add(w81[0], sub(mul(kWeight6, w145[0]), mul(kWeight5, w145[1])))); \
+ store(output + 26 * stride, \
+ add(w83[0], sub(mul(kWeight4, w147[0]), mul(kWeight3, w147[1])))); \
+ store(output + 27 * stride, \
+ add(w85[0], sub(mul(kWeight8, w149[0]), mul(kWeight7, w149[1])))); \
+ store(output + 28 * stride, \
+ add(w87[0], mul(kWeight2, sub(w151[0], w151[1])))); \
+ store(output + 29 * stride, \
+ add(w89[0], sub(mul(kWeight7, w153[0]), mul(kWeight8, w153[1])))); \
+ store(output + 30 * stride, \
+ add(w91[0], sub(mul(kWeight3, w155[0]), mul(kWeight4, w155[1])))); \
+ store(output + 31 * stride, \
+ add(w93[0], sub(mul(kWeight5, w157[0]), mul(kWeight6, w157[1])))); \
+ }
+
+#endif // AOM_AOM_DSP_FFT_COMMON_H_
diff --git a/third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.c b/third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.c
new file mode 100644
index 0000000000..ee42be7393
--- /dev/null
+++ b/third_party/aom/aom_dsp/flow_estimation/arm/disflow_neon.c
@@ -0,0 +1,368 @@
+/*
+ * Copyright (c) 2023, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/flow_estimation/disflow.h"
+
+#include <arm_neon.h>
+#include <math.h>
+
+#include "aom_dsp/arm/mem_neon.h"
+#include "aom_dsp/arm/sum_neon.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void get_cubic_kernel_dbl(double x, double kernel[4]) {
+ // Check that the fractional position is in range.
+ //
+ // Note: x is calculated from (eg.) `u_frac = u - floor(u)`.
+ // Mathematically, this implies that 0 <= x < 1. However, in practice it is
+ // possible to have x == 1 due to floating point rounding. This is fine,
+ // and we still interpolate correctly if we allow x = 1.
+ assert(0 <= x && x <= 1);
+
+ double x2 = x * x;
+ double x3 = x2 * x;
+ kernel[0] = -0.5 * x + x2 - 0.5 * x3;
+ kernel[1] = 1.0 - 2.5 * x2 + 1.5 * x3;
+ kernel[2] = 0.5 * x + 2.0 * x2 - 1.5 * x3;
+ kernel[3] = -0.5 * x2 + 0.5 * x3;
+}
+
+static INLINE void get_cubic_kernel_int(double x, int kernel[4]) {
+ double kernel_dbl[4];
+ get_cubic_kernel_dbl(x, kernel_dbl);
+
+ kernel[0] = (int)rint(kernel_dbl[0] * (1 << DISFLOW_INTERP_BITS));
+ kernel[1] = (int)rint(kernel_dbl[1] * (1 << DISFLOW_INTERP_BITS));
+ kernel[2] = (int)rint(kernel_dbl[2] * (1 << DISFLOW_INTERP_BITS));
+ kernel[3] = (int)rint(kernel_dbl[3] * (1 << DISFLOW_INTERP_BITS));
+}
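+
+// Editorial note: illustrative sketch only, not part of the upstream source.
+// It evaluates the cubic kernel above at the midpoint x = 0.5, where the
+// weights are { -1/16, 9/16, 9/16, -1/16 } and sum to 1. The two positive
+// taps sum to 1.125, which is the 255 * 1.125 = 286.875 worst case discussed
+// in the horizontal convolution below.
+#if 0
+static void cubic_kernel_example(void) {
+  double k[4];
+  get_cubic_kernel_dbl(0.5, k);
+  // k == { -0.0625, 0.5625, 0.5625, -0.0625 }
+  assert(fabs(k[0] + k[1] + k[2] + k[3] - 1.0) < 1e-12);
+}
+#endif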
+
+// Warp a patch of DISFLOW_PATCH_SIZE x DISFLOW_PATCH_SIZE pixels of `ref`,
+// rooted at position (x + u, y + v), using bicubic interpolation, and store
+// the per-pixel difference against the corresponding patch of `src`, rooted
+// at (x, y), into the `dt` array.
+static INLINE void compute_flow_error(const uint8_t *src, const uint8_t *ref,
+ int width, int height, int stride, int x,
+ int y, double u, double v, int16_t *dt) {
+ // Split offset into integer and fractional parts, and compute cubic
+ // interpolation kernels
+ const int u_int = (int)floor(u);
+ const int v_int = (int)floor(v);
+ const double u_frac = u - floor(u);
+ const double v_frac = v - floor(v);
+
+ int h_kernel[4];
+ int v_kernel[4];
+ get_cubic_kernel_int(u_frac, h_kernel);
+ get_cubic_kernel_int(v_frac, v_kernel);
+
+ int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 3)];
+
+ // Clamp coordinates so that all pixels we fetch will remain within the
+ // allocated border region, but allow them to go far enough out that
+ // the border pixels' values do not change.
+ // Since we are calculating an 8x8 block, the bottom-right pixel
+ // in the block has coordinates (x0 + 7, y0 + 7). Then, the cubic
+ // interpolation has 4 taps, meaning that the output of pixel
+ // (x_w, y_w) depends on the pixels in the range
+ // ([x_w - 1, x_w + 2], [y_w - 1, y_w + 2]).
+ //
+ // Thus the most extreme coordinates which will be fetched are
+ // (x0 - 1, y0 - 1) and (x0 + 9, y0 + 9).
+ const int x0 = clamp(x + u_int, -9, width);
+ const int y0 = clamp(y + v_int, -9, height);
+
+ // Horizontal convolution.
+ const uint8_t *ref_start = ref + (y0 - 1) * stride + (x0 - 1);
+ int16x4_t h_filter = vmovn_s32(vld1q_s32(h_kernel));
+
+ for (int i = 0; i < DISFLOW_PATCH_SIZE + 3; ++i) {
+ uint8x16_t r = vld1q_u8(ref_start + i * stride);
+ uint16x8_t r0 = vmovl_u8(vget_low_u8(r));
+ uint16x8_t r1 = vmovl_u8(vget_high_u8(r));
+
+ int16x8_t s0 = vreinterpretq_s16_u16(r0);
+ int16x8_t s1 = vreinterpretq_s16_u16(vextq_u16(r0, r1, 1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vextq_u16(r0, r1, 2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vextq_u16(r0, r1, 3));
+
+ int32x4_t sum_lo = vmull_lane_s16(vget_low_s16(s0), h_filter, 0);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s1), h_filter, 1);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s2), h_filter, 2);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(s3), h_filter, 3);
+
+ int32x4_t sum_hi = vmull_lane_s16(vget_high_s16(s0), h_filter, 0);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s1), h_filter, 1);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s2), h_filter, 2);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(s3), h_filter, 3);
+
+ // 6 is the maximum allowable number of extra bits which will avoid
+ // the intermediate values overflowing an int16_t. The most extreme
+ // intermediate value occurs when:
+ // * The input pixels are [0, 255, 255, 0]
+ // * u_frac = 0.5
+ // In this case, the un-scaled output is 255 * 1.125 = 286.875.
+ // As an integer with 6 fractional bits, that is 18360, which fits
+ // in an int16_t. But with 7 fractional bits it would be 36720,
+ // which is too large.
+
+ int16x8_t sum = vcombine_s16(vrshrn_n_s32(sum_lo, DISFLOW_INTERP_BITS - 6),
+ vrshrn_n_s32(sum_hi, DISFLOW_INTERP_BITS - 6));
+ vst1q_s16(tmp_ + i * DISFLOW_PATCH_SIZE, sum);
+ }
+
+ // Vertical convolution.
+ int16x4_t v_filter = vmovn_s32(vld1q_s32(v_kernel));
+ int16_t *tmp_start = tmp_ + DISFLOW_PATCH_SIZE;
+
+ for (int i = 0; i < DISFLOW_PATCH_SIZE; ++i) {
+ int16x8_t t0 = vld1q_s16(tmp_start + (i - 1) * DISFLOW_PATCH_SIZE);
+ int16x8_t t1 = vld1q_s16(tmp_start + i * DISFLOW_PATCH_SIZE);
+ int16x8_t t2 = vld1q_s16(tmp_start + (i + 1) * DISFLOW_PATCH_SIZE);
+ int16x8_t t3 = vld1q_s16(tmp_start + (i + 2) * DISFLOW_PATCH_SIZE);
+
+ int32x4_t sum_lo = vmull_lane_s16(vget_low_s16(t0), v_filter, 0);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t1), v_filter, 1);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t2), v_filter, 2);
+ sum_lo = vmlal_lane_s16(sum_lo, vget_low_s16(t3), v_filter, 3);
+
+ int32x4_t sum_hi = vmull_lane_s16(vget_high_s16(t0), v_filter, 0);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t1), v_filter, 1);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t2), v_filter, 2);
+ sum_hi = vmlal_lane_s16(sum_hi, vget_high_s16(t3), v_filter, 3);
+
+ uint8x8_t s = vld1_u8(src + (i + y) * stride + x);
+ int16x8_t s_s16 = vreinterpretq_s16_u16(vshll_n_u8(s, 3));
+
+ // This time, we have to round off the 6 extra bits which were kept
+ // earlier, but we also want to keep DISFLOW_DERIV_SCALE_LOG2 extra bits
+ // of precision to match the scale of the dx and dy arrays.
+ sum_lo = vrshrq_n_s32(sum_lo,
+ DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2);
+ sum_hi = vrshrq_n_s32(sum_hi,
+ DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2);
+ int32x4_t err_lo = vsubw_s16(sum_lo, vget_low_s16(s_s16));
+ int32x4_t err_hi = vsubw_s16(sum_hi, vget_high_s16(s_s16));
+ vst1q_s16(dt + i * DISFLOW_PATCH_SIZE,
+ vcombine_s16(vmovn_s32(err_lo), vmovn_s32(err_hi)));
+ }
+}
+
+static INLINE void sobel_filter_x(const uint8_t *src, int src_stride,
+ int16_t *dst, int dst_stride) {
+ int16_t tmp[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)];
+
+ // Horizontal filter, using kernel {1, 0, -1}.
+ const uint8_t *src_start = src - 1 * src_stride - 1;
+
+ for (int i = 0; i < DISFLOW_PATCH_SIZE + 2; i++) {
+ uint8x16_t s = vld1q_u8(src_start + i * src_stride);
+ uint8x8_t s0 = vget_low_u8(s);
+ uint8x8_t s2 = vget_low_u8(vextq_u8(s, s, 2));
+
+ // Given that the kernel is {1, 0, -1} the convolution is a simple
+ // subtraction.
+ int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(s0, s2));
+
+ vst1q_s16(tmp + i * DISFLOW_PATCH_SIZE, diff);
+ }
+
+ // Vertical filter, using kernel {1, 2, 1}.
+  // This kernel can be split into two 2-tap kernels of value {1, 1}.
+ // That way we need only 3 add operations to perform the convolution, one of
+ // which can be reused for the next line.
+ int16x8_t s0 = vld1q_s16(tmp);
+ int16x8_t s1 = vld1q_s16(tmp + DISFLOW_PATCH_SIZE);
+ int16x8_t sum01 = vaddq_s16(s0, s1);
+ for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) {
+ int16x8_t s2 = vld1q_s16(tmp + (i + 2) * DISFLOW_PATCH_SIZE);
+
+ int16x8_t sum12 = vaddq_s16(s1, s2);
+ int16x8_t sum = vaddq_s16(sum01, sum12);
+
+ vst1q_s16(dst + i * dst_stride, sum);
+
+ sum01 = sum12;
+ s1 = s2;
+ }
+}
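+
+// Editorial note: illustrative sketch only, not part of the upstream source.
+// It spells out the 2-tap decomposition used above: convolving a column with
+// {1, 2, 1} is the same as two passes of {1, 1}, since
+//   t0 + 2 * t1 + t2 == (t0 + t1) + (t1 + t2)
+// which is why the loop can carry `sum01` over from the previous output row.
+#if 0
+static int16_t smooth_121_scalar(int16_t t0, int16_t t1, int16_t t2) {
+  const int16_t sum01 = t0 + t1;  // shared with the previous output row
+  const int16_t sum12 = t1 + t2;  // shared with the next output row
+  return sum01 + sum12;           // equals t0 + 2 * t1 + t2
+}
+#endif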
+
+static INLINE void sobel_filter_y(const uint8_t *src, int src_stride,
+ int16_t *dst, int dst_stride) {
+ int16_t tmp[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)];
+
+ // Horizontal filter, using kernel {1, 2, 1}.
+  // This kernel can be split into two 2-tap kernels of value {1, 1}.
+ // That way we need only 3 add operations to perform the convolution.
+ const uint8_t *src_start = src - 1 * src_stride - 1;
+
+ for (int i = 0; i < DISFLOW_PATCH_SIZE + 2; i++) {
+ uint8x16_t s = vld1q_u8(src_start + i * src_stride);
+ uint8x8_t s0 = vget_low_u8(s);
+ uint8x8_t s1 = vget_low_u8(vextq_u8(s, s, 1));
+ uint8x8_t s2 = vget_low_u8(vextq_u8(s, s, 2));
+
+ uint16x8_t sum01 = vaddl_u8(s0, s1);
+ uint16x8_t sum12 = vaddl_u8(s1, s2);
+ uint16x8_t sum = vaddq_u16(sum01, sum12);
+
+ vst1q_s16(tmp + i * DISFLOW_PATCH_SIZE, vreinterpretq_s16_u16(sum));
+ }
+
+ // Vertical filter, using kernel {1, 0, -1}.
+ // Load the whole block at once to avoid redundant loads during convolution.
+ int16x8_t t[10];
+ load_s16_8x10(tmp, DISFLOW_PATCH_SIZE, &t[0], &t[1], &t[2], &t[3], &t[4],
+ &t[5], &t[6], &t[7], &t[8], &t[9]);
+
+ for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) {
+ // Given that the kernel is {1, 0, -1} the convolution is a simple
+ // subtraction.
+ int16x8_t diff = vsubq_s16(t[i], t[i + 2]);
+
+ vst1q_s16(dst + i * dst_stride, diff);
+ }
+}
+
+// Computes the components of the system of equations used to solve for
+// a flow vector.
+//
+// The flow equations are a least-squares system, derived as follows:
+//
+// For each pixel in the patch, we calculate the current error `dt`,
+// and the x and y gradients `dx` and `dy` of the source patch.
+// This means that, to first order, the squared error for this pixel is
+//
+// (dt + u * dx + v * dy)^2
+//
+// where (u, v) are the incremental changes to the flow vector.
+//
+// We then want to find the values of u and v which minimize the sum
+// of the squared error across all pixels. Conveniently, this fits exactly
+// into the form of a least squares problem, with one equation
+//
+// u * dx + v * dy = -dt
+//
+// for each pixel.
+//
+// Summing across all pixels in a square window of size DISFLOW_PATCH_SIZE,
+// and absorbing the - sign elsewhere, this results in the least squares system
+//
+// M = |sum(dx * dx) sum(dx * dy)|
+// |sum(dx * dy) sum(dy * dy)|
+//
+// b = |sum(dx * dt)|
+// |sum(dy * dt)|
+static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride,
+ const int16_t *dy, int dy_stride,
+ double *M_inv) {
+ int32x4_t sum[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0),
+ vdupq_n_s32(0) };
+
+ for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) {
+ int16x8_t x = vld1q_s16(dx + i * dx_stride);
+ int16x8_t y = vld1q_s16(dy + i * dy_stride);
+ sum[0] = vmlal_s16(sum[0], vget_low_s16(x), vget_low_s16(x));
+ sum[0] = vmlal_s16(sum[0], vget_high_s16(x), vget_high_s16(x));
+
+ sum[1] = vmlal_s16(sum[1], vget_low_s16(x), vget_low_s16(y));
+ sum[1] = vmlal_s16(sum[1], vget_high_s16(x), vget_high_s16(y));
+
+ sum[3] = vmlal_s16(sum[3], vget_low_s16(y), vget_low_s16(y));
+ sum[3] = vmlal_s16(sum[3], vget_high_s16(y), vget_high_s16(y));
+ }
+ sum[2] = sum[1];
+
+ int32x4_t res = horizontal_add_4d_s32x4(sum);
+
+ // Apply regularization
+ // We follow the standard regularization method of adding `k * I` before
+ // inverting. This ensures that the matrix will be invertible.
+ //
+ // Setting the regularization strength k to 1 seems to work well here, as
+ // typical values coming from the other equations are very large (1e5 to
+ // 1e6, with an upper limit of around 6e7, at the time of writing).
+ // It also preserves the property that all matrix values are whole numbers,
+ // which is convenient for integerized SIMD implementation.
+
+ double M0 = (double)vgetq_lane_s32(res, 0) + 1;
+ double M1 = (double)vgetq_lane_s32(res, 1);
+ double M2 = (double)vgetq_lane_s32(res, 2);
+ double M3 = (double)vgetq_lane_s32(res, 3) + 1;
+
+ // Invert matrix M.
+ double det = (M0 * M3) - (M1 * M2);
+ assert(det >= 1);
+ const double det_inv = 1 / det;
+
+ M_inv[0] = M3 * det_inv;
+ M_inv[1] = -M1 * det_inv;
+ M_inv[2] = -M2 * det_inv;
+ M_inv[3] = M0 * det_inv;
+}
+
+static INLINE void compute_flow_vector(const int16_t *dx, int dx_stride,
+ const int16_t *dy, int dy_stride,
+ const int16_t *dt, int dt_stride,
+ int *b) {
+ int32x4_t b_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) };
+
+ for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) {
+ int16x8_t dx16 = vld1q_s16(dx + i * dx_stride);
+ int16x8_t dy16 = vld1q_s16(dy + i * dy_stride);
+ int16x8_t dt16 = vld1q_s16(dt + i * dt_stride);
+
+ b_s32[0] = vmlal_s16(b_s32[0], vget_low_s16(dx16), vget_low_s16(dt16));
+ b_s32[0] = vmlal_s16(b_s32[0], vget_high_s16(dx16), vget_high_s16(dt16));
+
+ b_s32[1] = vmlal_s16(b_s32[1], vget_low_s16(dy16), vget_low_s16(dt16));
+ b_s32[1] = vmlal_s16(b_s32[1], vget_high_s16(dy16), vget_high_s16(dt16));
+ }
+
+ int32x4_t b_red = horizontal_add_2d_s32(b_s32[0], b_s32[1]);
+ vst1_s32(b, add_pairwise_s32x4(b_red));
+}
+
+void aom_compute_flow_at_point_neon(const uint8_t *src, const uint8_t *ref,
+ int x, int y, int width, int height,
+ int stride, double *u, double *v) {
+ double M_inv[4];
+ int b[2];
+ int16_t dt[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
+ int16_t dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
+ int16_t dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
+
+ // Compute gradients within this patch
+ const uint8_t *src_patch = &src[y * stride + x];
+ sobel_filter_x(src_patch, stride, dx, DISFLOW_PATCH_SIZE);
+ sobel_filter_y(src_patch, stride, dy, DISFLOW_PATCH_SIZE);
+
+ compute_flow_matrix(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, M_inv);
+
+ for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) {
+ compute_flow_error(src, ref, width, height, stride, x, y, *u, *v, dt);
+ compute_flow_vector(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, dt,
+ DISFLOW_PATCH_SIZE, b);
+
+ // Solve flow equations to find a better estimate for the flow vector
+ // at this point
+ const double step_u = M_inv[0] * b[0] + M_inv[1] * b[1];
+ const double step_v = M_inv[2] * b[0] + M_inv[3] * b[1];
+ *u += fclamp(step_u * DISFLOW_STEP_SIZE, -2, 2);
+ *v += fclamp(step_v * DISFLOW_STEP_SIZE, -2, 2);
+
+ if (fabs(step_u) + fabs(step_v) < DISFLOW_STEP_SIZE_THRESOLD) {
+ // Stop iteration when we're close to convergence
+ break;
+ }
+ }
+}
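+
+// Editorial note: illustrative sketch only, not part of the upstream source.
+// It mirrors how determine_disflow_correspondence() (in disflow.c, later in
+// this patch) drives the per-point refinement: `u` and `v` carry an initial
+// flow estimate in and are refined in place. The patch coordinates and
+// buffers are assumptions for illustration; the patch must lie far enough
+// inside the frame for the clamping in compute_flow_error() to be valid.
+#if 0
+static void example_refine_flow(const uint8_t *src, const uint8_t *ref,
+                                int width, int height, int stride,
+                                int patch_tl_x, int patch_tl_y) {
+  double u = 0.0;  // initial estimate, e.g. upscaled from a coarser level
+  double v = 0.0;
+  aom_compute_flow_at_point_neon(src, ref, patch_tl_x, patch_tl_y, width,
+                                 height, stride, &u, &v);
+  // (u, v) now holds the refined flow vector for this patch.
+}
+#endif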
diff --git a/third_party/aom/aom_dsp/flow_estimation/corner_detect.c b/third_party/aom/aom_dsp/flow_estimation/corner_detect.c
new file mode 100644
index 0000000000..284d1bd7b8
--- /dev/null
+++ b/third_party/aom/aom_dsp/flow_estimation/corner_detect.c
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include "third_party/fastfeat/fast.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/flow_estimation/corner_detect.h"
+#include "aom_mem/aom_mem.h"
+#include "av1/common/common.h"
+
+#define FAST_BARRIER 18
+
+size_t av1_get_corner_list_size(void) { return sizeof(CornerList); }
+
+CornerList *av1_alloc_corner_list(void) {
+ CornerList *corners = (CornerList *)aom_calloc(1, sizeof(*corners));
+ if (!corners) {
+ return NULL;
+ }
+
+ corners->valid = false;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_init(&corners->mutex, NULL);
+#endif // CONFIG_MULTITHREAD
+ return corners;
+}
+
+static bool compute_corner_list(const ImagePyramid *pyr, CornerList *corners) {
+ const uint8_t *buf = pyr->layers[0].buffer;
+ int width = pyr->layers[0].width;
+ int height = pyr->layers[0].height;
+ int stride = pyr->layers[0].stride;
+
+ int *scores = NULL;
+ int num_corners;
+ xy *const frame_corners_xy = aom_fast9_detect_nonmax(
+ buf, width, height, stride, FAST_BARRIER, &scores, &num_corners);
+ if (num_corners < 0) return false;
+
+ if (num_corners <= MAX_CORNERS) {
+ // Use all detected corners
+ if (num_corners != 0) {
+ memcpy(corners->corners, frame_corners_xy,
+ sizeof(*frame_corners_xy) * num_corners);
+ }
+ corners->num_corners = num_corners;
+ } else {
+    // There are more than MAX_CORNERS corners available, so pick out a subset
+ // of the sharpest corners, as these will be the most useful for flow
+ // estimation
+ int histogram[256];
+ av1_zero(histogram);
+ for (int i = 0; i < num_corners; i++) {
+ assert(FAST_BARRIER <= scores[i] && scores[i] <= 255);
+ histogram[scores[i]] += 1;
+ }
+
+ int threshold = -1;
+ int found_corners = 0;
+ for (int bucket = 255; bucket >= 0; bucket--) {
+ if (found_corners + histogram[bucket] > MAX_CORNERS) {
+ // Set threshold here
+ threshold = bucket;
+ break;
+ }
+ found_corners += histogram[bucket];
+ }
+ assert(threshold != -1 && "Failed to select a valid threshold");
+
+ int copied_corners = 0;
+ for (int i = 0; i < num_corners; i++) {
+ if (scores[i] > threshold) {
+ assert(copied_corners < MAX_CORNERS);
+ corners->corners[2 * copied_corners + 0] = frame_corners_xy[i].x;
+ corners->corners[2 * copied_corners + 1] = frame_corners_xy[i].y;
+ copied_corners += 1;
+ }
+ }
+ assert(copied_corners == found_corners);
+ corners->num_corners = copied_corners;
+ }
+
+ free(scores);
+ free(frame_corners_xy);
+ return true;
+}
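+
+// Editorial note: illustrative sketch only, not part of the upstream source.
+// It isolates the threshold-selection step above: walk the score histogram
+// from the highest bucket downwards and stop just before the running total
+// would exceed `max_corners`. Only corners scoring strictly above the
+// returned threshold are kept, so at most `max_corners` survive.
+#if 0
+static int select_score_threshold(const int histogram[256], int max_corners) {
+  int found = 0;
+  for (int bucket = 255; bucket >= 0; bucket--) {
+    if (found + histogram[bucket] > max_corners) return bucket;
+    found += histogram[bucket];
+  }
+  return -1;  // all corners fit; no thresholding needed
+}
+#endif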
+
+bool av1_compute_corner_list(const ImagePyramid *pyr, CornerList *corners) {
+ assert(corners);
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(&corners->mutex);
+#endif // CONFIG_MULTITHREAD
+
+ if (!corners->valid) {
+ corners->valid = compute_corner_list(pyr, corners);
+ }
+ bool valid = corners->valid;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(&corners->mutex);
+#endif // CONFIG_MULTITHREAD
+ return valid;
+}
+
+#ifndef NDEBUG
+// Check if a corner list has already been computed.
+// This is mostly a debug helper - as it is necessary to hold corners->mutex
+// while reading the valid flag, we cannot just write:
+// assert(corners->valid);
+// This function allows the check to be correctly written as:
+// assert(aom_is_corner_list_valid(corners));
+bool aom_is_corner_list_valid(CornerList *corners) {
+ assert(corners);
+
+ // Per the comments in the CornerList struct, we must take this mutex
+  // before reading or writing the "valid" flag, and hold it while computing
+  // the corner list, to ensure proper behaviour if multiple threads call this
+  // function simultaneously.
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(&corners->mutex);
+#endif // CONFIG_MULTITHREAD
+
+ bool valid = corners->valid;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(&corners->mutex);
+#endif // CONFIG_MULTITHREAD
+
+ return valid;
+}
+#endif
+
+void av1_invalidate_corner_list(CornerList *corners) {
+ if (corners) {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(&corners->mutex);
+#endif // CONFIG_MULTITHREAD
+ corners->valid = false;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(&corners->mutex);
+#endif // CONFIG_MULTITHREAD
+ }
+}
+
+void av1_free_corner_list(CornerList *corners) {
+ if (corners) {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_destroy(&corners->mutex);
+#endif // CONFIG_MULTITHREAD
+ aom_free(corners);
+ }
+}
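+
+// Editorial note: illustrative sketch only, not part of the upstream source.
+// It shows the intended lifecycle of a CornerList, mirroring how the corner
+// matching code later in this patch uses it: compute the image pyramid first,
+// then (lazily, under the list's mutex) the corner list itself. The `frame`
+// and `bit_depth` parameters and the surrounding setup are assumed purely for
+// illustration.
+#if 0
+static bool example_corner_list_usage(YV12_BUFFER_CONFIG *frame,
+                                      int bit_depth) {
+  ImagePyramid *pyramid = frame->y_pyramid;
+  CornerList *corners = frame->corners;
+  if (!aom_compute_pyramid(frame, bit_depth, pyramid)) return false;
+  if (!av1_compute_corner_list(pyramid, corners)) return false;
+  // ... use corners->num_corners and corners->corners[] ...
+  av1_invalidate_corner_list(corners);  // when the frame contents change
+  return true;
+}
+#endif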
diff --git a/third_party/aom/aom_dsp/flow_estimation/corner_detect.h b/third_party/aom/aom_dsp/flow_estimation/corner_detect.h
new file mode 100644
index 0000000000..d05846ce5d
--- /dev/null
+++ b/third_party/aom/aom_dsp/flow_estimation/corner_detect.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_FLOW_ESTIMATION_CORNER_DETECT_H_
+#define AOM_AOM_DSP_FLOW_ESTIMATION_CORNER_DETECT_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <memory.h>
+
+#include "aom_dsp/pyramid.h"
+#include "aom_util/aom_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_CORNERS 4096
+
+typedef struct corner_list {
+#if CONFIG_MULTITHREAD
+ // Mutex which is used to prevent the corner list from being computed twice
+ // at the same time
+ //
+ // Semantics:
+ // * This mutex must be held whenever reading or writing the `valid` flag
+ //
+  // * This mutex must also be held while computing the corner list,
+  //   to ensure that only one thread may do so at a time.
+  //
+  // * However, once you have read the valid flag and seen a true value,
+  //   it is safe to drop the mutex and read from the remaining fields.
+  //   This is because, once the corner list is computed, its contents
+  //   will not be changed until the parent frame buffer is recycled,
+  //   which will not happen until there are no more outstanding references
+  //   to the frame buffer.
+ pthread_mutex_t mutex;
+#endif // CONFIG_MULTITHREAD
+ // Flag indicating whether the corner list contains valid data
+ bool valid;
+ // Number of corners found
+ int num_corners;
+ // (x, y) coordinates of each corner
+ int corners[2 * MAX_CORNERS];
+} CornerList;
+
+size_t av1_get_corner_list_size(void);
+
+CornerList *av1_alloc_corner_list(void);
+
+bool av1_compute_corner_list(const ImagePyramid *pyr, CornerList *corners);
+
+#ifndef NDEBUG
+// Check if a corner list has already been computed.
+// This is mostly a debug helper - as it is necessary to hold corners->mutex
+// while reading the valid flag, we cannot just write:
+// assert(corners->valid);
+// This function allows the check to be correctly written as:
+// assert(aom_is_corner_list_valid(corners));
+bool aom_is_corner_list_valid(CornerList *corners);
+#endif
+
+void av1_invalidate_corner_list(CornerList *corners);
+
+void av1_free_corner_list(CornerList *corners);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AOM_DSP_FLOW_ESTIMATION_CORNER_DETECT_H_
diff --git a/third_party/aom/aom_dsp/flow_estimation/corner_match.c b/third_party/aom/aom_dsp/flow_estimation/corner_match.c
new file mode 100644
index 0000000000..cef719b68d
--- /dev/null
+++ b/third_party/aom/aom_dsp/flow_estimation/corner_match.c
@@ -0,0 +1,259 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/flow_estimation/corner_detect.h"
+#include "aom_dsp/flow_estimation/corner_match.h"
+#include "aom_dsp/flow_estimation/flow_estimation.h"
+#include "aom_dsp/flow_estimation/ransac.h"
+#include "aom_dsp/pyramid.h"
+#include "aom_scale/yv12config.h"
+
+#define SEARCH_SZ 9
+#define SEARCH_SZ_BY2 ((SEARCH_SZ - 1) / 2)
+
+#define THRESHOLD_NCC 0.75
+
+/* Compute var(frame) * MATCH_SZ_SQ over a MATCH_SZ by MATCH_SZ window of frame,
+ centered at (x, y).
+*/
+static double compute_variance(const unsigned char *frame, int stride, int x,
+ int y) {
+ int sum = 0;
+ int sumsq = 0;
+ int var;
+ int i, j;
+ for (i = 0; i < MATCH_SZ; ++i)
+ for (j = 0; j < MATCH_SZ; ++j) {
+ sum += frame[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)];
+ sumsq += frame[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)] *
+ frame[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)];
+ }
+ var = sumsq * MATCH_SZ_SQ - sum * sum;
+ return (double)var;
+}
+
+/* Compute corr(frame1, frame2) * MATCH_SZ * stddev(frame1), where the
+ correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows
+ of each image, centered at (x1, y1) and (x2, y2) respectively.
+*/
+double av1_compute_cross_correlation_c(const unsigned char *frame1, int stride1,
+ int x1, int y1,
+ const unsigned char *frame2, int stride2,
+ int x2, int y2) {
+ int v1, v2;
+ int sum1 = 0;
+ int sum2 = 0;
+ int sumsq2 = 0;
+ int cross = 0;
+ int var2, cov;
+ int i, j;
+ for (i = 0; i < MATCH_SZ; ++i)
+ for (j = 0; j < MATCH_SZ; ++j) {
+ v1 = frame1[(i + y1 - MATCH_SZ_BY2) * stride1 + (j + x1 - MATCH_SZ_BY2)];
+ v2 = frame2[(i + y2 - MATCH_SZ_BY2) * stride2 + (j + x2 - MATCH_SZ_BY2)];
+ sum1 += v1;
+ sum2 += v2;
+ sumsq2 += v2 * v2;
+ cross += v1 * v2;
+ }
+ var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2;
+ cov = cross * MATCH_SZ_SQ - sum1 * sum2;
+ return cov / sqrt((double)var2);
+}
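+
+// Editorial note: illustrative sketch only, not part of the upstream source.
+// With the scaling above, the return value equals
+//   NCC(frame1, frame2) * sqrt(compute_variance(frame1, stride1, x1, y1))
+// so the true normalized cross-correlation can be recovered by dividing by
+// the square root of the template variance. determine_correspondence() below
+// relies on this when it compares against THRESHOLD_NCC * sqrt(template_norm)
+// rather than THRESHOLD_NCC directly.
+#if 0
+static double example_true_ncc(const unsigned char *frame1, int stride1,
+                               int x1, int y1, const unsigned char *frame2,
+                               int stride2, int x2, int y2) {
+  const double scaled = av1_compute_cross_correlation_c(
+      frame1, stride1, x1, y1, frame2, stride2, x2, y2);
+  return scaled / sqrt(compute_variance(frame1, stride1, x1, y1));
+}
+#endif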
+
+static int is_eligible_point(int pointx, int pointy, int width, int height) {
+ return (pointx >= MATCH_SZ_BY2 && pointy >= MATCH_SZ_BY2 &&
+ pointx + MATCH_SZ_BY2 < width && pointy + MATCH_SZ_BY2 < height);
+}
+
+static int is_eligible_distance(int point1x, int point1y, int point2x,
+ int point2y, int width, int height) {
+ const int thresh = (width < height ? height : width) >> 4;
+ return ((point1x - point2x) * (point1x - point2x) +
+ (point1y - point2y) * (point1y - point2y)) <= thresh * thresh;
+}
+
+static void improve_correspondence(const unsigned char *src,
+ const unsigned char *ref, int width,
+ int height, int src_stride, int ref_stride,
+ Correspondence *correspondences,
+ int num_correspondences) {
+ int i;
+ for (i = 0; i < num_correspondences; ++i) {
+ int x, y, best_x = 0, best_y = 0;
+ double best_match_ncc = 0.0;
+ // For this algorithm, all points have integer coordinates.
+ // It's a little more efficient to convert them to ints once,
+ // before the inner loops
+ int x0 = (int)correspondences[i].x;
+ int y0 = (int)correspondences[i].y;
+ int rx0 = (int)correspondences[i].rx;
+ int ry0 = (int)correspondences[i].ry;
+ for (y = -SEARCH_SZ_BY2; y <= SEARCH_SZ_BY2; ++y) {
+ for (x = -SEARCH_SZ_BY2; x <= SEARCH_SZ_BY2; ++x) {
+ double match_ncc;
+ if (!is_eligible_point(rx0 + x, ry0 + y, width, height)) continue;
+ if (!is_eligible_distance(x0, y0, rx0 + x, ry0 + y, width, height))
+ continue;
+ match_ncc = av1_compute_cross_correlation(src, src_stride, x0, y0, ref,
+ ref_stride, rx0 + x, ry0 + y);
+ if (match_ncc > best_match_ncc) {
+ best_match_ncc = match_ncc;
+ best_y = y;
+ best_x = x;
+ }
+ }
+ }
+ correspondences[i].rx += best_x;
+ correspondences[i].ry += best_y;
+ }
+ for (i = 0; i < num_correspondences; ++i) {
+ int x, y, best_x = 0, best_y = 0;
+ double best_match_ncc = 0.0;
+ int x0 = (int)correspondences[i].x;
+ int y0 = (int)correspondences[i].y;
+ int rx0 = (int)correspondences[i].rx;
+ int ry0 = (int)correspondences[i].ry;
+ for (y = -SEARCH_SZ_BY2; y <= SEARCH_SZ_BY2; ++y)
+ for (x = -SEARCH_SZ_BY2; x <= SEARCH_SZ_BY2; ++x) {
+ double match_ncc;
+ if (!is_eligible_point(x0 + x, y0 + y, width, height)) continue;
+ if (!is_eligible_distance(x0 + x, y0 + y, rx0, ry0, width, height))
+ continue;
+ match_ncc = av1_compute_cross_correlation(
+ ref, ref_stride, rx0, ry0, src, src_stride, x0 + x, y0 + y);
+ if (match_ncc > best_match_ncc) {
+ best_match_ncc = match_ncc;
+ best_y = y;
+ best_x = x;
+ }
+ }
+ correspondences[i].x += best_x;
+ correspondences[i].y += best_y;
+ }
+}
+
+static int determine_correspondence(const unsigned char *src,
+ const int *src_corners, int num_src_corners,
+ const unsigned char *ref,
+ const int *ref_corners, int num_ref_corners,
+ int width, int height, int src_stride,
+ int ref_stride,
+ Correspondence *correspondences) {
+ // TODO(sarahparker) Improve this to include 2-way match
+ int i, j;
+ int num_correspondences = 0;
+ for (i = 0; i < num_src_corners; ++i) {
+ double best_match_ncc = 0.0;
+ double template_norm;
+ int best_match_j = -1;
+ if (!is_eligible_point(src_corners[2 * i], src_corners[2 * i + 1], width,
+ height))
+ continue;
+ for (j = 0; j < num_ref_corners; ++j) {
+ double match_ncc;
+ if (!is_eligible_point(ref_corners[2 * j], ref_corners[2 * j + 1], width,
+ height))
+ continue;
+ if (!is_eligible_distance(src_corners[2 * i], src_corners[2 * i + 1],
+ ref_corners[2 * j], ref_corners[2 * j + 1],
+ width, height))
+ continue;
+ match_ncc = av1_compute_cross_correlation(
+ src, src_stride, src_corners[2 * i], src_corners[2 * i + 1], ref,
+ ref_stride, ref_corners[2 * j], ref_corners[2 * j + 1]);
+ if (match_ncc > best_match_ncc) {
+ best_match_ncc = match_ncc;
+ best_match_j = j;
+ }
+ }
+ // Note: We want to test if the best correlation is >= THRESHOLD_NCC,
+ // but need to account for the normalization in
+ // av1_compute_cross_correlation.
+ template_norm = compute_variance(src, src_stride, src_corners[2 * i],
+ src_corners[2 * i + 1]);
+ if (best_match_ncc > THRESHOLD_NCC * sqrt(template_norm)) {
+ correspondences[num_correspondences].x = src_corners[2 * i];
+ correspondences[num_correspondences].y = src_corners[2 * i + 1];
+ correspondences[num_correspondences].rx = ref_corners[2 * best_match_j];
+ correspondences[num_correspondences].ry =
+ ref_corners[2 * best_match_j + 1];
+ num_correspondences++;
+ }
+ }
+ improve_correspondence(src, ref, width, height, src_stride, ref_stride,
+ correspondences, num_correspondences);
+ return num_correspondences;
+}
+
+bool av1_compute_global_motion_feature_match(
+ TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref,
+ int bit_depth, MotionModel *motion_models, int num_motion_models,
+ bool *mem_alloc_failed) {
+ int num_correspondences;
+ Correspondence *correspondences;
+ ImagePyramid *src_pyramid = src->y_pyramid;
+ CornerList *src_corners = src->corners;
+ ImagePyramid *ref_pyramid = ref->y_pyramid;
+ CornerList *ref_corners = ref->corners;
+
+ // Precompute information we will need about each frame
+ if (!aom_compute_pyramid(src, bit_depth, src_pyramid)) {
+ *mem_alloc_failed = true;
+ return false;
+ }
+ if (!av1_compute_corner_list(src_pyramid, src_corners)) {
+ *mem_alloc_failed = true;
+ return false;
+ }
+ if (!aom_compute_pyramid(ref, bit_depth, ref_pyramid)) {
+ *mem_alloc_failed = true;
+ return false;
+ }
+  if (!av1_compute_corner_list(ref_pyramid, ref_corners)) {
+ *mem_alloc_failed = true;
+ return false;
+ }
+
+ const uint8_t *src_buffer = src_pyramid->layers[0].buffer;
+ const int src_width = src_pyramid->layers[0].width;
+ const int src_height = src_pyramid->layers[0].height;
+ const int src_stride = src_pyramid->layers[0].stride;
+
+ const uint8_t *ref_buffer = ref_pyramid->layers[0].buffer;
+ assert(ref_pyramid->layers[0].width == src_width);
+ assert(ref_pyramid->layers[0].height == src_height);
+ const int ref_stride = ref_pyramid->layers[0].stride;
+
+ // find correspondences between the two images
+ correspondences = (Correspondence *)aom_malloc(src_corners->num_corners *
+ sizeof(*correspondences));
+ if (!correspondences) {
+ *mem_alloc_failed = true;
+ return false;
+ }
+ num_correspondences = determine_correspondence(
+ src_buffer, src_corners->corners, src_corners->num_corners, ref_buffer,
+ ref_corners->corners, ref_corners->num_corners, src_width, src_height,
+ src_stride, ref_stride, correspondences);
+
+ bool result = ransac(correspondences, num_correspondences, type,
+ motion_models, num_motion_models, mem_alloc_failed);
+
+ aom_free(correspondences);
+ return result;
+}
diff --git a/third_party/aom/aom_dsp/flow_estimation/corner_match.h b/third_party/aom/aom_dsp/flow_estimation/corner_match.h
new file mode 100644
index 0000000000..4435d2c767
--- /dev/null
+++ b/third_party/aom/aom_dsp/flow_estimation/corner_match.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_FLOW_ESTIMATION_CORNER_MATCH_H_
+#define AOM_AOM_DSP_FLOW_ESTIMATION_CORNER_MATCH_H_
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+
+#include "aom_dsp/flow_estimation/corner_detect.h"
+#include "aom_dsp/flow_estimation/flow_estimation.h"
+#include "aom_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MATCH_SZ 13
+#define MATCH_SZ_BY2 ((MATCH_SZ - 1) / 2)
+#define MATCH_SZ_SQ (MATCH_SZ * MATCH_SZ)
+
+bool av1_compute_global_motion_feature_match(
+ TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref,
+ int bit_depth, MotionModel *motion_models, int num_motion_models,
+ bool *mem_alloc_failed);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AOM_DSP_FLOW_ESTIMATION_CORNER_MATCH_H_
diff --git a/third_party/aom/aom_dsp/flow_estimation/disflow.c b/third_party/aom/aom_dsp/flow_estimation/disflow.c
new file mode 100644
index 0000000000..147a8ab3b3
--- /dev/null
+++ b/third_party/aom/aom_dsp/flow_estimation/disflow.c
@@ -0,0 +1,823 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// Dense Inverse Search flow algorithm
+// Paper: https://arxiv.org/abs/1603.03590
+
+#include <assert.h>
+#include <math.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/flow_estimation/corner_detect.h"
+#include "aom_dsp/flow_estimation/disflow.h"
+#include "aom_dsp/flow_estimation/ransac.h"
+#include "aom_dsp/pyramid.h"
+#include "aom_mem/aom_mem.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+// Amount to downsample the flow field by.
+// eg. DOWNSAMPLE_SHIFT = 2 (DOWNSAMPLE_FACTOR == 4) means we calculate
+// one flow point for each 4x4 pixel region of the frame
+// Must be a power of 2
+#define DOWNSAMPLE_SHIFT 3
+#define DOWNSAMPLE_FACTOR (1 << DOWNSAMPLE_SHIFT)
+
+// Filters used when upscaling the flow field from one pyramid level
+// to another. See upscale_flow_component for details on kernel selection
+#define FLOW_UPSCALE_TAPS 4
+
+// Number of outermost flow field entries (on each edge) which can't be
+// computed, because the patch they correspond to extends outside of the
+// frame
+// The border is (DISFLOW_PATCH_SIZE >> 1) pixels, which is
+// (DISFLOW_PATCH_SIZE >> 1) >> DOWNSAMPLE_SHIFT many flow field entries
+#define FLOW_BORDER_INNER ((DISFLOW_PATCH_SIZE >> 1) >> DOWNSAMPLE_SHIFT)
+
+// Number of extra padding entries on each side of the flow field.
+// These samples are added so that we do not need to apply clamping when
+// interpolating or upsampling the flow field
+#define FLOW_BORDER_OUTER (FLOW_UPSCALE_TAPS / 2)
+
+// When downsampling the flow field, each flow field entry covers a square
+// region of pixels in the image pyramid. This value is equal to the position
+// of the center of that region, as an offset from the top/left edge.
+//
+// Note: Using ((DOWNSAMPLE_FACTOR - 1) / 2) is equivalent to the more
+// natural expression ((DOWNSAMPLE_FACTOR / 2) - 1),
+// unless DOWNSAMPLE_FACTOR == 1 (i.e., no downsampling), in which case
+// this gives the correct offset of 0 instead of -1.
+#define UPSAMPLE_CENTER_OFFSET ((DOWNSAMPLE_FACTOR - 1) / 2)
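+
+// Editorial note (not part of the upstream source): with the values above,
+// DOWNSAMPLE_SHIFT = 3 gives DOWNSAMPLE_FACTOR = 8 and
+// UPSAMPLE_CENTER_OFFSET = 3, so flow field entry i nominally describes the
+// pixel at coordinate 8 * i + 3, i.e. the (integer) centre of the 8-pixel
+// region [8 * i, 8 * i + 7] covered by that entry.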
+
+static double flow_upscale_filter[2][FLOW_UPSCALE_TAPS] = {
+ // Cubic interpolation kernels for phase=0.75 and phase=0.25, respectively
+ { -3 / 128., 29 / 128., 111 / 128., -9 / 128. },
+ { -9 / 128., 111 / 128., 29 / 128., -3 / 128. }
+};
+
+static INLINE void get_cubic_kernel_dbl(double x, double kernel[4]) {
+ // Check that the fractional position is in range.
+ //
+ // Note: x is calculated from (eg.) `u_frac = u - floor(u)`.
+ // Mathematically, this implies that 0 <= x < 1. However, in practice it is
+ // possible to have x == 1 due to floating point rounding. This is fine,
+ // and we still interpolate correctly if we allow x = 1.
+ assert(0 <= x && x <= 1);
+
+ double x2 = x * x;
+ double x3 = x2 * x;
+ kernel[0] = -0.5 * x + x2 - 0.5 * x3;
+ kernel[1] = 1.0 - 2.5 * x2 + 1.5 * x3;
+ kernel[2] = 0.5 * x + 2.0 * x2 - 1.5 * x3;
+ kernel[3] = -0.5 * x2 + 0.5 * x3;
+}
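+
+// Editorial note: illustrative sketch only, not part of the upstream source.
+// The two tap sets in flow_upscale_filter above are exactly this cubic kernel
+// evaluated at x = 0.75 and x = 0.25. For example, at x = 0.75 the kernel is
+//   { -0.0234375, 0.2265625, 0.8671875, -0.0703125 }
+// which is { -3/128, 29/128, 111/128, -9/128 }.
+#if 0
+static void check_flow_upscale_filter(void) {
+  double k[4];
+  get_cubic_kernel_dbl(0.75, k);
+  for (int i = 0; i < FLOW_UPSCALE_TAPS; i++) {
+    assert(fabs(k[i] - flow_upscale_filter[0][i]) < 1e-12);
+  }
+}
+#endif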
+
+static INLINE void get_cubic_kernel_int(double x, int kernel[4]) {
+ double kernel_dbl[4];
+ get_cubic_kernel_dbl(x, kernel_dbl);
+
+ kernel[0] = (int)rint(kernel_dbl[0] * (1 << DISFLOW_INTERP_BITS));
+ kernel[1] = (int)rint(kernel_dbl[1] * (1 << DISFLOW_INTERP_BITS));
+ kernel[2] = (int)rint(kernel_dbl[2] * (1 << DISFLOW_INTERP_BITS));
+ kernel[3] = (int)rint(kernel_dbl[3] * (1 << DISFLOW_INTERP_BITS));
+}
+
+static INLINE double get_cubic_value_dbl(const double *p,
+ const double kernel[4]) {
+ return kernel[0] * p[0] + kernel[1] * p[1] + kernel[2] * p[2] +
+ kernel[3] * p[3];
+}
+
+static INLINE int get_cubic_value_int(const int *p, const int kernel[4]) {
+ return kernel[0] * p[0] + kernel[1] * p[1] + kernel[2] * p[2] +
+ kernel[3] * p[3];
+}
+
+static INLINE double bicubic_interp_one(const double *arr, int stride,
+ const double h_kernel[4],
+ const double v_kernel[4]) {
+ double tmp[1 * 4];
+
+ // Horizontal convolution
+ for (int i = -1; i < 3; ++i) {
+ tmp[i + 1] = get_cubic_value_dbl(&arr[i * stride - 1], h_kernel);
+ }
+
+ // Vertical convolution
+ return get_cubic_value_dbl(tmp, v_kernel);
+}
+
+static int determine_disflow_correspondence(const ImagePyramid *src_pyr,
+ const ImagePyramid *ref_pyr,
+ CornerList *corners,
+ const FlowField *flow,
+ Correspondence *correspondences) {
+ const int width = flow->width;
+ const int height = flow->height;
+ const int stride = flow->stride;
+
+ int num_correspondences = 0;
+ for (int i = 0; i < corners->num_corners; ++i) {
+ const int x0 = corners->corners[2 * i];
+ const int y0 = corners->corners[2 * i + 1];
+
+    // Offset points, to compensate for the fact that (say) a flow field entry
+    // at horizontal index i is nominally associated with the pixel at
+    // horizontal coordinate (i << DOWNSAMPLE_SHIFT) + UPSAMPLE_CENTER_OFFSET.
+ // This offset must be applied before we split the coordinate into integer
+ // and fractional parts, in order for the interpolation to be correct.
+ const int x = x0 - UPSAMPLE_CENTER_OFFSET;
+ const int y = y0 - UPSAMPLE_CENTER_OFFSET;
+
+ // Split the pixel coordinates into integer flow field coordinates and
+ // an offset for interpolation
+ const int flow_x = x >> DOWNSAMPLE_SHIFT;
+ const double flow_sub_x =
+ (x & (DOWNSAMPLE_FACTOR - 1)) / (double)DOWNSAMPLE_FACTOR;
+ const int flow_y = y >> DOWNSAMPLE_SHIFT;
+ const double flow_sub_y =
+ (y & (DOWNSAMPLE_FACTOR - 1)) / (double)DOWNSAMPLE_FACTOR;
+
+ // Exclude points which would sample from the outer border of the flow
+ // field, as this would give lower-quality results.
+ //
+ // Note: As we never read from the border region at pyramid level 0, we
+ // can skip filling it in. If the conditions here are removed, or any
+ // other logic is added which reads from this border region, then
+ // compute_flow_field() will need to be modified to call
+ // fill_flow_field_borders() at pyramid level 0 to set up the correct
+ // border data.
+ if (flow_x < 1 || (flow_x + 2) >= width) continue;
+ if (flow_y < 1 || (flow_y + 2) >= height) continue;
+
+ double h_kernel[4];
+ double v_kernel[4];
+ get_cubic_kernel_dbl(flow_sub_x, h_kernel);
+ get_cubic_kernel_dbl(flow_sub_y, v_kernel);
+
+ double flow_u = bicubic_interp_one(&flow->u[flow_y * stride + flow_x],
+ stride, h_kernel, v_kernel);
+ double flow_v = bicubic_interp_one(&flow->v[flow_y * stride + flow_x],
+ stride, h_kernel, v_kernel);
+
+ // Refine the interpolated flow vector one last time
+ const int patch_tl_x = x0 - DISFLOW_PATCH_CENTER;
+ const int patch_tl_y = y0 - DISFLOW_PATCH_CENTER;
+ aom_compute_flow_at_point(
+ src_pyr->layers[0].buffer, ref_pyr->layers[0].buffer, patch_tl_x,
+ patch_tl_y, src_pyr->layers[0].width, src_pyr->layers[0].height,
+ src_pyr->layers[0].stride, &flow_u, &flow_v);
+
+ // Use original points (without offsets) when filling in correspondence
+ // array
+ correspondences[num_correspondences].x = x0;
+ correspondences[num_correspondences].y = y0;
+ correspondences[num_correspondences].rx = x0 + flow_u;
+ correspondences[num_correspondences].ry = y0 + flow_v;
+ num_correspondences++;
+ }
+ return num_correspondences;
+}
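+
+// Editorial note (not part of the upstream source): a small worked example of
+// the coordinate split in determine_disflow_correspondence() above. With
+// DOWNSAMPLE_SHIFT = 3, a corner at x0 = 100 gives
+// x = 100 - UPSAMPLE_CENTER_OFFSET = 97, so flow_x = 97 >> 3 = 12 and
+// flow_sub_x = (97 & 7) / 8.0 = 0.125; the bicubic interpolation then reads
+// flow field columns 11..14 (and likewise rows flow_y - 1 .. flow_y + 2)
+// around that point.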
+
+// Warp a patch of DISFLOW_PATCH_SIZE x DISFLOW_PATCH_SIZE pixels of `ref`,
+// rooted at position (x + u, y + v), using bicubic interpolation, compute the
+// per-pixel error against the corresponding patch of `src`, rooted at (x, y),
+// and accumulate the gradient-weighted sums b = {sum(dx*dt), sum(dy*dt)}.
+static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref,
+ int width, int height, int stride, int x,
+ int y, double u, double v,
+ const int16_t *dx, const int16_t *dy,
+ int *b) {
+ memset(b, 0, 2 * sizeof(*b));
+
+ // Split offset into integer and fractional parts, and compute cubic
+ // interpolation kernels
+ const int u_int = (int)floor(u);
+ const int v_int = (int)floor(v);
+ const double u_frac = u - floor(u);
+ const double v_frac = v - floor(v);
+
+ int h_kernel[4];
+ int v_kernel[4];
+ get_cubic_kernel_int(u_frac, h_kernel);
+ get_cubic_kernel_int(v_frac, v_kernel);
+
+ // Storage for intermediate values between the two convolution directions
+ int tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 3)];
+ int *tmp = tmp_ + DISFLOW_PATCH_SIZE; // Offset by one row
+
+ // Clamp coordinates so that all pixels we fetch will remain within the
+ // allocated border region, but allow them to go far enough out that
+ // the border pixels' values do not change.
+ // Since we are calculating an 8x8 block, the bottom-right pixel
+ // in the block has coordinates (x0 + 7, y0 + 7). Then, the cubic
+ // interpolation has 4 taps, meaning that the output of pixel
+ // (x_w, y_w) depends on the pixels in the range
+ // ([x_w - 1, x_w + 2], [y_w - 1, y_w + 2]).
+ //
+ // Thus the most extreme coordinates which will be fetched are
+ // (x0 - 1, y0 - 1) and (x0 + 9, y0 + 9).
+ const int x0 = clamp(x + u_int, -9, width);
+ const int y0 = clamp(y + v_int, -9, height);
+
+ // Horizontal convolution
+ for (int i = -1; i < DISFLOW_PATCH_SIZE + 2; ++i) {
+ const int y_w = y0 + i;
+ for (int j = 0; j < DISFLOW_PATCH_SIZE; ++j) {
+ const int x_w = x0 + j;
+ int arr[4];
+
+ arr[0] = (int)ref[y_w * stride + (x_w - 1)];
+ arr[1] = (int)ref[y_w * stride + (x_w + 0)];
+ arr[2] = (int)ref[y_w * stride + (x_w + 1)];
+ arr[3] = (int)ref[y_w * stride + (x_w + 2)];
+
+ // Apply kernel and round, keeping 6 extra bits of precision.
+ //
+ // 6 is the maximum allowable number of extra bits which will avoid
+ // the intermediate values overflowing an int16_t. The most extreme
+ // intermediate value occurs when:
+ // * The input pixels are [0, 255, 255, 0]
+ // * u_frac = 0.5
+ // In this case, the un-scaled output is 255 * 1.125 = 286.875.
+ // As an integer with 6 fractional bits, that is 18360, which fits
+ // in an int16_t. But with 7 fractional bits it would be 36720,
+ // which is too large.
+ tmp[i * DISFLOW_PATCH_SIZE + j] = ROUND_POWER_OF_TWO(
+ get_cubic_value_int(arr, h_kernel), DISFLOW_INTERP_BITS - 6);
+ }
+ }
+
+ // Vertical convolution
+ for (int i = 0; i < DISFLOW_PATCH_SIZE; ++i) {
+ for (int j = 0; j < DISFLOW_PATCH_SIZE; ++j) {
+ const int *p = &tmp[i * DISFLOW_PATCH_SIZE + j];
+ const int arr[4] = { p[-DISFLOW_PATCH_SIZE], p[0], p[DISFLOW_PATCH_SIZE],
+ p[2 * DISFLOW_PATCH_SIZE] };
+ const int result = get_cubic_value_int(arr, v_kernel);
+
+ // Apply kernel and round.
+ // This time, we have to round off the 6 extra bits which were kept
+ // earlier, but we also want to keep DISFLOW_DERIV_SCALE_LOG2 extra bits
+ // of precision to match the scale of the dx and dy arrays.
+ const int round_bits = DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2;
+ const int warped = ROUND_POWER_OF_TWO(result, round_bits);
+ const int src_px = src[(x + j) + (y + i) * stride] << 3;
+ const int dt = warped - src_px;
+ b[0] += dx[i * DISFLOW_PATCH_SIZE + j] * dt;
+ b[1] += dy[i * DISFLOW_PATCH_SIZE + j] * dt;
+ }
+ }
+}
+
+static INLINE void sobel_filter(const uint8_t *src, int src_stride,
+ int16_t *dst, int dst_stride, int dir) {
+ int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)];
+ int16_t *tmp = tmp_ + DISFLOW_PATCH_SIZE;
+
+ // Sobel filter kernel
+ // This must have an overall scale factor equal to DISFLOW_DERIV_SCALE,
+ // in order to produce correctly scaled outputs.
+ // To work out the scale factor, we multiply two factors:
+ //
+ // * For the derivative filter (sobel_a), comparing our filter
+ // image[x - 1] - image[x + 1]
+ // to the standard form
+ // d/dx image[x] = image[x+1] - image[x]
+  //   tells us that we're actually calculating -2 * d/dx image[x]
+ //
+ // * For the smoothing filter (sobel_b), all coefficients are positive
+ // so the scale factor is just the sum of the coefficients
+ //
+ // Thus we need to make sure that DISFLOW_DERIV_SCALE = 2 * sum(sobel_b)
+ // (and take care of the - sign from sobel_a elsewhere)
+ static const int16_t sobel_a[3] = { 1, 0, -1 };
+ static const int16_t sobel_b[3] = { 1, 2, 1 };
+ const int taps = 3;
+
+ // horizontal filter
+ const int16_t *h_kernel = dir ? sobel_a : sobel_b;
+
+ for (int y = -1; y < DISFLOW_PATCH_SIZE + 1; ++y) {
+ for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) {
+ int sum = 0;
+ for (int k = 0; k < taps; ++k) {
+ sum += h_kernel[k] * src[y * src_stride + (x + k - 1)];
+ }
+ tmp[y * DISFLOW_PATCH_SIZE + x] = sum;
+ }
+ }
+
+ // vertical filter
+ const int16_t *v_kernel = dir ? sobel_b : sobel_a;
+
+ for (int y = 0; y < DISFLOW_PATCH_SIZE; ++y) {
+ for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) {
+ int sum = 0;
+ for (int k = 0; k < taps; ++k) {
+ sum += v_kernel[k] * tmp[(y + k - 1) * DISFLOW_PATCH_SIZE + x];
+ }
+ dst[y * dst_stride + x] = sum;
+ }
+ }
+}
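+
+// Editorial note (not part of the upstream source): with the kernels above,
+// sum(sobel_b) = 1 + 2 + 1 = 4, so the overall derivative scale is
+// 2 * sum(sobel_b) = 8, i.e. DISFLOW_DERIV_SCALE_LOG2 = 3. The same factor is
+// applied to the source pixels via `src_px = src[...] << 3` in
+// compute_flow_vector() above, and via `vshll_n_u8(s, 3)` in the NEON path
+// earlier in this patch, so warped and source patches are compared at a
+// common scale.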
+
+// Computes the components of the system of equations used to solve for
+// a flow vector.
+//
+// The flow equations are a least-squares system, derived as follows:
+//
+// For each pixel in the patch, we calculate the current error `dt`,
+// and the x and y gradients `dx` and `dy` of the source patch.
+// This means that, to first order, the squared error for this pixel is
+//
+// (dt + u * dx + v * dy)^2
+//
+// where (u, v) are the incremental changes to the flow vector.
+//
+// We then want to find the values of u and v which minimize the sum
+// of the squared error across all pixels. Conveniently, this fits exactly
+// into the form of a least squares problem, with one equation
+//
+// u * dx + v * dy = -dt
+//
+// for each pixel.
+//
+// Summing across all pixels in a square window of size DISFLOW_PATCH_SIZE,
+// and absorbing the - sign elsewhere, this results in the least squares system
+//
+// M = |sum(dx * dx) sum(dx * dy)|
+// |sum(dx * dy) sum(dy * dy)|
+//
+// b = |sum(dx * dt)|
+// |sum(dy * dt)|
+static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride,
+ const int16_t *dy, int dy_stride,
+ double *M) {
+ int tmp[4] = { 0 };
+
+ for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) {
+ for (int j = 0; j < DISFLOW_PATCH_SIZE; j++) {
+ tmp[0] += dx[i * dx_stride + j] * dx[i * dx_stride + j];
+ tmp[1] += dx[i * dx_stride + j] * dy[i * dy_stride + j];
+ // Don't compute tmp[2], as it should be equal to tmp[1]
+ tmp[3] += dy[i * dy_stride + j] * dy[i * dy_stride + j];
+ }
+ }
+
+ // Apply regularization
+ // We follow the standard regularization method of adding `k * I` before
+ // inverting. This ensures that the matrix will be invertible.
+ //
+ // Setting the regularization strength k to 1 seems to work well here, as
+ // typical values coming from the other equations are very large (1e5 to
+ // 1e6, with an upper limit of around 6e7, at the time of writing).
+ // It also preserves the property that all matrix values are whole numbers,
+ // which is convenient for integerized SIMD implementation.
+ tmp[0] += 1;
+ tmp[3] += 1;
+
+ tmp[2] = tmp[1];
+
+ M[0] = (double)tmp[0];
+ M[1] = (double)tmp[1];
+ M[2] = (double)tmp[2];
+ M[3] = (double)tmp[3];
+}
+
+// Try to invert the matrix M
+// Note: Due to the nature of how a least-squares matrix is constructed, all of
+// the eigenvalues will be >= 0, and therefore det M >= 0 as well.
+// The regularization term `+ k * I` further ensures that det M >= k^2.
+// As mentioned in compute_flow_matrix(), here we use k = 1, so det M >= 1.
+// So we don't have to worry about non-invertible matrices here.
+static INLINE void invert_2x2(const double *M, double *M_inv) {
+ double det = (M[0] * M[3]) - (M[1] * M[2]);
+ assert(det >= 1);
+ const double det_inv = 1 / det;
+
+ M_inv[0] = M[3] * det_inv;
+ M_inv[1] = -M[1] * det_inv;
+ M_inv[2] = -M[2] * det_inv;
+ M_inv[3] = M[0] * det_inv;
+}
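+
+// As a concrete worked form of the update performed in the refinement loop
+// below: writing the symmetric, regularized matrix and the right-hand side as
+//
+//   M = | m00  m01 |     b = | b0 |
+//       | m01  m11 |         | b1 |
+//
+// the solution of M * [step_u; step_v] = b expands to
+//
+//   step_u = ( m11 * b0 - m01 * b1) / det
+//   step_v = (-m01 * b0 + m00 * b1) / det,   det = m00 * m11 - m01^2 >= 1
+//
+// which is exactly invert_2x2() followed by the two dot products in
+// aom_compute_flow_at_point_c().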
+
+void aom_compute_flow_at_point_c(const uint8_t *src, const uint8_t *ref, int x,
+ int y, int width, int height, int stride,
+ double *u, double *v) {
+ double M[4];
+ double M_inv[4];
+ int b[2];
+ int16_t dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
+ int16_t dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
+
+ // Compute gradients within this patch
+ const uint8_t *src_patch = &src[y * stride + x];
+ sobel_filter(src_patch, stride, dx, DISFLOW_PATCH_SIZE, 1);
+ sobel_filter(src_patch, stride, dy, DISFLOW_PATCH_SIZE, 0);
+
+ compute_flow_matrix(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, M);
+ invert_2x2(M, M_inv);
+
+ for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) {
+ compute_flow_vector(src, ref, width, height, stride, x, y, *u, *v, dx, dy,
+ b);
+
+ // Solve flow equations to find a better estimate for the flow vector
+ // at this point
+ const double step_u = M_inv[0] * b[0] + M_inv[1] * b[1];
+ const double step_v = M_inv[2] * b[0] + M_inv[3] * b[1];
+ *u += fclamp(step_u * DISFLOW_STEP_SIZE, -2, 2);
+ *v += fclamp(step_v * DISFLOW_STEP_SIZE, -2, 2);
+
+ if (fabs(step_u) + fabs(step_v) < DISFLOW_STEP_SIZE_THRESOLD) {
+ // Stop iteration when we're close to convergence
+ break;
+ }
+ }
+}
+
+static void fill_flow_field_borders(double *flow, int width, int height,
+ int stride) {
+ // Calculate the bounds of the rectangle which was filled in by
+ // compute_flow_field() before calling this function.
+ // These indices are inclusive on both ends.
+ const int left_index = FLOW_BORDER_INNER;
+ const int right_index = (width - FLOW_BORDER_INNER - 1);
+ const int top_index = FLOW_BORDER_INNER;
+ const int bottom_index = (height - FLOW_BORDER_INNER - 1);
+
+ // Left area
+ for (int i = top_index; i <= bottom_index; i += 1) {
+ double *row = flow + i * stride;
+ const double left = row[left_index];
+ for (int j = -FLOW_BORDER_OUTER; j < left_index; j++) {
+ row[j] = left;
+ }
+ }
+
+ // Right area
+ for (int i = top_index; i <= bottom_index; i += 1) {
+ double *row = flow + i * stride;
+ const double right = row[right_index];
+ for (int j = right_index + 1; j < width + FLOW_BORDER_OUTER; j++) {
+ row[j] = right;
+ }
+ }
+
+ // Top area
+ const double *top_row = flow + top_index * stride - FLOW_BORDER_OUTER;
+ for (int i = -FLOW_BORDER_OUTER; i < top_index; i++) {
+ double *row = flow + i * stride - FLOW_BORDER_OUTER;
+ size_t length = width + 2 * FLOW_BORDER_OUTER;
+ memcpy(row, top_row, length * sizeof(*row));
+ }
+
+ // Bottom area
+ const double *bottom_row = flow + bottom_index * stride - FLOW_BORDER_OUTER;
+ for (int i = bottom_index + 1; i < height + FLOW_BORDER_OUTER; i++) {
+ double *row = flow + i * stride - FLOW_BORDER_OUTER;
+ size_t length = width + 2 * FLOW_BORDER_OUTER;
+ memcpy(row, bottom_row, length * sizeof(*row));
+ }
+}
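+
+// Note: Once compute_flow_field() has filled the inner region and this
+// function has filled the borders, every entry of the padded flow array is
+// valid, i.e. rows -FLOW_BORDER_OUTER to (height + FLOW_BORDER_OUTER - 1) and
+// columns -FLOW_BORDER_OUTER to (width + FLOW_BORDER_OUTER - 1). This is what
+// allows upscale_flow_component() to read up to FLOW_UPSCALE_TAPS / 2 samples
+// to either side of the computed region without clamping.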
+
+// Upscale one component of the flow field, from a size of
+// cur_width x cur_height to a size of (2*cur_width) x (2*cur_height), storing
+// the result back into the same buffer. This function also scales the flow
+// vector by 2, so that when we move to the next pyramid level down, the implied
+// motion vector is the same.
+//
+// The temporary buffer tmpbuf must be large enough to hold an intermediate
+// array of size stride * cur_height, *plus* FLOW_BORDER_OUTER rows above and
+// below. In other words, indices from -FLOW_BORDER_OUTER * stride to
+// (cur_height + FLOW_BORDER_OUTER) * stride - 1 must be valid.
+//
+// Note that the same stride is used for u before and after upscaling
+// and for the temporary buffer, for simplicity.
+//
+// A note on phasing:
+//
+// The flow fields at two adjacent pyramid levels are offset from each other,
+// and we need to account for this in the construction of the interpolation
+// kernels.
+//
+// Consider an 8x8 pixel patch at pyramid level n. This is split into four
+// patches at pyramid level n-1. Bringing these patches back up to pyramid level
+// n, each sub-patch covers 4x4 pixels, and between them they cover the same
+// 8x8 region.
+//
+// Therefore, at pyramid level n, two adjacent patches look like this:
+//
+// + - - - - - - - + - - - - - - - +
+// | | |
+// | x x | x x |
+// | | |
+// | # | # |
+// | | |
+// | x x | x x |
+// | | |
+// + - - - - - - - + - - - - - - - +
+//
+// where # marks the center of a patch at pyramid level n (the input to this
+// function), and x marks the center of a patch at pyramid level n-1 (the output
+// of this function).
+//
+// By counting pixels (marked by +, -, and |), we can see that the flow vectors
+// at pyramid level n-1 are offset relative to the flow vectors at pyramid
+// level n, by 1/4 of the larger (input) patch size. Therefore, our
+// interpolation kernels need to have phases of 0.25 and 0.75.
+//
+// In addition, in order to handle the frame edges correctly, we need to
+// generate one output vector to the left and one to the right of each input
+// vector, even though these must be interpolated using different source points.
+static void upscale_flow_component(double *flow, int cur_width, int cur_height,
+ int stride, double *tmpbuf) {
+ const int half_len = FLOW_UPSCALE_TAPS / 2;
+
+ // Check that the outer border is large enough to avoid needing to clamp
+ // the source locations
+ assert(half_len <= FLOW_BORDER_OUTER);
+
+ // Horizontal upscale and multiply by 2
+ for (int i = 0; i < cur_height; i++) {
+ for (int j = 0; j < cur_width; j++) {
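+      // Left output pixel is 0.25 units to the left of the input pixel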
+ double left = 0;
+ for (int k = -half_len; k < half_len; k++) {
+ left +=
+ flow[i * stride + (j + k)] * flow_upscale_filter[0][k + half_len];
+ }
+ tmpbuf[i * stride + (2 * j + 0)] = 2.0 * left;
+
+ // Right output pixel is 0.25 units to the right of the input pixel
+ double right = 0;
+ for (int k = -(half_len - 1); k < (half_len + 1); k++) {
+ right += flow[i * stride + (j + k)] *
+ flow_upscale_filter[1][k + (half_len - 1)];
+ }
+ tmpbuf[i * stride + (2 * j + 1)] = 2.0 * right;
+ }
+ }
+
+ // Fill in top and bottom borders of tmpbuf
+ const double *top_row = &tmpbuf[0];
+ for (int i = -FLOW_BORDER_OUTER; i < 0; i++) {
+ double *row = &tmpbuf[i * stride];
+ memcpy(row, top_row, 2 * cur_width * sizeof(*row));
+ }
+
+ const double *bottom_row = &tmpbuf[(cur_height - 1) * stride];
+ for (int i = cur_height; i < cur_height + FLOW_BORDER_OUTER; i++) {
+ double *row = &tmpbuf[i * stride];
+ memcpy(row, bottom_row, 2 * cur_width * sizeof(*row));
+ }
+
+ // Vertical upscale
+ int upscaled_width = cur_width * 2;
+ for (int i = 0; i < cur_height; i++) {
+ for (int j = 0; j < upscaled_width; j++) {
+ double top = 0;
+ for (int k = -half_len; k < half_len; k++) {
+ top +=
+ tmpbuf[(i + k) * stride + j] * flow_upscale_filter[0][k + half_len];
+ }
+ flow[(2 * i) * stride + j] = top;
+
+ double bottom = 0;
+ for (int k = -(half_len - 1); k < (half_len + 1); k++) {
+ bottom += tmpbuf[(i + k) * stride + j] *
+ flow_upscale_filter[1][k + (half_len - 1)];
+ }
+ flow[(2 * i + 1) * stride + j] = bottom;
+ }
+ }
+}
+
+// Note: This function requires flow->u and flow->v to be zero-initialized on
+// entry; alloc_flow_field() guarantees this by allocating them with
+// aom_calloc().
+static bool compute_flow_field(const ImagePyramid *src_pyr,
+ const ImagePyramid *ref_pyr, FlowField *flow) {
+ bool mem_status = true;
+ assert(src_pyr->n_levels == ref_pyr->n_levels);
+
+ double *flow_u = flow->u;
+ double *flow_v = flow->v;
+
+ double *tmpbuf0;
+ double *tmpbuf;
+
+ if (src_pyr->n_levels < 2) {
+ // tmpbuf not needed
+ tmpbuf0 = NULL;
+ tmpbuf = NULL;
+ } else {
+ // This line must match the calculation of cur_flow_height below
+ const int layer1_height = src_pyr->layers[1].height >> DOWNSAMPLE_SHIFT;
+
+ const size_t tmpbuf_size =
+ (layer1_height + 2 * FLOW_BORDER_OUTER) * flow->stride;
+ tmpbuf0 = aom_malloc(tmpbuf_size * sizeof(*tmpbuf0));
+ if (!tmpbuf0) {
+ mem_status = false;
+ goto free_tmpbuf;
+ }
+ tmpbuf = tmpbuf0 + FLOW_BORDER_OUTER * flow->stride;
+ }
+
+ // Compute flow field from coarsest to finest level of the pyramid
+ //
+ // Note: We stop after refining pyramid level 1 and interpolating it to
+ // generate an initial flow field at level 0. We do *not* refine the dense
+ // flow field at level 0. Instead, we wait until we have generated
+ // correspondences by interpolating this flow field, and then refine the
+ // correspondences themselves. This is both faster and gives better output
+ // compared to refining the flow field at level 0 and then interpolating.
+ for (int level = src_pyr->n_levels - 1; level >= 1; --level) {
+ const PyramidLayer *cur_layer = &src_pyr->layers[level];
+ const int cur_width = cur_layer->width;
+ const int cur_height = cur_layer->height;
+ const int cur_stride = cur_layer->stride;
+
+ const uint8_t *src_buffer = cur_layer->buffer;
+ const uint8_t *ref_buffer = ref_pyr->layers[level].buffer;
+
+ const int cur_flow_width = cur_width >> DOWNSAMPLE_SHIFT;
+ const int cur_flow_height = cur_height >> DOWNSAMPLE_SHIFT;
+ const int cur_flow_stride = flow->stride;
+
+ for (int i = FLOW_BORDER_INNER; i < cur_flow_height - FLOW_BORDER_INNER;
+ i += 1) {
+ for (int j = FLOW_BORDER_INNER; j < cur_flow_width - FLOW_BORDER_INNER;
+ j += 1) {
+ const int flow_field_idx = i * cur_flow_stride + j;
+
+ // Calculate the position of a patch of size DISFLOW_PATCH_SIZE pixels,
+ // which is centered on the region covered by this flow field entry
+ const int patch_center_x =
+ (j << DOWNSAMPLE_SHIFT) + UPSAMPLE_CENTER_OFFSET; // In pixels
+ const int patch_center_y =
+ (i << DOWNSAMPLE_SHIFT) + UPSAMPLE_CENTER_OFFSET; // In pixels
+ const int patch_tl_x = patch_center_x - DISFLOW_PATCH_CENTER;
+ const int patch_tl_y = patch_center_y - DISFLOW_PATCH_CENTER;
+ assert(patch_tl_x >= 0);
+ assert(patch_tl_y >= 0);
+
+ aom_compute_flow_at_point(src_buffer, ref_buffer, patch_tl_x,
+ patch_tl_y, cur_width, cur_height, cur_stride,
+ &flow_u[flow_field_idx],
+ &flow_v[flow_field_idx]);
+ }
+ }
+
+ // Fill in the areas which we haven't explicitly computed, with copies
+ // of the outermost values which we did compute
+ fill_flow_field_borders(flow_u, cur_flow_width, cur_flow_height,
+ cur_flow_stride);
+ fill_flow_field_borders(flow_v, cur_flow_width, cur_flow_height,
+ cur_flow_stride);
+
+ if (level > 0) {
+ const int upscale_flow_width = cur_flow_width << 1;
+ const int upscale_flow_height = cur_flow_height << 1;
+ const int upscale_stride = flow->stride;
+
+ upscale_flow_component(flow_u, cur_flow_width, cur_flow_height,
+ cur_flow_stride, tmpbuf);
+ upscale_flow_component(flow_v, cur_flow_width, cur_flow_height,
+ cur_flow_stride, tmpbuf);
+
+ // If we didn't fill in the rightmost column or bottommost row during
+ // upsampling (in order to keep the ratio to exactly 2), fill them
+ // in here by copying the next closest column/row
+ const PyramidLayer *next_layer = &src_pyr->layers[level - 1];
+ const int next_flow_width = next_layer->width >> DOWNSAMPLE_SHIFT;
+ const int next_flow_height = next_layer->height >> DOWNSAMPLE_SHIFT;
+
+ // Rightmost column
+ if (next_flow_width > upscale_flow_width) {
+ assert(next_flow_width == upscale_flow_width + 1);
+ for (int i = 0; i < upscale_flow_height; i++) {
+ const int index = i * upscale_stride + upscale_flow_width;
+ flow_u[index] = flow_u[index - 1];
+ flow_v[index] = flow_v[index - 1];
+ }
+ }
+
+ // Bottommost row
+ if (next_flow_height > upscale_flow_height) {
+ assert(next_flow_height == upscale_flow_height + 1);
+ for (int j = 0; j < next_flow_width; j++) {
+ const int index = upscale_flow_height * upscale_stride + j;
+ flow_u[index] = flow_u[index - upscale_stride];
+ flow_v[index] = flow_v[index - upscale_stride];
+ }
+ }
+ }
+ }
+
+free_tmpbuf:
+ aom_free(tmpbuf0);
+ return mem_status;
+}
+
+static FlowField *alloc_flow_field(int frame_width, int frame_height) {
+ FlowField *flow = (FlowField *)aom_malloc(sizeof(FlowField));
+ if (flow == NULL) return NULL;
+
+ // Calculate the size of the bottom (largest) layer of the flow pyramid
+ flow->width = frame_width >> DOWNSAMPLE_SHIFT;
+ flow->height = frame_height >> DOWNSAMPLE_SHIFT;
+ flow->stride = flow->width + 2 * FLOW_BORDER_OUTER;
+
+ const size_t flow_size =
+ flow->stride * (size_t)(flow->height + 2 * FLOW_BORDER_OUTER);
+
+ flow->buf0 = aom_calloc(2 * flow_size, sizeof(*flow->buf0));
+ if (!flow->buf0) {
+ aom_free(flow);
+ return NULL;
+ }
+
+ flow->u = flow->buf0 + FLOW_BORDER_OUTER * flow->stride + FLOW_BORDER_OUTER;
+ flow->v = flow->u + flow_size;
+
+ return flow;
+}
+
+static void free_flow_field(FlowField *flow) {
+ aom_free(flow->buf0);
+ aom_free(flow);
+}
+
+// Compute flow field between `src` and `ref`, and then use that flow to
+// compute a global motion model relating the two frames.
+//
+// Following the convention in flow_estimation.h, the flow vectors are computed
+// at fixed points in `src` and point to the corresponding locations in `ref`,
+// regardless of the temporal ordering of the frames.
+bool av1_compute_global_motion_disflow(TransformationType type,
+ YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *ref, int bit_depth,
+ MotionModel *motion_models,
+ int num_motion_models,
+ bool *mem_alloc_failed) {
+ // Precompute information we will need about each frame
+ ImagePyramid *src_pyramid = src->y_pyramid;
+ CornerList *src_corners = src->corners;
+ ImagePyramid *ref_pyramid = ref->y_pyramid;
+ if (!aom_compute_pyramid(src, bit_depth, src_pyramid)) {
+ *mem_alloc_failed = true;
+ return false;
+ }
+ if (!av1_compute_corner_list(src_pyramid, src_corners)) {
+ *mem_alloc_failed = true;
+ return false;
+ }
+ if (!aom_compute_pyramid(ref, bit_depth, ref_pyramid)) {
+ *mem_alloc_failed = true;
+ return false;
+ }
+
+ const int src_width = src_pyramid->layers[0].width;
+ const int src_height = src_pyramid->layers[0].height;
+ assert(ref_pyramid->layers[0].width == src_width);
+ assert(ref_pyramid->layers[0].height == src_height);
+
+ FlowField *flow = alloc_flow_field(src_width, src_height);
+ if (!flow) {
+ *mem_alloc_failed = true;
+ return false;
+ }
+
+ if (!compute_flow_field(src_pyramid, ref_pyramid, flow)) {
+ *mem_alloc_failed = true;
+ free_flow_field(flow);
+ return false;
+ }
+
+ // find correspondences between the two images using the flow field
+ Correspondence *correspondences =
+ aom_malloc(src_corners->num_corners * sizeof(*correspondences));
+ if (!correspondences) {
+ *mem_alloc_failed = true;
+ free_flow_field(flow);
+ return false;
+ }
+
+ const int num_correspondences = determine_disflow_correspondence(
+ src_pyramid, ref_pyramid, src_corners, flow, correspondences);
+
+ bool result = ransac(correspondences, num_correspondences, type,
+ motion_models, num_motion_models, mem_alloc_failed);
+
+ aom_free(correspondences);
+ free_flow_field(flow);
+ return result;
+}
diff --git a/third_party/aom/aom_dsp/flow_estimation/disflow.h b/third_party/aom/aom_dsp/flow_estimation/disflow.h
new file mode 100644
index 0000000000..ef877b638c
--- /dev/null
+++ b/third_party/aom/aom_dsp/flow_estimation/disflow.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_FLOW_ESTIMATION_DISFLOW_H_
+#define AOM_AOM_DSP_FLOW_ESTIMATION_DISFLOW_H_
+
+#include <stdbool.h>
+
+#include "aom_dsp/flow_estimation/flow_estimation.h"
+#include "aom_dsp/rect.h"
+#include "aom_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Number of pyramid levels in disflow computation
+#define DISFLOW_PYRAMID_LEVELS 12
+
+// Size of square patches in the disflow dense grid
+// Must be a power of 2
+#define DISFLOW_PATCH_SIZE_LOG2 3
+#define DISFLOW_PATCH_SIZE (1 << DISFLOW_PATCH_SIZE_LOG2)
+// Center point of square patch
+#define DISFLOW_PATCH_CENTER ((DISFLOW_PATCH_SIZE / 2) - 1)
+
+// Overall scale of the `dx`, `dy` and `dt` arrays in the disflow code
+// In other words, the various derivatives are calculated with an internal
+// precision of (8 + DISFLOW_DERIV_SCALE_LOG2) bits, from an 8-bit input.
+//
+// This must be carefully synchronized with the code in sobel_filter()
+// (which fills the dx and dy arrays) and compute_flow_error() (which
+// fills dt); see the comments in those functions for more details
+#define DISFLOW_DERIV_SCALE_LOG2 3
+#define DISFLOW_DERIV_SCALE (1 << DISFLOW_DERIV_SCALE_LOG2)
+
+// Scale factor applied to each step in the main refinement loop
+//
+// This should be <= 1.0 to avoid overshoot. Values below 1.0
+// may help in some cases, but slow convergence overall, so
+// will require careful tuning.
+// TODO(rachelbarker): Tune this value
+#define DISFLOW_STEP_SIZE 1.0
+
+// Step size at which we should terminate iteration
+// The idea here is that, if we take a step which is much smaller than 1px in
+// size, then the values won't change much from iteration to iteration, so
+// many future steps will also be small, and that won't have much effect
+// on the ultimate result. So we can terminate early.
+//
+// To look at it another way, when we take a small step, that means that
+// either we're near to convergence (so can stop), or we're stuck in a
+// shallow valley and will take many iterations to get unstuck.
+//
+// Solving the latter properly requires fancier methods, such as "gradient
+// descent with momentum". For now, we terminate to avoid wasting a ton of
+// time on points which are either nearly-converged or stuck.
+//
+// Terminating at 1/8 px seems to give good results for global motion estimation
+#define DISFLOW_STEP_SIZE_THRESOLD (1. / 8.)
+
+// Max number of iterations if warp convergence is not found
+#define DISFLOW_MAX_ITR 4
+
+// Internal precision of cubic interpolation filters
+// The limiting factor here is that:
+// * Before integerizing, the maximum value of any kernel tap is 1.0
+// * After integerizing, each tap must fit into an int16_t.
+// Thus the largest multiplier we can get away with is 2^14 = 16384,
+// as 2^15 = 32768 is too large to fit in an int16_t.
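+//
+// Concretely, the double-precision kernel taps are converted as
+//   kernel_int[i] = (int16_t)rint(kernel_dbl[i] * (1 << DISFLOW_INTERP_BITS))
+// (see the get_cubic_kernel_int() helpers), so a tap of 1.0 maps to 16384.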
+#define DISFLOW_INTERP_BITS 14
+
+typedef struct {
+ // Start of allocation for u and v buffers
+ double *buf0;
+
+ // x and y directions of flow, per patch
+ double *u;
+ double *v;
+
+ // Sizes of the above arrays
+ int width;
+ int height;
+ int stride;
+} FlowField;
+
+bool av1_compute_global_motion_disflow(TransformationType type,
+ YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *ref, int bit_depth,
+ MotionModel *motion_models,
+ int num_motion_models,
+ bool *mem_alloc_failed);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AOM_DSP_FLOW_ESTIMATION_DISFLOW_H_
diff --git a/third_party/aom/aom_dsp/flow_estimation/flow_estimation.c b/third_party/aom/aom_dsp/flow_estimation/flow_estimation.c
new file mode 100644
index 0000000000..0f47f86f55
--- /dev/null
+++ b/third_party/aom/aom_dsp/flow_estimation/flow_estimation.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "aom_dsp/flow_estimation/corner_detect.h"
+#include "aom_dsp/flow_estimation/corner_match.h"
+#include "aom_dsp/flow_estimation/disflow.h"
+#include "aom_dsp/flow_estimation/flow_estimation.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/yv12config.h"
+
+// For each global motion method, how many pyramid levels should we allocate?
+// Note that this is a maximum, and fewer levels will be allocated if the frame
+// is not large enough to need all of the specified levels
+const int global_motion_pyr_levels[GLOBAL_MOTION_METHODS] = {
+ 1, // GLOBAL_MOTION_METHOD_FEATURE_MATCH
+ 16, // GLOBAL_MOTION_METHOD_DISFLOW
+};
+
+// clang-format off
+const double kIdentityParams[MAX_PARAMDIM] = {
+ 0.0, 0.0, 1.0, 0.0, 0.0, 1.0
+};
+// clang-format on
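+
+// In the parameter layout used throughout this module (see
+// project_points_affine() in ransac.c), a motion model maps
+//   x' = params[2] * x + params[3] * y + params[0]
+//   y' = params[4] * x + params[5] * y + params[1]
+// so the values above encode the identity mapping.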
+
+// Compute a global motion model between the given source and ref frames.
+//
+// As is standard for video codecs, the resulting model maps from (x, y)
+// coordinates in `src` to the corresponding points in `ref`, regardless
+// of the temporal order of the two frames.
+//
+// Returns true if global motion estimation succeeded, false if not.
+// The output models should only be used if this function succeeds.
+bool aom_compute_global_motion(TransformationType type, YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *ref, int bit_depth,
+ GlobalMotionMethod gm_method,
+ MotionModel *motion_models,
+ int num_motion_models, bool *mem_alloc_failed) {
+ switch (gm_method) {
+ case GLOBAL_MOTION_METHOD_FEATURE_MATCH:
+ return av1_compute_global_motion_feature_match(
+ type, src, ref, bit_depth, motion_models, num_motion_models,
+ mem_alloc_failed);
+ case GLOBAL_MOTION_METHOD_DISFLOW:
+ return av1_compute_global_motion_disflow(type, src, ref, bit_depth,
+ motion_models, num_motion_models,
+ mem_alloc_failed);
+ default: assert(0 && "Unknown global motion estimation type");
+ }
+ return false;
+}
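+
+// A minimal usage sketch (illustrative only; exact buffer sizing and error
+// handling are the caller's responsibility, and the names `max_corners` and
+// `mem_failed` below are hypothetical):
+//
+//   MotionModel model = { 0 };
+//   model.inliers = aom_malloc(2 * max_corners * sizeof(*model.inliers));
+//   bool mem_failed = false;
+//   if (model.inliers &&
+//       aom_compute_global_motion(ROTZOOM, src, ref, bit_depth,
+//                                 GLOBAL_MOTION_METHOD_DISFLOW, &model, 1,
+//                                 &mem_failed)) {
+//     // model.params now maps (x, y) in `src` to the matching point in `ref`
+//   }
+//   aom_free(model.inliers);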
diff --git a/third_party/aom/aom_dsp/flow_estimation/flow_estimation.h b/third_party/aom/aom_dsp/flow_estimation/flow_estimation.h
new file mode 100644
index 0000000000..2dfae24980
--- /dev/null
+++ b/third_party/aom/aom_dsp/flow_estimation/flow_estimation.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_FLOW_ESTIMATION_H_
+#define AOM_AOM_DSP_FLOW_ESTIMATION_H_
+
+#include "aom_dsp/pyramid.h"
+#include "aom_dsp/flow_estimation/corner_detect.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/yv12config.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MAX_PARAMDIM 6
+#define MIN_INLIER_PROB 0.1
+
+/* clang-format off */
+enum {
+ IDENTITY = 0, // identity transformation, 0-parameter
+ TRANSLATION = 1, // translational motion 2-parameter
+ ROTZOOM = 2, // simplified affine with rotation + zoom only, 4-parameter
+ AFFINE = 3, // affine, 6-parameter
+ TRANS_TYPES,
+} UENUM1BYTE(TransformationType);
+/* clang-format on */
+
+// number of parameters used by each transformation in TransformationTypes
+static const int trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 };
+
+// Available methods which can be used for global motion estimation
+typedef enum {
+ GLOBAL_MOTION_METHOD_FEATURE_MATCH,
+ GLOBAL_MOTION_METHOD_DISFLOW,
+ GLOBAL_MOTION_METHOD_LAST = GLOBAL_MOTION_METHOD_DISFLOW,
+ GLOBAL_MOTION_METHODS
+} GlobalMotionMethod;
+
+typedef struct {
+ double params[MAX_PARAMDIM];
+ int *inliers;
+ int num_inliers;
+} MotionModel;
+
+// Data structure to store a single correspondence point during global
+// motion search.
+//
+// A correspondence (x, y) -> (rx, ry) means that point (x, y) in the
+// source frame corresponds to point (rx, ry) in the ref frame.
+typedef struct {
+ double x, y;
+ double rx, ry;
+} Correspondence;
+
+// For each global motion method, how many pyramid levels should we allocate?
+// Note that this is a maximum, and fewer levels will be allocated if the frame
+// is not large enough to need all of the specified levels
+extern const int global_motion_pyr_levels[GLOBAL_MOTION_METHODS];
+
+// Which global motion method should we use in practice?
+// Disflow is both faster and gives better results than feature matching in
+// practically all cases, so we use disflow by default
+static const GlobalMotionMethod default_global_motion_method =
+ GLOBAL_MOTION_METHOD_DISFLOW;
+
+extern const double kIdentityParams[MAX_PARAMDIM];
+
+// Compute a global motion model between the given source and ref frames.
+//
+// As is standard for video codecs, the resulting model maps from (x, y)
+// coordinates in `src` to the corresponding points in `ref`, regardless
+// of the temporal order of the two frames.
+//
+// Returns true if global motion estimation succeeded, false if not.
+// The output models should only be used if this function succeeds.
+bool aom_compute_global_motion(TransformationType type, YV12_BUFFER_CONFIG *src,
+ YV12_BUFFER_CONFIG *ref, int bit_depth,
+ GlobalMotionMethod gm_method,
+ MotionModel *motion_models,
+ int num_motion_models, bool *mem_alloc_failed);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AOM_DSP_FLOW_ESTIMATION_H_
diff --git a/third_party/aom/aom_dsp/flow_estimation/ransac.c b/third_party/aom/aom_dsp/flow_estimation/ransac.c
new file mode 100644
index 0000000000..b88a07b023
--- /dev/null
+++ b/third_party/aom/aom_dsp/flow_estimation/ransac.c
@@ -0,0 +1,484 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <memory.h>
+#include <math.h>
+#include <time.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include <string.h>
+#include <assert.h>
+
+#include "aom_dsp/flow_estimation/ransac.h"
+#include "aom_dsp/mathutils.h"
+#include "aom_mem/aom_mem.h"
+
+// TODO(rachelbarker): Remove dependence on code in av1/encoder/
+#include "av1/encoder/random.h"
+
+#define MAX_MINPTS 4
+#define MINPTS_MULTIPLIER 5
+
+#define INLIER_THRESHOLD 1.25
+#define INLIER_THRESHOLD_SQUARED (INLIER_THRESHOLD * INLIER_THRESHOLD)
+#define NUM_TRIALS 20
+
+// Flag to enable functions for finding TRANSLATION type models.
+//
+// These models are not considered currently due to a spec bug (see comments
+// in gm_get_motion_vector() in av1/common/mv.h). Thus we don't need to compile
+// the corresponding search functions; however, it is nice to keep the source
+// around, disabled, for completeness.
+#define ALLOW_TRANSLATION_MODELS 0
+
+////////////////////////////////////////////////////////////////////////////////
+// ransac
+typedef bool (*IsDegenerateFunc)(double *p);
+typedef bool (*FindTransformationFunc)(int points, const double *points1,
+ const double *points2, double *params);
+typedef void (*ProjectPointsFunc)(const double *mat, const double *points,
+ double *proj, int n, int stride_points,
+ int stride_proj);
+
+// vtable-like structure which stores all of the information needed by RANSAC
+// for a particular model type
+typedef struct {
+ IsDegenerateFunc is_degenerate;
+ FindTransformationFunc find_transformation;
+ ProjectPointsFunc project_points;
+ int minpts;
+} RansacModelInfo;
+
+#if ALLOW_TRANSLATION_MODELS
+static void project_points_translation(const double *mat, const double *points,
+ double *proj, int n, int stride_points,
+ int stride_proj) {
+ int i;
+ for (i = 0; i < n; ++i) {
+ const double x = *(points++), y = *(points++);
+ *(proj++) = x + mat[0];
+ *(proj++) = y + mat[1];
+ points += stride_points - 2;
+ proj += stride_proj - 2;
+ }
+}
+#endif // ALLOW_TRANSLATION_MODELS
+
+static void project_points_affine(const double *mat, const double *points,
+ double *proj, int n, int stride_points,
+ int stride_proj) {
+ int i;
+ for (i = 0; i < n; ++i) {
+ const double x = *(points++), y = *(points++);
+ *(proj++) = mat[2] * x + mat[3] * y + mat[0];
+ *(proj++) = mat[4] * x + mat[5] * y + mat[1];
+ points += stride_points - 2;
+ proj += stride_proj - 2;
+ }
+}
+
+#if ALLOW_TRANSLATION_MODELS
+static bool find_translation(int np, const double *pts1, const double *pts2,
+ double *params) {
+ double sumx = 0;
+ double sumy = 0;
+
+ for (int i = 0; i < np; ++i) {
+ double dx = *(pts2++);
+ double dy = *(pts2++);
+ double sx = *(pts1++);
+ double sy = *(pts1++);
+
+ sumx += dx - sx;
+ sumy += dy - sy;
+ }
+
+ params[0] = sumx / np;
+ params[1] = sumy / np;
+ params[2] = 1;
+ params[3] = 0;
+ params[4] = 0;
+ params[5] = 1;
+ return true;
+}
+#endif // ALLOW_TRANSLATION_MODELS
+
+static bool find_rotzoom(int np, const double *pts1, const double *pts2,
+ double *params) {
+ const int n = 4; // Size of least-squares problem
+ double mat[4 * 4]; // Accumulator for A'A
+ double y[4]; // Accumulator for A'b
+ double a[4]; // Single row of A
+ double b; // Single element of b
+
+ least_squares_init(mat, y, n);
+ for (int i = 0; i < np; ++i) {
+ double dx = *(pts2++);
+ double dy = *(pts2++);
+ double sx = *(pts1++);
+ double sy = *(pts1++);
+
+ a[0] = 1;
+ a[1] = 0;
+ a[2] = sx;
+ a[3] = sy;
+ b = dx;
+ least_squares_accumulate(mat, y, a, b, n);
+
+ a[0] = 0;
+ a[1] = 1;
+ a[2] = sy;
+ a[3] = -sx;
+ b = dy;
+ least_squares_accumulate(mat, y, a, b, n);
+ }
+
+ // Fill in params[0] .. params[3] with output model
+ if (!least_squares_solve(mat, y, params, n)) {
+ return false;
+ }
+
+ // Fill in remaining parameters
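+  // A rotzoom model is an affine model whose linear part has the form
+  //   |  a  b |
+  //   | -b  a |
+  // (a scaled rotation), so the second row is fully determined by the first.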
+ params[4] = -params[3];
+ params[5] = params[2];
+
+ return true;
+}
+
+static bool find_affine(int np, const double *pts1, const double *pts2,
+ double *params) {
+ // Note: The least squares problem for affine models is 6-dimensional,
+ // but it splits into two independent 3-dimensional subproblems.
+ // Solving these two subproblems separately and recombining at the end
+ // results in less total computation than solving the 6-dimensional
+ // problem directly.
+ //
+ // The two subproblems correspond to all the parameters which contribute
+ // to the x output of the model, and all the parameters which contribute
+ // to the y output, respectively.
+
+ const int n = 3; // Size of each least-squares problem
+ double mat[2][3 * 3]; // Accumulator for A'A
+ double y[2][3]; // Accumulator for A'b
+ double x[2][3]; // Output vector
+ double a[2][3]; // Single row of A
+ double b[2]; // Single element of b
+
+ least_squares_init(mat[0], y[0], n);
+ least_squares_init(mat[1], y[1], n);
+ for (int i = 0; i < np; ++i) {
+ double dx = *(pts2++);
+ double dy = *(pts2++);
+ double sx = *(pts1++);
+ double sy = *(pts1++);
+
+ a[0][0] = 1;
+ a[0][1] = sx;
+ a[0][2] = sy;
+ b[0] = dx;
+ least_squares_accumulate(mat[0], y[0], a[0], b[0], n);
+
+ a[1][0] = 1;
+ a[1][1] = sx;
+ a[1][2] = sy;
+ b[1] = dy;
+ least_squares_accumulate(mat[1], y[1], a[1], b[1], n);
+ }
+
+ if (!least_squares_solve(mat[0], y[0], x[0], n)) {
+ return false;
+ }
+ if (!least_squares_solve(mat[1], y[1], x[1], n)) {
+ return false;
+ }
+
+ // Rearrange least squares result to form output model
+ params[0] = x[0][0];
+ params[1] = x[1][0];
+ params[2] = x[0][1];
+ params[3] = x[0][2];
+ params[4] = x[1][1];
+ params[5] = x[1][2];
+
+ return true;
+}
+
+typedef struct {
+ int num_inliers;
+ double sse; // Sum of squared errors of inliers
+ int *inlier_indices;
+} RANSAC_MOTION;
+
+// Return -1 if 'a' is a better motion, 1 if 'b' is better, 0 otherwise.
+static int compare_motions(const void *arg_a, const void *arg_b) {
+ const RANSAC_MOTION *motion_a = (RANSAC_MOTION *)arg_a;
+ const RANSAC_MOTION *motion_b = (RANSAC_MOTION *)arg_b;
+
+ if (motion_a->num_inliers > motion_b->num_inliers) return -1;
+ if (motion_a->num_inliers < motion_b->num_inliers) return 1;
+ if (motion_a->sse < motion_b->sse) return -1;
+ if (motion_a->sse > motion_b->sse) return 1;
+ return 0;
+}
+
+static bool is_better_motion(const RANSAC_MOTION *motion_a,
+ const RANSAC_MOTION *motion_b) {
+ return compare_motions(motion_a, motion_b) < 0;
+}
+
+static void copy_points_at_indices(double *dest, const double *src,
+ const int *indices, int num_points) {
+ for (int i = 0; i < num_points; ++i) {
+ const int index = indices[i];
+ dest[i * 2] = src[index * 2];
+ dest[i * 2 + 1] = src[index * 2 + 1];
+ }
+}
+
+// Returns true on success, false on error
+static bool ransac_internal(const Correspondence *matched_points, int npoints,
+ MotionModel *motion_models, int num_desired_motions,
+ const RansacModelInfo *model_info,
+ bool *mem_alloc_failed) {
+ assert(npoints >= 0);
+ int i = 0;
+ int minpts = model_info->minpts;
+ bool ret_val = true;
+
+ unsigned int seed = (unsigned int)npoints;
+
+ int indices[MAX_MINPTS] = { 0 };
+
+ double *points1, *points2;
+ double *corners1, *corners2;
+ double *projected_corners;
+
+ // Store information for the num_desired_motions best transformations found
+ // and the worst motion among them, as well as the motion currently under
+ // consideration.
+ RANSAC_MOTION *motions, *worst_kept_motion = NULL;
+ RANSAC_MOTION current_motion;
+
+ // Store the parameters and the indices of the inlier points for the motion
+ // currently under consideration.
+ double params_this_motion[MAX_PARAMDIM];
+
+ if (npoints < minpts * MINPTS_MULTIPLIER || npoints == 0) {
+ return false;
+ }
+
+ int min_inliers = AOMMAX((int)(MIN_INLIER_PROB * npoints), minpts);
+
+ points1 = (double *)aom_malloc(sizeof(*points1) * npoints * 2);
+ points2 = (double *)aom_malloc(sizeof(*points2) * npoints * 2);
+ corners1 = (double *)aom_malloc(sizeof(*corners1) * npoints * 2);
+ corners2 = (double *)aom_malloc(sizeof(*corners2) * npoints * 2);
+ projected_corners =
+ (double *)aom_malloc(sizeof(*projected_corners) * npoints * 2);
+ motions =
+ (RANSAC_MOTION *)aom_calloc(num_desired_motions, sizeof(RANSAC_MOTION));
+
+ // Allocate one large buffer which will be carved up to store the inlier
+ // indices for the current motion plus the num_desired_motions many
+ // output models
+ // This allows us to keep the allocation/deallocation logic simple, without
+ // having to (for example) check that `motions` is non-null before allocating
+ // the inlier arrays
+ int *inlier_buffer = (int *)aom_malloc(sizeof(*inlier_buffer) * npoints *
+ (num_desired_motions + 1));
+
+ if (!(points1 && points2 && corners1 && corners2 && projected_corners &&
+ motions && inlier_buffer)) {
+ ret_val = false;
+ *mem_alloc_failed = true;
+ goto finish_ransac;
+ }
+
+ // Once all our allocations are known-good, we can fill in our structures
+ worst_kept_motion = motions;
+
+ for (i = 0; i < num_desired_motions; ++i) {
+ motions[i].inlier_indices = inlier_buffer + i * npoints;
+ }
+ memset(&current_motion, 0, sizeof(current_motion));
+ current_motion.inlier_indices = inlier_buffer + num_desired_motions * npoints;
+
+ for (i = 0; i < npoints; ++i) {
+ corners1[2 * i + 0] = matched_points[i].x;
+ corners1[2 * i + 1] = matched_points[i].y;
+ corners2[2 * i + 0] = matched_points[i].rx;
+ corners2[2 * i + 1] = matched_points[i].ry;
+ }
+
+ for (int trial_count = 0; trial_count < NUM_TRIALS; trial_count++) {
+ lcg_pick(npoints, minpts, indices, &seed);
+
+ copy_points_at_indices(points1, corners1, indices, minpts);
+ copy_points_at_indices(points2, corners2, indices, minpts);
+
+ if (model_info->is_degenerate(points1)) {
+ continue;
+ }
+
+ if (!model_info->find_transformation(minpts, points1, points2,
+ params_this_motion)) {
+ continue;
+ }
+
+ model_info->project_points(params_this_motion, corners1, projected_corners,
+ npoints, 2, 2);
+
+ current_motion.num_inliers = 0;
+ double sse = 0.0;
+ for (i = 0; i < npoints; ++i) {
+ double dx = projected_corners[i * 2] - corners2[i * 2];
+ double dy = projected_corners[i * 2 + 1] - corners2[i * 2 + 1];
+ double squared_error = dx * dx + dy * dy;
+
+ if (squared_error < INLIER_THRESHOLD_SQUARED) {
+ current_motion.inlier_indices[current_motion.num_inliers++] = i;
+ sse += squared_error;
+ }
+ }
+
+ if (current_motion.num_inliers < min_inliers) {
+ // Reject models with too few inliers
+ continue;
+ }
+
+ current_motion.sse = sse;
+ if (is_better_motion(&current_motion, worst_kept_motion)) {
+ // This motion is better than the worst currently kept motion. Remember
+ // the inlier points and sse. The parameters for each kept motion
+ // will be recomputed later using only the inliers.
+ worst_kept_motion->num_inliers = current_motion.num_inliers;
+ worst_kept_motion->sse = current_motion.sse;
+
+ // Rather than copying the (potentially many) inlier indices from
+ // current_motion.inlier_indices to worst_kept_motion->inlier_indices,
+ // we can swap the underlying pointers.
+ //
+ // This is okay because the next time current_motion.inlier_indices
+ // is used will be in the next trial, where we ignore its previous
+ // contents anyway. And both arrays will be deallocated together at the
+ // end of this function, so there are no lifetime issues.
+ int *tmp = worst_kept_motion->inlier_indices;
+ worst_kept_motion->inlier_indices = current_motion.inlier_indices;
+ current_motion.inlier_indices = tmp;
+
+ // Determine the new worst kept motion and its num_inliers and sse.
+ for (i = 0; i < num_desired_motions; ++i) {
+ if (is_better_motion(worst_kept_motion, &motions[i])) {
+ worst_kept_motion = &motions[i];
+ }
+ }
+ }
+ }
+
+ // Sort the motions, best first.
+ qsort(motions, num_desired_motions, sizeof(RANSAC_MOTION), compare_motions);
+
+ // Recompute the motions using only the inliers.
+ for (i = 0; i < num_desired_motions; ++i) {
+ int num_inliers = motions[i].num_inliers;
+ if (num_inliers > 0) {
+ assert(num_inliers >= minpts);
+
+ copy_points_at_indices(points1, corners1, motions[i].inlier_indices,
+ num_inliers);
+ copy_points_at_indices(points2, corners2, motions[i].inlier_indices,
+ num_inliers);
+
+ if (!model_info->find_transformation(num_inliers, points1, points2,
+ motion_models[i].params)) {
+ // In the unlikely event that this model fitting fails,
+ // we don't have a good fallback. So just clear the output
+ // model and move on
+ memcpy(motion_models[i].params, kIdentityParams,
+ MAX_PARAMDIM * sizeof(*(motion_models[i].params)));
+ motion_models[i].num_inliers = 0;
+ continue;
+ }
+
+ // Populate inliers array
+ for (int j = 0; j < num_inliers; j++) {
+ int index = motions[i].inlier_indices[j];
+ const Correspondence *corr = &matched_points[index];
+ motion_models[i].inliers[2 * j + 0] = (int)rint(corr->x);
+ motion_models[i].inliers[2 * j + 1] = (int)rint(corr->y);
+ }
+ motion_models[i].num_inliers = num_inliers;
+ } else {
+ memcpy(motion_models[i].params, kIdentityParams,
+ MAX_PARAMDIM * sizeof(*(motion_models[i].params)));
+ motion_models[i].num_inliers = 0;
+ }
+ }
+
+finish_ransac:
+ aom_free(inlier_buffer);
+ aom_free(motions);
+ aom_free(projected_corners);
+ aom_free(corners2);
+ aom_free(corners1);
+ aom_free(points2);
+ aom_free(points1);
+
+ return ret_val;
+}
+
+static bool is_collinear3(double *p1, double *p2, double *p3) {
+ static const double collinear_eps = 1e-3;
+ const double v =
+ (p2[0] - p1[0]) * (p3[1] - p1[1]) - (p2[1] - p1[1]) * (p3[0] - p1[0]);
+ return fabs(v) < collinear_eps;
+}
+
+#if ALLOW_TRANSLATION_MODELS
+static bool is_degenerate_translation(double *p) {
+ return (p[0] - p[2]) * (p[0] - p[2]) + (p[1] - p[3]) * (p[1] - p[3]) <= 2;
+}
+#endif // ALLOW_TRANSLATION_MODELS
+
+static bool is_degenerate_affine(double *p) {
+ return is_collinear3(p, p + 2, p + 4);
+}
+
+static const RansacModelInfo ransac_model_info[TRANS_TYPES] = {
+ // IDENTITY
+ { NULL, NULL, NULL, 0 },
+// TRANSLATION
+#if ALLOW_TRANSLATION_MODELS
+ { is_degenerate_translation, find_translation, project_points_translation,
+ 3 },
+#else
+ { NULL, NULL, NULL, 0 },
+#endif
+ // ROTZOOM
+ { is_degenerate_affine, find_rotzoom, project_points_affine, 3 },
+ // AFFINE
+ { is_degenerate_affine, find_affine, project_points_affine, 3 },
+};
+
+// Returns true on success, false on error
+bool ransac(const Correspondence *matched_points, int npoints,
+ TransformationType type, MotionModel *motion_models,
+ int num_desired_motions, bool *mem_alloc_failed) {
+#if ALLOW_TRANSLATION_MODELS
+ assert(type > IDENTITY && type < TRANS_TYPES);
+#else
+ assert(type > TRANSLATION && type < TRANS_TYPES);
+#endif // ALLOW_TRANSLATION_MODELS
+
+ return ransac_internal(matched_points, npoints, motion_models,
+ num_desired_motions, &ransac_model_info[type],
+ mem_alloc_failed);
+}
diff --git a/third_party/aom/aom_dsp/flow_estimation/ransac.h b/third_party/aom/aom_dsp/flow_estimation/ransac.h
new file mode 100644
index 0000000000..0529b6e13c
--- /dev/null
+++ b/third_party/aom/aom_dsp/flow_estimation/ransac.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_FLOW_ESTIMATION_RANSAC_H_
+#define AOM_AOM_DSP_FLOW_ESTIMATION_RANSAC_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#include <memory.h>
+#include <stdbool.h>
+
+#include "aom_dsp/flow_estimation/flow_estimation.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+bool ransac(const Correspondence *matched_points, int npoints,
+ TransformationType type, MotionModel *motion_models,
+ int num_desired_motions, bool *mem_alloc_failed);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AOM_DSP_FLOW_ESTIMATION_RANSAC_H_
diff --git a/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c b/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c
new file mode 100644
index 0000000000..87c76fa13b
--- /dev/null
+++ b/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_avx2.c
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include <immintrin.h>
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom_dsp/flow_estimation/corner_match.h"
+
+DECLARE_ALIGNED(16, static const uint8_t,
+ byte_mask[16]) = { 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 0, 0, 0 };
+#if MATCH_SZ != 13
+#error "Need to change byte_mask in corner_match_avx2.c if MATCH_SZ != 13"
+#endif
+
+/* Compute corr(frame1, frame2) * MATCH_SZ * stddev(frame1), where the
+correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows
+of each image, centered at (x1, y1) and (x2, y2) respectively.
+*/
+double av1_compute_cross_correlation_avx2(const unsigned char *frame1,
+ int stride1, int x1, int y1,
+ const unsigned char *frame2,
+ int stride2, int x2, int y2) {
+ int i, stride1_i = 0, stride2_i = 0;
+ __m256i temp1, sum_vec, sumsq2_vec, cross_vec, v, v1_1, v2_1;
+ const __m128i mask = _mm_load_si128((__m128i *)byte_mask);
+ const __m256i zero = _mm256_setzero_si256();
+ __m128i v1, v2;
+
+ sum_vec = zero;
+ sumsq2_vec = zero;
+ cross_vec = zero;
+
+ frame1 += (y1 - MATCH_SZ_BY2) * stride1 + (x1 - MATCH_SZ_BY2);
+ frame2 += (y2 - MATCH_SZ_BY2) * stride2 + (x2 - MATCH_SZ_BY2);
+
+ for (i = 0; i < MATCH_SZ; ++i) {
+ v1 = _mm_and_si128(_mm_loadu_si128((__m128i *)&frame1[stride1_i]), mask);
+ v1_1 = _mm256_cvtepu8_epi16(v1);
+ v2 = _mm_and_si128(_mm_loadu_si128((__m128i *)&frame2[stride2_i]), mask);
+ v2_1 = _mm256_cvtepu8_epi16(v2);
+
+ v = _mm256_insertf128_si256(_mm256_castsi128_si256(v1), v2, 1);
+ sumsq2_vec = _mm256_add_epi32(sumsq2_vec, _mm256_madd_epi16(v2_1, v2_1));
+
+ sum_vec = _mm256_add_epi16(sum_vec, _mm256_sad_epu8(v, zero));
+ cross_vec = _mm256_add_epi32(cross_vec, _mm256_madd_epi16(v1_1, v2_1));
+ stride1_i += stride1;
+ stride2_i += stride2;
+ }
+ __m256i sum_vec1 = _mm256_srli_si256(sum_vec, 8);
+ sum_vec = _mm256_add_epi32(sum_vec, sum_vec1);
+ int sum1_acc = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_vec));
+ int sum2_acc = _mm256_extract_epi32(sum_vec, 4);
+
+ __m256i unp_low = _mm256_unpacklo_epi64(sumsq2_vec, cross_vec);
+ __m256i unp_hig = _mm256_unpackhi_epi64(sumsq2_vec, cross_vec);
+ temp1 = _mm256_add_epi32(unp_low, unp_hig);
+
+ __m128i low_sumsq = _mm256_castsi256_si128(temp1);
+ low_sumsq = _mm_add_epi32(low_sumsq, _mm256_extractf128_si256(temp1, 1));
+ low_sumsq = _mm_add_epi32(low_sumsq, _mm_srli_epi64(low_sumsq, 32));
+ int sumsq2_acc = _mm_cvtsi128_si32(low_sumsq);
+ int cross_acc = _mm_extract_epi32(low_sumsq, 2);
+
+ int var2 = sumsq2_acc * MATCH_SZ_SQ - sum2_acc * sum2_acc;
+ int cov = cross_acc * MATCH_SZ_SQ - sum1_acc * sum2_acc;
+ return cov / sqrt((double)var2);
+}
diff --git a/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_sse4.c b/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_sse4.c
new file mode 100644
index 0000000000..b3cb5bc5fd
--- /dev/null
+++ b/third_party/aom/aom_dsp/flow_estimation/x86/corner_match_sse4.c
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include <smmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom_dsp/flow_estimation/corner_match.h"
+
+DECLARE_ALIGNED(16, static const uint8_t,
+ byte_mask[16]) = { 255, 255, 255, 255, 255, 255, 255, 255,
+ 255, 255, 255, 255, 255, 0, 0, 0 };
+#if MATCH_SZ != 13
+#error "Need to change byte_mask in corner_match_sse4.c if MATCH_SZ != 13"
+#endif
+
+/* Compute corr(frame1, frame2) * MATCH_SZ * stddev(frame1), where the
+ correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows
+ of each image, centered at (x1, y1) and (x2, y2) respectively.
+*/
+double av1_compute_cross_correlation_sse4_1(const unsigned char *frame1,
+ int stride1, int x1, int y1,
+ const unsigned char *frame2,
+ int stride2, int x2, int y2) {
+ int i;
+ // 2 16-bit partial sums in lanes 0, 4 (== 2 32-bit partial sums in lanes 0,
+ // 2)
+ __m128i sum1_vec = _mm_setzero_si128();
+ __m128i sum2_vec = _mm_setzero_si128();
+ // 4 32-bit partial sums of squares
+ __m128i sumsq2_vec = _mm_setzero_si128();
+ __m128i cross_vec = _mm_setzero_si128();
+
+ const __m128i mask = _mm_load_si128((__m128i *)byte_mask);
+ const __m128i zero = _mm_setzero_si128();
+
+ frame1 += (y1 - MATCH_SZ_BY2) * stride1 + (x1 - MATCH_SZ_BY2);
+ frame2 += (y2 - MATCH_SZ_BY2) * stride2 + (x2 - MATCH_SZ_BY2);
+
+ for (i = 0; i < MATCH_SZ; ++i) {
+ const __m128i v1 =
+ _mm_and_si128(_mm_loadu_si128((__m128i *)&frame1[i * stride1]), mask);
+ const __m128i v2 =
+ _mm_and_si128(_mm_loadu_si128((__m128i *)&frame2[i * stride2]), mask);
+
+ // Using the 'sad' intrinsic here is a bit faster than adding
+ // v1_l + v1_r and v2_l + v2_r, plus it avoids the need for a 16->32 bit
+ // conversion step later, for a net speedup of ~10%
+ sum1_vec = _mm_add_epi16(sum1_vec, _mm_sad_epu8(v1, zero));
+ sum2_vec = _mm_add_epi16(sum2_vec, _mm_sad_epu8(v2, zero));
+
+ const __m128i v1_l = _mm_cvtepu8_epi16(v1);
+ const __m128i v1_r = _mm_cvtepu8_epi16(_mm_srli_si128(v1, 8));
+ const __m128i v2_l = _mm_cvtepu8_epi16(v2);
+ const __m128i v2_r = _mm_cvtepu8_epi16(_mm_srli_si128(v2, 8));
+
+ sumsq2_vec = _mm_add_epi32(
+ sumsq2_vec,
+ _mm_add_epi32(_mm_madd_epi16(v2_l, v2_l), _mm_madd_epi16(v2_r, v2_r)));
+ cross_vec = _mm_add_epi32(
+ cross_vec,
+ _mm_add_epi32(_mm_madd_epi16(v1_l, v2_l), _mm_madd_epi16(v1_r, v2_r)));
+ }
+
+  // Now we can treat the four registers (sum1_vec, sum2_vec, sumsq2_vec,
+  // cross_vec) as holding 4 32-bit elements each, which we want to sum
+  // horizontally. We do this by transposing and then summing vertically.
+ __m128i tmp_0 = _mm_unpacklo_epi32(sum1_vec, sum2_vec);
+ __m128i tmp_1 = _mm_unpackhi_epi32(sum1_vec, sum2_vec);
+ __m128i tmp_2 = _mm_unpacklo_epi32(sumsq2_vec, cross_vec);
+ __m128i tmp_3 = _mm_unpackhi_epi32(sumsq2_vec, cross_vec);
+
+ __m128i tmp_4 = _mm_unpacklo_epi64(tmp_0, tmp_2);
+ __m128i tmp_5 = _mm_unpackhi_epi64(tmp_0, tmp_2);
+ __m128i tmp_6 = _mm_unpacklo_epi64(tmp_1, tmp_3);
+ __m128i tmp_7 = _mm_unpackhi_epi64(tmp_1, tmp_3);
+
+ __m128i res =
+ _mm_add_epi32(_mm_add_epi32(tmp_4, tmp_5), _mm_add_epi32(tmp_6, tmp_7));
+
+ int sum1 = _mm_extract_epi32(res, 0);
+ int sum2 = _mm_extract_epi32(res, 1);
+ int sumsq2 = _mm_extract_epi32(res, 2);
+ int cross = _mm_extract_epi32(res, 3);
+
+ int var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2;
+ int cov = cross * MATCH_SZ_SQ - sum1 * sum2;
+ return cov / sqrt((double)var2);
+}
diff --git a/third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c b/third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c
new file mode 100644
index 0000000000..d2b04c1973
--- /dev/null
+++ b/third_party/aom/aom_dsp/flow_estimation/x86/disflow_sse4.c
@@ -0,0 +1,558 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 3-Clause Clear License
+ * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
+ * License was not distributed with this source code in the LICENSE file, you
+ * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the
+ * Alliance for Open Media Patent License 1.0 was not distributed with this
+ * source code in the PATENTS file, you can obtain it at
+ * aomedia.org/license/patent-license/.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <smmintrin.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/flow_estimation/disflow.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+// Internal cross-check against C code
+// If you set this to 1 and compile in debug mode, then the outputs of the two
+// convolution stages will be checked against the plain C version of the code,
+// and an assertion will be fired if the results differ.
+#define CHECK_RESULTS 0
+
+// Note: Max sum(+ve coefficients) = 1.125 * scale
+static INLINE void get_cubic_kernel_dbl(double x, double kernel[4]) {
+ // Check that the fractional position is in range.
+ //
+ // Note: x is calculated from (eg.) `u_frac = u - floor(u)`.
+ // Mathematically, this implies that 0 <= x < 1. However, in practice it is
+ // possible to have x == 1 due to floating point rounding. This is fine,
+ // and we still interpolate correctly if we allow x = 1.
+ assert(0 <= x && x <= 1);
+
+ double x2 = x * x;
+ double x3 = x2 * x;
+ kernel[0] = -0.5 * x + x2 - 0.5 * x3;
+ kernel[1] = 1.0 - 2.5 * x2 + 1.5 * x3;
+ kernel[2] = 0.5 * x + 2.0 * x2 - 1.5 * x3;
+ kernel[3] = -0.5 * x2 + 0.5 * x3;
+}
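+
+// For example, at x = 0.5 the kernel above evaluates to
+//   { -0.0625, 0.5625, 0.5625, -0.0625 },
+// whose positive taps sum to 1.125, the worst case referred to in the
+// "Max sum(+ve coefficients)" note above and in the overflow analysis below.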
+
+static INLINE void get_cubic_kernel_int(double x, int16_t kernel[4]) {
+ double kernel_dbl[4];
+ get_cubic_kernel_dbl(x, kernel_dbl);
+
+ kernel[0] = (int16_t)rint(kernel_dbl[0] * (1 << DISFLOW_INTERP_BITS));
+ kernel[1] = (int16_t)rint(kernel_dbl[1] * (1 << DISFLOW_INTERP_BITS));
+ kernel[2] = (int16_t)rint(kernel_dbl[2] * (1 << DISFLOW_INTERP_BITS));
+ kernel[3] = (int16_t)rint(kernel_dbl[3] * (1 << DISFLOW_INTERP_BITS));
+}
+
+#if CHECK_RESULTS
+static INLINE int get_cubic_value_int(const int *p, const int16_t kernel[4]) {
+ return kernel[0] * p[0] + kernel[1] * p[1] + kernel[2] * p[2] +
+ kernel[3] * p[3];
+}
+#endif // CHECK_RESULTS
+
+// Computes the b vector of the flow equations for one patch, i.e.
+//   b[0] = sum(dx * dt),  b[1] = sum(dy * dt)
+// where the sums run over a DISFLOW_PATCH_SIZE x DISFLOW_PATCH_SIZE patch and
+// dt is the per-pixel error between the source patch rooted at (x, y) and the
+// reference patch rooted at (x + u, y + v), the latter sampled with bicubic
+// interpolation.
+//
+// TODO(rachelbarker): Test speed/quality impact of using bilinear interpolation
+// instead of bicubic interpolation
+static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref,
+ int width, int height, int stride, int x,
+ int y, double u, double v,
+ const int16_t *dx, const int16_t *dy,
+ int *b) {
+ // This function is written to do 8x8 convolutions only
+ assert(DISFLOW_PATCH_SIZE == 8);
+
+ // Accumulate 4 32-bit partial sums for each element of b
+ // These will be flattened at the end.
+ __m128i b0_acc = _mm_setzero_si128();
+ __m128i b1_acc = _mm_setzero_si128();
+#if CHECK_RESULTS
+ // Also keep a running sum using the C algorithm, for cross-checking
+ int c_result[2] = { 0 };
+#endif // CHECK_RESULTS
+
+ // Split offset into integer and fractional parts, and compute cubic
+ // interpolation kernels
+ const int u_int = (int)floor(u);
+ const int v_int = (int)floor(v);
+ const double u_frac = u - floor(u);
+ const double v_frac = v - floor(v);
+
+ int16_t h_kernel[4];
+ int16_t v_kernel[4];
+ get_cubic_kernel_int(u_frac, h_kernel);
+ get_cubic_kernel_int(v_frac, v_kernel);
+
+ // Storage for intermediate values between the two convolution directions
+ int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 3)];
+ int16_t *tmp = tmp_ + DISFLOW_PATCH_SIZE; // Offset by one row
+
+ // Clamp coordinates so that all pixels we fetch will remain within the
+ // allocated border region, but allow them to go far enough out that
+ // the border pixels' values do not change.
+ // Since we are calculating an 8x8 block, the bottom-right pixel
+ // in the block has coordinates (x0 + 7, y0 + 7). Then, the cubic
+ // interpolation has 4 taps, meaning that the output of pixel
+ // (x_w, y_w) depends on the pixels in the range
+ // ([x_w - 1, x_w + 2], [y_w - 1, y_w + 2]).
+ //
+ // Thus the most extreme coordinates which will be fetched are
+ // (x0 - 1, y0 - 1) and (x0 + 9, y0 + 9).
+ const int x0 = clamp(x + u_int, -9, width);
+ const int y0 = clamp(y + v_int, -9, height);
+
+ // Horizontal convolution
+
+ // Prepare the kernel vectors
+ // We split the kernel into two vectors with kernel indices:
+ // 0, 1, 0, 1, 0, 1, 0, 1, and
+ // 2, 3, 2, 3, 2, 3, 2, 3
+ __m128i h_kernel_01 = xx_set2_epi16(h_kernel[0], h_kernel[1]);
+ __m128i h_kernel_23 = xx_set2_epi16(h_kernel[2], h_kernel[3]);
+
+ __m128i round_const_h = _mm_set1_epi32(1 << (DISFLOW_INTERP_BITS - 6 - 1));
+
+ for (int i = -1; i < DISFLOW_PATCH_SIZE + 2; ++i) {
+ const int y_w = y0 + i;
+ const uint8_t *ref_row = &ref[y_w * stride + (x0 - 1)];
+ int16_t *tmp_row = &tmp[i * DISFLOW_PATCH_SIZE];
+
+ // Load this row of pixels.
+ // For an 8x8 patch, we need to load the 8 image pixels + 3 extras,
+ // for a total of 11 pixels. Here we load 16 pixels, but only use
+ // the first 11.
+ __m128i row = _mm_loadu_si128((__m128i *)ref_row);
+
+ // Expand pixels to int16s
+ __m128i px_0to7_i16 = _mm_cvtepu8_epi16(row);
+ __m128i px_4to10_i16 = _mm_cvtepu8_epi16(_mm_srli_si128(row, 4));
+
+    // The key instruction for this convolution is _mm_madd_epi16, which
+    // multiplies pointwise and then sums adjacent pairs of 16-bit products
+    // into 32-bit results.
+
+ // Compute first four outputs
+ // input pixels 0, 1, 1, 2, 2, 3, 3, 4
+ // * kernel 0, 1, 0, 1, 0, 1, 0, 1
+ __m128i px0 =
+ _mm_unpacklo_epi16(px_0to7_i16, _mm_srli_si128(px_0to7_i16, 2));
+ // input pixels 2, 3, 3, 4, 4, 5, 5, 6
+ // * kernel 2, 3, 2, 3, 2, 3, 2, 3
+ __m128i px1 = _mm_unpacklo_epi16(_mm_srli_si128(px_0to7_i16, 4),
+ _mm_srli_si128(px_0to7_i16, 6));
+ // Convolve with kernel and sum 2x2 boxes to form first 4 outputs
+ __m128i sum0 = _mm_add_epi32(_mm_madd_epi16(px0, h_kernel_01),
+ _mm_madd_epi16(px1, h_kernel_23));
+
+ __m128i out0 = _mm_srai_epi32(_mm_add_epi32(sum0, round_const_h),
+ DISFLOW_INTERP_BITS - 6);
+
+ // Compute second four outputs
+ __m128i px2 =
+ _mm_unpacklo_epi16(px_4to10_i16, _mm_srli_si128(px_4to10_i16, 2));
+ __m128i px3 = _mm_unpacklo_epi16(_mm_srli_si128(px_4to10_i16, 4),
+ _mm_srli_si128(px_4to10_i16, 6));
+ __m128i sum1 = _mm_add_epi32(_mm_madd_epi16(px2, h_kernel_01),
+ _mm_madd_epi16(px3, h_kernel_23));
+
+    // Round by just enough bits that the result is guaranteed to fit into an
+    // int16_t, so that the next (vertical) stage can use 16 x 16 -> 32 bit
+    // multiplies, which should be a fair bit faster than 32 x 32 -> 32 bit
+    // multiplies.
+    // This means shifting down so that we keep 6 extra fractional bits, for a
+    // maximum value of +18360, which can occur if u_frac == 0.5 and the input
+    // pixels are {0, 255, 255, 0}.
+ __m128i out1 = _mm_srai_epi32(_mm_add_epi32(sum1, round_const_h),
+ DISFLOW_INTERP_BITS - 6);
+
+ _mm_storeu_si128((__m128i *)tmp_row, _mm_packs_epi32(out0, out1));
+
+#if CHECK_RESULTS && !defined(NDEBUG)
+ // Cross-check
+ for (int j = 0; j < DISFLOW_PATCH_SIZE; ++j) {
+ const int x_w = x0 + j;
+ int arr[4];
+
+ arr[0] = (int)ref[y_w * stride + (x_w - 1)];
+ arr[1] = (int)ref[y_w * stride + (x_w + 0)];
+ arr[2] = (int)ref[y_w * stride + (x_w + 1)];
+ arr[3] = (int)ref[y_w * stride + (x_w + 2)];
+
+ // Apply kernel and round, keeping 6 extra bits of precision.
+ //
+ // 6 is the maximum allowable number of extra bits which will avoid
+ // the intermediate values overflowing an int16_t. The most extreme
+ // intermediate value occurs when:
+ // * The input pixels are [0, 255, 255, 0]
+ // * u_frac = 0.5
+ // In this case, the un-scaled output is 255 * 1.125 = 286.875.
+ // As an integer with 6 fractional bits, that is 18360, which fits
+ // in an int16_t. But with 7 fractional bits it would be 36720,
+ // which is too large.
+ const int c_value = ROUND_POWER_OF_TWO(get_cubic_value_int(arr, h_kernel),
+ DISFLOW_INTERP_BITS - 6);
+ (void)c_value; // Suppress warnings
+ assert(tmp_row[j] == c_value);
+ }
+#endif // CHECK_RESULTS
+ }
+
+ // Vertical convolution
+ const int round_bits = DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2;
+ __m128i round_const_v = _mm_set1_epi32(1 << (round_bits - 1));
+
+ __m128i v_kernel_01 = xx_set2_epi16(v_kernel[0], v_kernel[1]);
+ __m128i v_kernel_23 = xx_set2_epi16(v_kernel[2], v_kernel[3]);
+
+ for (int i = 0; i < DISFLOW_PATCH_SIZE; ++i) {
+ int16_t *tmp_row = &tmp[i * DISFLOW_PATCH_SIZE];
+
+ // Load 4 rows of 8 x 16-bit values
+ __m128i px0 = _mm_loadu_si128((__m128i *)(tmp_row - DISFLOW_PATCH_SIZE));
+ __m128i px1 = _mm_loadu_si128((__m128i *)tmp_row);
+ __m128i px2 = _mm_loadu_si128((__m128i *)(tmp_row + DISFLOW_PATCH_SIZE));
+ __m128i px3 =
+ _mm_loadu_si128((__m128i *)(tmp_row + 2 * DISFLOW_PATCH_SIZE));
+
+    // We want to calculate px0 * v_kernel[0] + px1 * v_kernel[1] + ... ,
+    // but each multiply expands its output to 32 bits. So we interleave the
+    // rows in pairs and use _mm_madd_epi16(), which multiplies and then sums
+    // adjacent pairs, to produce the 32-bit sums we need directly.
+ __m128i sum0 = _mm_add_epi32(
+ _mm_madd_epi16(_mm_unpacklo_epi16(px0, px1), v_kernel_01),
+ _mm_madd_epi16(_mm_unpacklo_epi16(px2, px3), v_kernel_23));
+ __m128i sum1 = _mm_add_epi32(
+ _mm_madd_epi16(_mm_unpackhi_epi16(px0, px1), v_kernel_01),
+ _mm_madd_epi16(_mm_unpackhi_epi16(px2, px3), v_kernel_23));
+
+ __m128i sum0_rounded =
+ _mm_srai_epi32(_mm_add_epi32(sum0, round_const_v), round_bits);
+ __m128i sum1_rounded =
+ _mm_srai_epi32(_mm_add_epi32(sum1, round_const_v), round_bits);
+
+ __m128i warped = _mm_packs_epi32(sum0_rounded, sum1_rounded);
+ __m128i src_pixels_u8 =
+ _mm_loadl_epi64((__m128i *)&src[(y + i) * stride + x]);
+ __m128i src_pixels = _mm_slli_epi16(_mm_cvtepu8_epi16(src_pixels_u8), 3);
+
+ // Calculate delta from the target patch
+ __m128i dt = _mm_sub_epi16(warped, src_pixels);
+
+    // Load 8 elements each of dx and dy, to pair with the 8 elements of dt
+ // that we have just computed. Then compute 8 partial sums of dx * dt
+ // and dy * dt, implicitly sum to give 4 partial sums of each, and
+ // accumulate.
+ __m128i dx_row = _mm_loadu_si128((__m128i *)&dx[i * DISFLOW_PATCH_SIZE]);
+ __m128i dy_row = _mm_loadu_si128((__m128i *)&dy[i * DISFLOW_PATCH_SIZE]);
+ b0_acc = _mm_add_epi32(b0_acc, _mm_madd_epi16(dx_row, dt));
+ b1_acc = _mm_add_epi32(b1_acc, _mm_madd_epi16(dy_row, dt));
+
+#if CHECK_RESULTS
+ int16_t dt_arr[8];
+ memcpy(dt_arr, &dt, 8 * sizeof(*dt_arr));
+ for (int j = 0; j < DISFLOW_PATCH_SIZE; ++j) {
+ int16_t *p = &tmp[i * DISFLOW_PATCH_SIZE + j];
+ int arr[4] = { p[-DISFLOW_PATCH_SIZE], p[0], p[DISFLOW_PATCH_SIZE],
+ p[2 * DISFLOW_PATCH_SIZE] };
+ const int result = get_cubic_value_int(arr, v_kernel);
+
+ // Apply kernel and round.
+ // This time, we have to round off the 6 extra bits which were kept
+ // earlier, but we also want to keep DISFLOW_DERIV_SCALE_LOG2 extra bits
+ // of precision to match the scale of the dx and dy arrays.
+ const int c_warped = ROUND_POWER_OF_TWO(result, round_bits);
+ const int c_src_px = src[(x + j) + (y + i) * stride] << 3;
+ const int c_dt = c_warped - c_src_px;
+
+ assert(dt_arr[j] == c_dt);
+
+ c_result[0] += dx[i * DISFLOW_PATCH_SIZE + j] * c_dt;
+ c_result[1] += dy[i * DISFLOW_PATCH_SIZE + j] * c_dt;
+ }
+#endif // CHECK_RESULTS
+ }
+
+ // Flatten the two sets of partial sums to find the final value of b
+ // We need to set b[0] = sum(b0_acc), b[1] = sum(b1_acc).
+ // We need to do 6 additions in total; a `hadd` instruction can take care
+ // of four of them, leaving two scalar additions.
+ __m128i partial_sum = _mm_hadd_epi32(b0_acc, b1_acc);
+ b[0] = _mm_extract_epi32(partial_sum, 0) + _mm_extract_epi32(partial_sum, 1);
+ b[1] = _mm_extract_epi32(partial_sum, 2) + _mm_extract_epi32(partial_sum, 3);
+
+#if CHECK_RESULTS
+ assert(b[0] == c_result[0]);
+ assert(b[1] == c_result[1]);
+#endif // CHECK_RESULTS
+}
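+
+// Illustrative sketch (not part of the library): the SIMD routine above is
+// equivalent to the following scalar accumulation, assuming the warped
+// (bicubic-interpolated) reference patch has already been computed at the
+// same fixed-point scale. The source pixel is scaled by 8 (<< 3) to match
+// the derivative scale convention used above. All names here are local to
+// this sketch.
+static INLINE void compute_flow_vector_scalar_sketch(
+    const int16_t *warped, const uint8_t *src, int stride, int x, int y,
+    const int16_t *dx, const int16_t *dy, int *b) {
+  b[0] = 0;
+  b[1] = 0;
+  for (int i = 0; i < DISFLOW_PATCH_SIZE; ++i) {
+    for (int j = 0; j < DISFLOW_PATCH_SIZE; ++j) {
+      // dt = warped reference pixel - (source pixel << 3)
+      const int dt = warped[i * DISFLOW_PATCH_SIZE + j] -
+                     (src[(y + i) * stride + (x + j)] << 3);
+      b[0] += dx[i * DISFLOW_PATCH_SIZE + j] * dt;
+      b[1] += dy[i * DISFLOW_PATCH_SIZE + j] * dt;
+    }
+  }
+}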
+
+static INLINE void sobel_filter_x(const uint8_t *src, int src_stride,
+ int16_t *dst, int dst_stride) {
+ int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)];
+ int16_t *tmp = tmp_ + DISFLOW_PATCH_SIZE;
+#if CHECK_RESULTS
+ const int taps = 3;
+#endif // CHECK_RESULTS
+
+ // Horizontal filter
+ // As the kernel is simply {1, 0, -1}, we implement this as simply
+ // out[x] = image[x-1] - image[x+1]
+ // rather than doing a "proper" convolution operation
+ for (int y = -1; y < DISFLOW_PATCH_SIZE + 1; ++y) {
+ const uint8_t *src_row = src + y * src_stride;
+ int16_t *tmp_row = tmp + y * DISFLOW_PATCH_SIZE;
+
+ // Load pixels and expand to 16 bits
+ __m128i row = _mm_loadu_si128((__m128i *)(src_row - 1));
+ __m128i px0 = _mm_cvtepu8_epi16(row);
+ __m128i px2 = _mm_cvtepu8_epi16(_mm_srli_si128(row, 2));
+
+ __m128i out = _mm_sub_epi16(px0, px2);
+
+ // Store to intermediate array
+ _mm_storeu_si128((__m128i *)tmp_row, out);
+
+#if CHECK_RESULTS
+ // Cross-check
+ static const int16_t h_kernel[3] = { 1, 0, -1 };
+ for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) {
+ int sum = 0;
+ for (int k = 0; k < taps; ++k) {
+ sum += h_kernel[k] * src_row[x + k - 1];
+ }
+ (void)sum;
+ assert(tmp_row[x] == sum);
+ }
+#endif // CHECK_RESULTS
+ }
+
+ // Vertical filter
+ // Here the kernel is {1, 2, 1}, which can be implemented
+ // with simple sums rather than multiplies and adds.
+ // In order to minimize dependency chains, we evaluate in the order
+ // (image[y - 1] + image[y + 1]) + (image[y] << 1)
+ // This way, the first addition and the shift can happen in parallel
+ for (int y = 0; y < DISFLOW_PATCH_SIZE; ++y) {
+ const int16_t *tmp_row = tmp + y * DISFLOW_PATCH_SIZE;
+ int16_t *dst_row = dst + y * dst_stride;
+
+ __m128i px0 = _mm_loadu_si128((__m128i *)(tmp_row - DISFLOW_PATCH_SIZE));
+ __m128i px1 = _mm_loadu_si128((__m128i *)tmp_row);
+ __m128i px2 = _mm_loadu_si128((__m128i *)(tmp_row + DISFLOW_PATCH_SIZE));
+
+ __m128i out =
+ _mm_add_epi16(_mm_add_epi16(px0, px2), _mm_slli_epi16(px1, 1));
+
+ _mm_storeu_si128((__m128i *)dst_row, out);
+
+#if CHECK_RESULTS
+ static const int16_t v_kernel[3] = { 1, 2, 1 };
+ for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) {
+ int sum = 0;
+ for (int k = 0; k < taps; ++k) {
+ sum += v_kernel[k] * tmp[(y + k - 1) * DISFLOW_PATCH_SIZE + x];
+ }
+ (void)sum;
+ assert(dst_row[x] == sum);
+ }
+#endif // CHECK_RESULTS
+ }
+}
+
+static INLINE void sobel_filter_y(const uint8_t *src, int src_stride,
+ int16_t *dst, int dst_stride) {
+ int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)];
+ int16_t *tmp = tmp_ + DISFLOW_PATCH_SIZE;
+#if CHECK_RESULTS
+ const int taps = 3;
+#endif // CHECK_RESULTS
+
+ // Horizontal filter
+ // Here the kernel is {1, 2, 1}, which can be implemented
+ // with simple sums rather than multiplies and adds.
+ // In order to minimize dependency chains, we evaluate in the order
+  // (image[x - 1] + image[x + 1]) + (image[x] << 1)
+ // This way, the first addition and the shift can happen in parallel
+ for (int y = -1; y < DISFLOW_PATCH_SIZE + 1; ++y) {
+ const uint8_t *src_row = src + y * src_stride;
+ int16_t *tmp_row = tmp + y * DISFLOW_PATCH_SIZE;
+
+ // Load pixels and expand to 16 bits
+ __m128i row = _mm_loadu_si128((__m128i *)(src_row - 1));
+ __m128i px0 = _mm_cvtepu8_epi16(row);
+ __m128i px1 = _mm_cvtepu8_epi16(_mm_srli_si128(row, 1));
+ __m128i px2 = _mm_cvtepu8_epi16(_mm_srli_si128(row, 2));
+
+ __m128i out =
+ _mm_add_epi16(_mm_add_epi16(px0, px2), _mm_slli_epi16(px1, 1));
+
+ // Store to intermediate array
+ _mm_storeu_si128((__m128i *)tmp_row, out);
+
+#if CHECK_RESULTS
+ // Cross-check
+ static const int16_t h_kernel[3] = { 1, 2, 1 };
+ for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) {
+ int sum = 0;
+ for (int k = 0; k < taps; ++k) {
+ sum += h_kernel[k] * src_row[x + k - 1];
+ }
+ (void)sum;
+ assert(tmp_row[x] == sum);
+ }
+#endif // CHECK_RESULTS
+ }
+
+ // Vertical filter
+ // As the kernel is simply {1, 0, -1}, we implement this as simply
+  // out[y] = image[y - 1] - image[y + 1]
+ // rather than doing a "proper" convolution operation
+ for (int y = 0; y < DISFLOW_PATCH_SIZE; ++y) {
+ const int16_t *tmp_row = tmp + y * DISFLOW_PATCH_SIZE;
+ int16_t *dst_row = dst + y * dst_stride;
+
+ __m128i px0 = _mm_loadu_si128((__m128i *)(tmp_row - DISFLOW_PATCH_SIZE));
+ __m128i px2 = _mm_loadu_si128((__m128i *)(tmp_row + DISFLOW_PATCH_SIZE));
+
+ __m128i out = _mm_sub_epi16(px0, px2);
+
+ _mm_storeu_si128((__m128i *)dst_row, out);
+
+#if CHECK_RESULTS
+ static const int16_t v_kernel[3] = { 1, 0, -1 };
+ for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) {
+ int sum = 0;
+ for (int k = 0; k < taps; ++k) {
+ sum += v_kernel[k] * tmp[(y + k - 1) * DISFLOW_PATCH_SIZE + x];
+ }
+ (void)sum;
+ assert(dst_row[x] == sum);
+ }
+#endif // CHECK_RESULTS
+ }
+}
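+
+// Illustrative sketch (not part of the library): both filters above are
+// separable 3x3 Sobel filters. sobel_filter_x() applies {1, 0, -1}
+// horizontally then {1, 2, 1} vertically; sobel_filter_y() swaps the two 1-D
+// kernels. A direct (non-separable) scalar equivalent for one output pixel,
+// with dir = 0 for the x gradient and dir = 1 for the y gradient:
+static INLINE int sobel_single_pixel_sketch(const uint8_t *src, int stride,
+                                            int dir) {
+  static const int kx[3][3] = { { 1, 0, -1 }, { 2, 0, -2 }, { 1, 0, -1 } };
+  static const int ky[3][3] = { { 1, 2, 1 }, { 0, 0, 0 }, { -1, -2, -1 } };
+  int sum = 0;
+  for (int r = -1; r <= 1; ++r) {
+    for (int c = -1; c <= 1; ++c) {
+      const int k = (dir == 0) ? kx[r + 1][c + 1] : ky[r + 1][c + 1];
+      sum += k * src[r * stride + c];
+    }
+  }
+  return sum;
+}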
+
+static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride,
+ const int16_t *dy, int dy_stride,
+ double *M) {
+ __m128i acc[4] = { 0 };
+
+ for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) {
+ __m128i dx_row = _mm_loadu_si128((__m128i *)&dx[i * dx_stride]);
+ __m128i dy_row = _mm_loadu_si128((__m128i *)&dy[i * dy_stride]);
+
+ acc[0] = _mm_add_epi32(acc[0], _mm_madd_epi16(dx_row, dx_row));
+ acc[1] = _mm_add_epi32(acc[1], _mm_madd_epi16(dx_row, dy_row));
+ // Don't compute acc[2], as it should be equal to acc[1]
+ acc[3] = _mm_add_epi32(acc[3], _mm_madd_epi16(dy_row, dy_row));
+ }
+
+ // Condense sums
+ __m128i partial_sum_0 = _mm_hadd_epi32(acc[0], acc[1]);
+ __m128i partial_sum_1 = _mm_hadd_epi32(acc[1], acc[3]);
+ __m128i result = _mm_hadd_epi32(partial_sum_0, partial_sum_1);
+
+ // Apply regularization
+ // We follow the standard regularization method of adding `k * I` before
+ // inverting. This ensures that the matrix will be invertible.
+ //
+ // Setting the regularization strength k to 1 seems to work well here, as
+ // typical values coming from the other equations are very large (1e5 to
+ // 1e6, with an upper limit of around 6e7, at the time of writing).
+ // It also preserves the property that all matrix values are whole numbers,
+ // which is convenient for integerized SIMD implementation.
+ result = _mm_add_epi32(result, _mm_set_epi32(1, 0, 0, 1));
+
+#if CHECK_RESULTS
+ int tmp[4] = { 0 };
+
+ for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) {
+ for (int j = 0; j < DISFLOW_PATCH_SIZE; j++) {
+ tmp[0] += dx[i * dx_stride + j] * dx[i * dx_stride + j];
+ tmp[1] += dx[i * dx_stride + j] * dy[i * dy_stride + j];
+ // Don't compute tmp[2], as it should be equal to tmp[1]
+ tmp[3] += dy[i * dy_stride + j] * dy[i * dy_stride + j];
+ }
+ }
+
+ // Apply regularization
+ tmp[0] += 1;
+ tmp[3] += 1;
+
+ tmp[2] = tmp[1];
+
+ assert(tmp[0] == _mm_extract_epi32(result, 0));
+ assert(tmp[1] == _mm_extract_epi32(result, 1));
+ assert(tmp[2] == _mm_extract_epi32(result, 2));
+ assert(tmp[3] == _mm_extract_epi32(result, 3));
+#endif // CHECK_RESULTS
+
+ // Convert results to doubles and store
+ _mm_storeu_pd(M, _mm_cvtepi32_pd(result));
+ _mm_storeu_pd(M + 2, _mm_cvtepi32_pd(_mm_srli_si128(result, 8)));
+}
+
+// Try to invert the matrix M
+// Note: Due to the nature of how a least-squares matrix is constructed, all of
+// the eigenvalues will be >= 0, and therefore det M >= 0 as well.
+// The regularization term `+ k * I` further ensures that det M >= k^2.
+// As mentioned in compute_flow_matrix(), here we use k = 1, so det M >= 1.
+// So we don't have to worry about non-invertible matrices here.
+static INLINE void invert_2x2(const double *M, double *M_inv) {
+ double det = (M[0] * M[3]) - (M[1] * M[2]);
+ assert(det >= 1);
+ const double det_inv = 1 / det;
+
+ M_inv[0] = M[3] * det_inv;
+ M_inv[1] = -M[1] * det_inv;
+ M_inv[2] = -M[2] * det_inv;
+ M_inv[3] = M[0] * det_inv;
+}
+
+void aom_compute_flow_at_point_sse4_1(const uint8_t *src, const uint8_t *ref,
+ int x, int y, int width, int height,
+ int stride, double *u, double *v) {
+ double M[4];
+ double M_inv[4];
+ int b[2];
+ int16_t dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
+ int16_t dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE];
+
+ // Compute gradients within this patch
+ const uint8_t *src_patch = &src[y * stride + x];
+ sobel_filter_x(src_patch, stride, dx, DISFLOW_PATCH_SIZE);
+ sobel_filter_y(src_patch, stride, dy, DISFLOW_PATCH_SIZE);
+
+ compute_flow_matrix(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, M);
+ invert_2x2(M, M_inv);
+
+ for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) {
+ compute_flow_vector(src, ref, width, height, stride, x, y, *u, *v, dx, dy,
+ b);
+
+ // Solve flow equations to find a better estimate for the flow vector
+ // at this point
+ const double step_u = M_inv[0] * b[0] + M_inv[1] * b[1];
+ const double step_v = M_inv[2] * b[0] + M_inv[3] * b[1];
+ *u += fclamp(step_u * DISFLOW_STEP_SIZE, -2, 2);
+ *v += fclamp(step_v * DISFLOW_STEP_SIZE, -2, 2);
+
+ if (fabs(step_u) + fabs(step_v) < DISFLOW_STEP_SIZE_THRESOLD) {
+ // Stop iteration when we're close to convergence
+ break;
+ }
+ }
+}
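+
+// Illustrative usage sketch (not part of the library): refine an initial flow
+// estimate (u, v) for the 8x8 patch whose top-left corner is at (x, y). The
+// patch position and initial estimate below are placeholders.
+static INLINE void compute_flow_usage_sketch(const uint8_t *src,
+                                             const uint8_t *ref, int width,
+                                             int height, int stride) {
+  double u = 0.0, v = 0.0;   // e.g. seeded from a coarser pyramid level
+  const int x = 64, y = 64;  // patch position (placeholder)
+  aom_compute_flow_at_point_sse4_1(src, ref, x, y, width, height, stride, &u,
+                                   &v);
+  // On return, (u, v) holds the refined flow vector for this patch.
+  (void)u;
+  (void)v;
+}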
diff --git a/third_party/aom/aom_dsp/fwd_txfm.c b/third_party/aom/aom_dsp/fwd_txfm.c
new file mode 100644
index 0000000000..5503501d62
--- /dev/null
+++ b/third_party/aom/aom_dsp/fwd_txfm.c
@@ -0,0 +1,245 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include "aom_dsp/txfm_common.h"
+#include "config/aom_dsp_rtcd.h"
+
+void aom_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+ // The 2D transform is done with two passes which are actually pretty
+ // similar. In the first one, we transform the columns and transpose
+ // the results. In the second one, we transform the rows.
+ // We need an intermediate buffer between passes.
+ tran_low_t intermediate[4 * 4];
+ const tran_low_t *in_low = NULL;
+ tran_low_t *out = intermediate;
+ // Do the two transform passes
+ for (int pass = 0; pass < 2; ++pass) {
+ tran_high_t in_high[4]; // canbe16
+ tran_high_t step[4]; // canbe16
+ tran_low_t temp[4];
+ for (int i = 0; i < 4; ++i) {
+ // Load inputs.
+ if (pass == 0) {
+ in_high[0] = input[0 * stride] * 16;
+ in_high[1] = input[1 * stride] * 16;
+ in_high[2] = input[2 * stride] * 16;
+ in_high[3] = input[3 * stride] * 16;
+ if (i == 0 && in_high[0]) {
+ ++in_high[0];
+ }
+ ++input; // Next column
+ } else {
+ assert(in_low != NULL);
+ in_high[0] = in_low[0 * 4];
+ in_high[1] = in_low[1 * 4];
+ in_high[2] = in_low[2 * 4];
+ in_high[3] = in_low[3 * 4];
+ ++in_low; // Next column (which is a transposed row)
+ }
+ // Transform.
+ step[0] = in_high[0] + in_high[3];
+ step[1] = in_high[1] + in_high[2];
+ step[2] = in_high[1] - in_high[2];
+ step[3] = in_high[0] - in_high[3];
+ temp[0] = (tran_low_t)fdct_round_shift((step[0] + step[1]) * cospi_16_64);
+ temp[2] = (tran_low_t)fdct_round_shift((step[0] - step[1]) * cospi_16_64);
+ temp[1] = (tran_low_t)fdct_round_shift(step[2] * cospi_24_64 +
+ step[3] * cospi_8_64);
+ temp[3] = (tran_low_t)fdct_round_shift(-step[2] * cospi_8_64 +
+ step[3] * cospi_24_64);
+ // Only transpose the first pass.
+ if (pass == 0) {
+ out[0] = temp[0];
+ out[1] = temp[1];
+ out[2] = temp[2];
+ out[3] = temp[3];
+ out += 4;
+ } else {
+ out[0 * 4] = temp[0];
+ out[1 * 4] = temp[1];
+ out[2 * 4] = temp[2];
+ out[3 * 4] = temp[3];
+ ++out;
+ }
+ }
+ // Setup in/out for next pass.
+ in_low = intermediate;
+ out = output;
+ }
+
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 4; ++j)
+ output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
+ }
+}
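+
+// Illustrative usage sketch (not part of the library): forward-transform one
+// 4x4 block of residuals. The buffer and stride below are placeholders.
+static INLINE void fdct4x4_usage_sketch(const int16_t *residual, int stride) {
+  tran_low_t coeffs[4 * 4];
+  // Columns are transformed and transposed in pass 1, rows in pass 2,
+  // as described in the comment at the top of aom_fdct4x4_c().
+  aom_fdct4x4_c(residual, coeffs, stride);
+  (void)coeffs;
+}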
+
+void aom_fdct4x4_lp_c(const int16_t *input, int16_t *output, int stride) {
+ // The 2D transform is done with two passes which are actually pretty
+ // similar. In the first one, we transform the columns and transpose
+ // the results. In the second one, we transform the rows.
+ // We need an intermediate buffer between passes.
+ int16_t intermediate[4 * 4];
+ const int16_t *in_low = NULL;
+ int16_t *out = intermediate;
+ // Do the two transform passes
+ for (int pass = 0; pass < 2; ++pass) {
+ int32_t in_high[4]; // canbe16
+ int32_t step[4]; // canbe16
+ int16_t temp[4];
+ for (int i = 0; i < 4; ++i) {
+ // Load inputs.
+ if (pass == 0) {
+ in_high[0] = input[0 * stride] * 16;
+ in_high[1] = input[1 * stride] * 16;
+ in_high[2] = input[2 * stride] * 16;
+ in_high[3] = input[3 * stride] * 16;
+ ++input;
+ if (i == 0 && in_high[0]) {
+ ++in_high[0];
+ }
+ } else {
+ assert(in_low != NULL);
+ in_high[0] = in_low[0 * 4];
+ in_high[1] = in_low[1 * 4];
+ in_high[2] = in_low[2 * 4];
+ in_high[3] = in_low[3 * 4];
+ ++in_low;
+ }
+ // Transform.
+ step[0] = in_high[0] + in_high[3];
+ step[1] = in_high[1] + in_high[2];
+ step[2] = in_high[1] - in_high[2];
+ step[3] = in_high[0] - in_high[3];
+ temp[0] = (int16_t)fdct_round_shift((step[0] + step[1]) * cospi_16_64);
+ temp[2] = (int16_t)fdct_round_shift((step[0] - step[1]) * cospi_16_64);
+ temp[1] = (int16_t)fdct_round_shift(step[2] * cospi_24_64 +
+ step[3] * cospi_8_64);
+ temp[3] = (int16_t)fdct_round_shift(-step[2] * cospi_8_64 +
+ step[3] * cospi_24_64);
+ // Only transpose the first pass.
+ if (pass == 0) {
+ out[0] = temp[0];
+ out[1] = temp[1];
+ out[2] = temp[2];
+ out[3] = temp[3];
+ out += 4;
+ } else {
+ out[0 * 4] = temp[0];
+ out[1 * 4] = temp[1];
+ out[2 * 4] = temp[2];
+ out[3 * 4] = temp[3];
+ ++out;
+ }
+ }
+ // Setup in/out for next pass.
+ in_low = intermediate;
+ out = output;
+ }
+
+ for (int i = 0; i < 4; ++i) {
+ for (int j = 0; j < 4; ++j)
+ output[j + i * 4] = (output[j + i * 4] + 1) >> 2;
+ }
+}
+
+#if CONFIG_INTERNAL_STATS
+void aom_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
+ int i, j;
+ tran_low_t intermediate[64];
+ int pass;
+ tran_low_t *output = intermediate;
+ const tran_low_t *in = NULL;
+
+ // Transform columns
+ for (pass = 0; pass < 2; ++pass) {
+ tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
+ tran_high_t t0, t1, t2, t3; // needs32
+ tran_high_t x0, x1, x2, x3; // canbe16
+
+ for (i = 0; i < 8; i++) {
+ // stage 1
+ if (pass == 0) {
+ s0 = (input[0 * stride] + input[7 * stride]) * 4;
+ s1 = (input[1 * stride] + input[6 * stride]) * 4;
+ s2 = (input[2 * stride] + input[5 * stride]) * 4;
+ s3 = (input[3 * stride] + input[4 * stride]) * 4;
+ s4 = (input[3 * stride] - input[4 * stride]) * 4;
+ s5 = (input[2 * stride] - input[5 * stride]) * 4;
+ s6 = (input[1 * stride] - input[6 * stride]) * 4;
+ s7 = (input[0 * stride] - input[7 * stride]) * 4;
+ ++input;
+ } else {
+ s0 = in[0 * 8] + in[7 * 8];
+ s1 = in[1 * 8] + in[6 * 8];
+ s2 = in[2 * 8] + in[5 * 8];
+ s3 = in[3 * 8] + in[4 * 8];
+ s4 = in[3 * 8] - in[4 * 8];
+ s5 = in[2 * 8] - in[5 * 8];
+ s6 = in[1 * 8] - in[6 * 8];
+ s7 = in[0 * 8] - in[7 * 8];
+ ++in;
+ }
+
+ // fdct4(step, step);
+ x0 = s0 + s3;
+ x1 = s1 + s2;
+ x2 = s1 - s2;
+ x3 = s0 - s3;
+ t0 = (x0 + x1) * cospi_16_64;
+ t1 = (x0 - x1) * cospi_16_64;
+ t2 = x2 * cospi_24_64 + x3 * cospi_8_64;
+ t3 = -x2 * cospi_8_64 + x3 * cospi_24_64;
+ output[0] = (tran_low_t)fdct_round_shift(t0);
+ output[2] = (tran_low_t)fdct_round_shift(t2);
+ output[4] = (tran_low_t)fdct_round_shift(t1);
+ output[6] = (tran_low_t)fdct_round_shift(t3);
+
+ // Stage 2
+ t0 = (s6 - s5) * cospi_16_64;
+ t1 = (s6 + s5) * cospi_16_64;
+ t2 = fdct_round_shift(t0);
+ t3 = fdct_round_shift(t1);
+
+ // Stage 3
+ x0 = s4 + t2;
+ x1 = s4 - t2;
+ x2 = s7 - t3;
+ x3 = s7 + t3;
+
+ // Stage 4
+ t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
+ t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
+ t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
+ t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
+ output[1] = (tran_low_t)fdct_round_shift(t0);
+ output[3] = (tran_low_t)fdct_round_shift(t2);
+ output[5] = (tran_low_t)fdct_round_shift(t1);
+ output[7] = (tran_low_t)fdct_round_shift(t3);
+ output += 8;
+ }
+ in = intermediate;
+ output = final_output;
+ }
+
+ // Rows
+ for (i = 0; i < 8; ++i) {
+ for (j = 0; j < 8; ++j) final_output[j + i * 8] /= 2;
+ }
+}
+#endif // CONFIG_INTERNAL_STATS
+
+#if CONFIG_AV1_HIGHBITDEPTH && CONFIG_INTERNAL_STATS
+void aom_highbd_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
+ int stride) {
+ aom_fdct8x8_c(input, final_output, stride);
+}
+#endif
diff --git a/third_party/aom/aom_dsp/grain_params.h b/third_party/aom/aom_dsp/grain_params.h
new file mode 100644
index 0000000000..5a28afc2a1
--- /dev/null
+++ b/third_party/aom/aom_dsp/grain_params.h
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief Describes film grain parameters
+ *
+ */
+#ifndef AOM_AOM_DSP_GRAIN_PARAMS_H_
+#define AOM_AOM_DSP_GRAIN_PARAMS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdint.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+
+/*!\brief Structure containing film grain synthesis parameters for a frame
+ *
+ * This structure contains input parameters for film grain synthesis
+ */
+typedef struct {
+ // This structure is compared element-by-element in the function
+ // aom_check_grain_params_equiv: this function must be updated if any changes
+ // are made to this structure.
+ int apply_grain;
+
+ int update_parameters;
+
+ // 8 bit values
+ int scaling_points_y[14][2];
+ int num_y_points; // value: 0..14
+
+ // 8 bit values
+ int scaling_points_cb[10][2];
+ int num_cb_points; // value: 0..10
+
+ // 8 bit values
+ int scaling_points_cr[10][2];
+ int num_cr_points; // value: 0..10
+
+ int scaling_shift; // values : 8..11
+
+ int ar_coeff_lag; // values: 0..3
+
+ // 8 bit values
+ int ar_coeffs_y[24];
+ int ar_coeffs_cb[25];
+ int ar_coeffs_cr[25];
+
+ // Shift value: AR coeffs range
+ // 6: [-2, 2)
+ // 7: [-1, 1)
+ // 8: [-0.5, 0.5)
+ // 9: [-0.25, 0.25)
+ int ar_coeff_shift; // values : 6..9
+
+ int cb_mult; // 8 bits
+ int cb_luma_mult; // 8 bits
+ int cb_offset; // 9 bits
+
+ int cr_mult; // 8 bits
+ int cr_luma_mult; // 8 bits
+ int cr_offset; // 9 bits
+
+ int overlap_flag;
+
+ int clip_to_restricted_range;
+
+ unsigned int bit_depth; // video bit depth
+
+ int chroma_scaling_from_luma;
+
+ int grain_scale_shift;
+
+ uint16_t random_seed;
+ // This structure is compared element-by-element in the function
+ // aom_check_grain_params_equiv: this function must be updated if any changes
+ // are made to this structure.
+} aom_film_grain_t;
+
+/*!\brief Check if two film grain parameters structs are equivalent
+ *
+ * Check if two film grain parameters are equal, except for the
+ * update_parameters and random_seed elements which are ignored.
+ *
+ * \param[in] pa The first set of parameters to compare
+ * \param[in] pb The second set of parameters to compare
+ * \return Returns 1 if the params are equivalent, 0 otherwise
+ */
+static INLINE int aom_check_grain_params_equiv(
+ const aom_film_grain_t *const pa, const aom_film_grain_t *const pb) {
+ if (pa->apply_grain != pb->apply_grain) return 0;
+ // Don't compare update_parameters
+
+ if (pa->num_y_points != pb->num_y_points) return 0;
+ if (memcmp(pa->scaling_points_y, pb->scaling_points_y,
+ pa->num_y_points * 2 * sizeof(*pa->scaling_points_y)) != 0)
+ return 0;
+
+ if (pa->num_cb_points != pb->num_cb_points) return 0;
+ if (memcmp(pa->scaling_points_cb, pb->scaling_points_cb,
+ pa->num_cb_points * 2 * sizeof(*pa->scaling_points_cb)) != 0)
+ return 0;
+
+ if (pa->num_cr_points != pb->num_cr_points) return 0;
+ if (memcmp(pa->scaling_points_cr, pb->scaling_points_cr,
+ pa->num_cr_points * 2 * sizeof(*pa->scaling_points_cr)) != 0)
+ return 0;
+
+ if (pa->scaling_shift != pb->scaling_shift) return 0;
+ if (pa->ar_coeff_lag != pb->ar_coeff_lag) return 0;
+
+ const int num_pos = 2 * pa->ar_coeff_lag * (pa->ar_coeff_lag + 1);
+ if (memcmp(pa->ar_coeffs_y, pb->ar_coeffs_y,
+ num_pos * sizeof(*pa->ar_coeffs_y)) != 0)
+ return 0;
+ if (memcmp(pa->ar_coeffs_cb, pb->ar_coeffs_cb,
+ num_pos * sizeof(*pa->ar_coeffs_cb)) != 0)
+ return 0;
+ if (memcmp(pa->ar_coeffs_cr, pb->ar_coeffs_cr,
+ num_pos * sizeof(*pa->ar_coeffs_cr)) != 0)
+ return 0;
+
+ if (pa->ar_coeff_shift != pb->ar_coeff_shift) return 0;
+
+ if (pa->cb_mult != pb->cb_mult) return 0;
+ if (pa->cb_luma_mult != pb->cb_luma_mult) return 0;
+ if (pa->cb_offset != pb->cb_offset) return 0;
+
+ if (pa->cr_mult != pb->cr_mult) return 0;
+ if (pa->cr_luma_mult != pb->cr_luma_mult) return 0;
+ if (pa->cr_offset != pb->cr_offset) return 0;
+
+ if (pa->overlap_flag != pb->overlap_flag) return 0;
+ if (pa->clip_to_restricted_range != pb->clip_to_restricted_range) return 0;
+ if (pa->bit_depth != pb->bit_depth) return 0;
+ if (pa->chroma_scaling_from_luma != pb->chroma_scaling_from_luma) return 0;
+ if (pa->grain_scale_shift != pb->grain_scale_shift) return 0;
+
+ return 1;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_GRAIN_PARAMS_H_
diff --git a/third_party/aom/aom_dsp/grain_table.c b/third_party/aom/aom_dsp/grain_table.c
new file mode 100644
index 0000000000..3505f9f2c8
--- /dev/null
+++ b/third_party/aom/aom_dsp/grain_table.c
@@ -0,0 +1,358 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief This file has the implementation details of the grain table.
+ *
+ * The file format is an ascii representation for readability and
+ * editability. Array parameters are separated from the non-array
+ * parameters and prefixed with a few characters to make for easy
+ * localization with a parameter set. Each entry is prefixed with "E"
+ * and the other parameters are only specified if "update-parms" is
+ * non-zero.
+ *
+ * filmgrn1
+ * E <start-time> <end-time> <apply-grain> <random-seed> <update-parms>
+ * p <ar_coeff_lag> <ar_coeff_shift> <grain_scale_shift> ...
+ * sY <num_y_points> <point_0_x> <point_0_y> ...
+ * sCb <num_cb_points> <point_0_x> <point_0_y> ...
+ * sCr <num_cr_points> <point_0_x> <point_0_y> ...
+ * cY <ar_coeff_y_0> ....
+ * cCb <ar_coeff_cb_0> ....
+ * cCr <ar_coeff_cr_0> ....
+ * E <start-time> ...
+ */
+#include <string.h>
+#include <stdio.h>
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/grain_table.h"
+#include "aom_mem/aom_mem.h"
+
+static const char kFileMagic[8] = "filmgrn1";
+
+static void grain_table_entry_read(FILE *file,
+ struct aom_internal_error_info *error_info,
+ aom_film_grain_table_entry_t *entry) {
+ aom_film_grain_t *pars = &entry->params;
+ int num_read =
+ fscanf(file, "E %" PRId64 " %" PRId64 " %d %hd %d\n", &entry->start_time,
+ &entry->end_time, &pars->apply_grain, &pars->random_seed,
+ &pars->update_parameters);
+ if (num_read == 0 && feof(file)) return;
+ if (num_read != 5) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR,
+ "Unable to read entry header. Read %d != 5", num_read);
+ return;
+ }
+ if (pars->update_parameters) {
+ num_read = fscanf(file, "p %d %d %d %d %d %d %d %d %d %d %d %d\n",
+ &pars->ar_coeff_lag, &pars->ar_coeff_shift,
+ &pars->grain_scale_shift, &pars->scaling_shift,
+ &pars->chroma_scaling_from_luma, &pars->overlap_flag,
+ &pars->cb_mult, &pars->cb_luma_mult, &pars->cb_offset,
+ &pars->cr_mult, &pars->cr_luma_mult, &pars->cr_offset);
+ if (num_read != 12) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR,
+ "Unable to read entry params. Read %d != 12",
+ num_read);
+ return;
+ }
+ if (!fscanf(file, "\tsY %d ", &pars->num_y_points)) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR,
+ "Unable to read num y points");
+ return;
+ }
+ for (int i = 0; i < pars->num_y_points; ++i) {
+ if (2 != fscanf(file, "%d %d", &pars->scaling_points_y[i][0],
+ &pars->scaling_points_y[i][1])) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR,
+ "Unable to read y scaling points");
+ return;
+ }
+ }
+ if (!fscanf(file, "\n\tsCb %d", &pars->num_cb_points)) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR,
+ "Unable to read num cb points");
+ return;
+ }
+ for (int i = 0; i < pars->num_cb_points; ++i) {
+ if (2 != fscanf(file, "%d %d", &pars->scaling_points_cb[i][0],
+ &pars->scaling_points_cb[i][1])) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR,
+ "Unable to read cb scaling points");
+ return;
+ }
+ }
+ if (!fscanf(file, "\n\tsCr %d", &pars->num_cr_points)) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR,
+ "Unable to read num cr points");
+ return;
+ }
+ for (int i = 0; i < pars->num_cr_points; ++i) {
+ if (2 != fscanf(file, "%d %d", &pars->scaling_points_cr[i][0],
+ &pars->scaling_points_cr[i][1])) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR,
+ "Unable to read cr scaling points");
+ return;
+ }
+ }
+
+ if (fscanf(file, "\n\tcY")) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR,
+ "Unable to read Y coeffs header (cY)");
+ return;
+ }
+ const int n = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1);
+ for (int i = 0; i < n; ++i) {
+ if (1 != fscanf(file, "%d", &pars->ar_coeffs_y[i])) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR,
+ "Unable to read Y coeffs");
+ return;
+ }
+ }
+ if (fscanf(file, "\n\tcCb")) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR,
+ "Unable to read Cb coeffs header (cCb)");
+ return;
+ }
+ for (int i = 0; i <= n; ++i) {
+ if (1 != fscanf(file, "%d", &pars->ar_coeffs_cb[i])) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR,
+ "Unable to read Cb coeffs");
+ return;
+ }
+ }
+ if (fscanf(file, "\n\tcCr")) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR,
+                         "Unable to read Cr coeffs header (cCr)");
+ return;
+ }
+ for (int i = 0; i <= n; ++i) {
+ if (1 != fscanf(file, "%d", &pars->ar_coeffs_cr[i])) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR,
+ "Unable to read Cr coeffs");
+ return;
+ }
+ }
+ (void)fscanf(file, "\n");
+ }
+}
+
+static void grain_table_entry_write(FILE *file,
+ aom_film_grain_table_entry_t *entry) {
+ const aom_film_grain_t *pars = &entry->params;
+ fprintf(file, "E %" PRId64 " %" PRId64 " %d %d %d\n", entry->start_time,
+ entry->end_time, pars->apply_grain, pars->random_seed,
+ pars->update_parameters);
+ if (pars->update_parameters) {
+ fprintf(file, "\tp %d %d %d %d %d %d %d %d %d %d %d %d\n",
+ pars->ar_coeff_lag, pars->ar_coeff_shift, pars->grain_scale_shift,
+ pars->scaling_shift, pars->chroma_scaling_from_luma,
+ pars->overlap_flag, pars->cb_mult, pars->cb_luma_mult,
+ pars->cb_offset, pars->cr_mult, pars->cr_luma_mult,
+ pars->cr_offset);
+ fprintf(file, "\tsY %d ", pars->num_y_points);
+ for (int i = 0; i < pars->num_y_points; ++i) {
+ fprintf(file, " %d %d", pars->scaling_points_y[i][0],
+ pars->scaling_points_y[i][1]);
+ }
+ fprintf(file, "\n\tsCb %d", pars->num_cb_points);
+ for (int i = 0; i < pars->num_cb_points; ++i) {
+ fprintf(file, " %d %d", pars->scaling_points_cb[i][0],
+ pars->scaling_points_cb[i][1]);
+ }
+ fprintf(file, "\n\tsCr %d", pars->num_cr_points);
+ for (int i = 0; i < pars->num_cr_points; ++i) {
+ fprintf(file, " %d %d", pars->scaling_points_cr[i][0],
+ pars->scaling_points_cr[i][1]);
+ }
+ fprintf(file, "\n\tcY");
+ const int n = 2 * pars->ar_coeff_lag * (pars->ar_coeff_lag + 1);
+ for (int i = 0; i < n; ++i) {
+ fprintf(file, " %d", pars->ar_coeffs_y[i]);
+ }
+ fprintf(file, "\n\tcCb");
+ for (int i = 0; i <= n; ++i) {
+ fprintf(file, " %d", pars->ar_coeffs_cb[i]);
+ }
+ fprintf(file, "\n\tcCr");
+ for (int i = 0; i <= n; ++i) {
+ fprintf(file, " %d", pars->ar_coeffs_cr[i]);
+ }
+ fprintf(file, "\n");
+ }
+}
+
+// TODO(https://crbug.com/aomedia/3228): Update this function to return an
+// integer status.
+void aom_film_grain_table_append(aom_film_grain_table_t *t, int64_t time_stamp,
+ int64_t end_time,
+ const aom_film_grain_t *grain) {
+ if (!t->tail || memcmp(grain, &t->tail->params, sizeof(*grain))) {
+ aom_film_grain_table_entry_t *new_tail = aom_malloc(sizeof(*new_tail));
+ if (!new_tail) return;
+ memset(new_tail, 0, sizeof(*new_tail));
+ if (t->tail) t->tail->next = new_tail;
+ if (!t->head) t->head = new_tail;
+ t->tail = new_tail;
+
+ new_tail->start_time = time_stamp;
+ new_tail->end_time = end_time;
+ new_tail->params = *grain;
+ } else {
+ t->tail->end_time = AOMMAX(t->tail->end_time, end_time);
+ t->tail->start_time = AOMMIN(t->tail->start_time, time_stamp);
+ }
+}
+
+int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp,
+ int64_t end_time, int erase,
+ aom_film_grain_t *grain) {
+ aom_film_grain_table_entry_t *entry = t->head;
+ aom_film_grain_table_entry_t *prev_entry = NULL;
+ uint16_t random_seed = grain ? grain->random_seed : 0;
+ if (grain) memset(grain, 0, sizeof(*grain));
+
+ while (entry) {
+ aom_film_grain_table_entry_t *next = entry->next;
+ if (time_stamp >= entry->start_time && time_stamp < entry->end_time) {
+ if (grain) {
+ *grain = entry->params;
+ if (time_stamp != 0) grain->random_seed = random_seed;
+ }
+ if (!erase) return 1;
+
+ const int64_t entry_end_time = entry->end_time;
+ if (time_stamp <= entry->start_time && end_time >= entry->end_time) {
+ if (t->tail == entry) t->tail = prev_entry;
+ if (prev_entry) {
+ prev_entry->next = entry->next;
+ } else {
+ t->head = entry->next;
+ }
+ aom_free(entry);
+ } else if (time_stamp <= entry->start_time &&
+ end_time < entry->end_time) {
+ entry->start_time = end_time;
+ } else if (time_stamp > entry->start_time &&
+ end_time >= entry->end_time) {
+ entry->end_time = time_stamp;
+ } else {
+ aom_film_grain_table_entry_t *new_entry =
+ aom_malloc(sizeof(*new_entry));
+ if (!new_entry) return 0;
+ new_entry->next = entry->next;
+ new_entry->start_time = end_time;
+ new_entry->end_time = entry->end_time;
+ new_entry->params = entry->params;
+ entry->next = new_entry;
+ entry->end_time = time_stamp;
+ if (t->tail == entry) t->tail = new_entry;
+ }
+ // If segments aren't aligned, delete from the beginning of subsequent
+ // segments
+ if (end_time > entry_end_time) {
+ // Ignoring the return value here is safe since we're erasing from the
+ // beginning of subsequent entries.
+ aom_film_grain_table_lookup(t, entry_end_time, end_time, /*erase=*/1,
+ NULL);
+ }
+ return 1;
+ }
+ prev_entry = entry;
+ entry = next;
+ }
+ return 0;
+}
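+
+// Illustrative usage sketch (not part of the library): fetch (and erase) the
+// grain parameters covering one frame's presentation interval. The interval
+// is given in the same time units that were used when appending entries.
+static INLINE int lookup_grain_sketch(aom_film_grain_table_t *table,
+                                      int64_t frame_start, int64_t frame_end,
+                                      aom_film_grain_t *grain) {
+  // Returns 1 and fills 'grain' if an entry covers frame_start, 0 otherwise.
+  return aom_film_grain_table_lookup(table, frame_start, frame_end,
+                                     /*erase=*/1, grain);
+}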
+
+aom_codec_err_t aom_film_grain_table_read(
+ aom_film_grain_table_t *t, const char *filename,
+ struct aom_internal_error_info *error_info) {
+ FILE *file = fopen(filename, "rb");
+ if (!file) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to open %s",
+ filename);
+ return error_info->error_code;
+ }
+ error_info->error_code = AOM_CODEC_OK;
+
+ // Read in one extra character as there should be white space after
+ // the header.
+ char magic[9];
+ if (!fread(magic, 9, 1, file) || memcmp(magic, kFileMagic, 8)) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR,
+ "Unable to read (or invalid) file magic");
+ fclose(file);
+ return error_info->error_code;
+ }
+
+ aom_film_grain_table_entry_t *prev_entry = NULL;
+ while (!feof(file)) {
+ aom_film_grain_table_entry_t *entry = aom_malloc(sizeof(*entry));
+ if (!entry) {
+ aom_internal_error(error_info, AOM_CODEC_MEM_ERROR,
+ "Unable to allocate grain table entry");
+ break;
+ }
+ memset(entry, 0, sizeof(*entry));
+ grain_table_entry_read(file, error_info, entry);
+ entry->next = NULL;
+
+ if (prev_entry) prev_entry->next = entry;
+ if (!t->head) t->head = entry;
+ t->tail = entry;
+ prev_entry = entry;
+
+ if (error_info->error_code != AOM_CODEC_OK) break;
+ }
+
+ fclose(file);
+ return error_info->error_code;
+}
+
+aom_codec_err_t aom_film_grain_table_write(
+ const aom_film_grain_table_t *t, const char *filename,
+ struct aom_internal_error_info *error_info) {
+ error_info->error_code = AOM_CODEC_OK;
+
+ FILE *file = fopen(filename, "wb");
+ if (!file) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR, "Unable to open file %s",
+ filename);
+ return error_info->error_code;
+ }
+
+ if (!fwrite(kFileMagic, 8, 1, file)) {
+ aom_internal_error(error_info, AOM_CODEC_ERROR,
+ "Unable to write file magic");
+ fclose(file);
+ return error_info->error_code;
+ }
+
+ fprintf(file, "\n");
+ aom_film_grain_table_entry_t *entry = t->head;
+ while (entry) {
+ grain_table_entry_write(file, entry);
+ entry = entry->next;
+ }
+ fclose(file);
+ return error_info->error_code;
+}
+
+void aom_film_grain_table_free(aom_film_grain_table_t *t) {
+ aom_film_grain_table_entry_t *entry = t->head;
+ while (entry) {
+ aom_film_grain_table_entry_t *next = entry->next;
+ aom_free(entry);
+ entry = next;
+ }
+ memset(t, 0, sizeof(*t));
+}
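+
+// Illustrative usage sketch (not part of the library): build a table covering
+// [t0, t1) with a single parameter set and write it out in the "filmgrn1"
+// format described at the top of this file. The output file name is a
+// placeholder.
+static INLINE aom_codec_err_t write_grain_table_sketch(
+    const aom_film_grain_t *grain, int64_t t0, int64_t t1,
+    struct aom_internal_error_info *error_info) {
+  aom_film_grain_table_t table;
+  memset(&table, 0, sizeof(table));
+  aom_film_grain_table_append(&table, t0, t1, grain);
+  const aom_codec_err_t err =
+      aom_film_grain_table_write(&table, "grain_table.txt", error_info);
+  aom_film_grain_table_free(&table);
+  return err;
+}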
diff --git a/third_party/aom/aom_dsp/grain_table.h b/third_party/aom/aom_dsp/grain_table.h
new file mode 100644
index 0000000000..49e84980ee
--- /dev/null
+++ b/third_party/aom/aom_dsp/grain_table.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*!\file
+ * \brief A table mapping from time to corresponding film grain parameters.
+ *
+ * In order to apply grain synthesis in the decoder, the film grain parameters
+ * need to be signalled in the encoder. The film grain parameters are time
+ * varying, and for two-pass encoding (and denoiser implementation flexibility)
+ * it is common to denoise the video and do parameter estimation before encoding
+ * the denoised video.
+ *
+ * The film grain table is used to provide this flexibility and is used as a
+ * parameter that is passed to the encoder.
+ *
+ * Further, if regraining is to be done in say a single pass mode, or in two
+ * pass within the encoder (before frames are added to the lookahead buffer),
+ * this data structure can be used to keep track of on-the-fly estimated grain
+ * parameters, which are then extracted from the table before the encoded frame
+ * is written.
+ */
+#ifndef AOM_AOM_DSP_GRAIN_TABLE_H_
+#define AOM_AOM_DSP_GRAIN_TABLE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "aom_dsp/grain_params.h"
+#include "aom/internal/aom_codec_internal.h"
+
+typedef struct aom_film_grain_table_entry_t {
+ aom_film_grain_t params;
+ int64_t start_time;
+ int64_t end_time;
+ struct aom_film_grain_table_entry_t *next;
+} aom_film_grain_table_entry_t;
+
+typedef struct {
+ aom_film_grain_table_entry_t *head;
+ aom_film_grain_table_entry_t *tail;
+} aom_film_grain_table_t;
+
+/*!\brief Add a mapping from [time_stamp, end_time) to the given grain
+ * parameters
+ *
+ * \param[in,out] table The grain table
+ * \param[in] time_stamp The start time stamp
+ * \param[in]    end_time    The end time stamp
+ * \param[in] grain The grain parameters
+ */
+void aom_film_grain_table_append(aom_film_grain_table_t *table,
+ int64_t time_stamp, int64_t end_time,
+ const aom_film_grain_t *grain);
+
+/*!\brief Look-up (and optionally erase) the grain parameters for the given time
+ *
+ * \param[in] table The grain table
+ * \param[in] time_stamp The start time stamp
+ * \param[in]    end_time    The end time stamp
+ * \param[in] erase Whether the time segment can be deleted
+ * \param[out] grain The output grain parameters
+ */
+int aom_film_grain_table_lookup(aom_film_grain_table_t *t, int64_t time_stamp,
+ int64_t end_time, int erase,
+ aom_film_grain_t *grain);
+
+/*!\brief Reads the grain table from a file.
+ *
+ * \param[out] table The grain table
+ * \param[in] filename The file to read from
+ * \param[in] error_info Error info for tracking errors
+ */
+aom_codec_err_t aom_film_grain_table_read(
+ aom_film_grain_table_t *table, const char *filename,
+ struct aom_internal_error_info *error_info);
+
+/*!\brief Writes the grain table to a file.
+ *
+ * \param[in]    table       The grain table to write
+ * \param[in]    filename    The file to write to
+ * \param[in]    error_info  Error info for tracking errors
+ */
+aom_codec_err_t aom_film_grain_table_write(
+ const aom_film_grain_table_t *t, const char *filename,
+ struct aom_internal_error_info *error_info);
+
+void aom_film_grain_table_free(aom_film_grain_table_t *t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AOM_DSP_GRAIN_TABLE_H_
diff --git a/third_party/aom/aom_dsp/intrapred.c b/third_party/aom/aom_dsp/intrapred.c
new file mode 100644
index 0000000000..6ec091f5f3
--- /dev/null
+++ b/third_party/aom/aom_dsp/intrapred.c
@@ -0,0 +1,793 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/intrapred_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/bitops.h"
+
+static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left) {
+ int r;
+ (void)left;
+
+ for (r = 0; r < bh; r++) {
+ memcpy(dst, above, bw);
+ dst += stride;
+ }
+}
+
+static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left) {
+ int r;
+ (void)above;
+
+ for (r = 0; r < bh; r++) {
+ memset(dst, left[r], bw);
+ dst += stride;
+ }
+}
+
+static INLINE int abs_diff(int a, int b) { return (a > b) ? a - b : b - a; }
+
+static INLINE uint16_t paeth_predictor_single(uint16_t left, uint16_t top,
+ uint16_t top_left) {
+ const int base = top + left - top_left;
+ const int p_left = abs_diff(base, left);
+ const int p_top = abs_diff(base, top);
+ const int p_top_left = abs_diff(base, top_left);
+
+ // Return nearest to base of left, top and top_left.
+ return (p_left <= p_top && p_left <= p_top_left) ? left
+ : (p_top <= p_top_left) ? top
+ : top_left;
+}
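+
+// Illustrative sketch (not part of the library): with left = 100, top = 60 and
+// top_left = 80, base = 100 + 60 - 80 = 80, giving distances 20, 20 and 0, so
+// the predictor selects top_left (80).
+static INLINE uint16_t paeth_example_sketch(void) {
+  return paeth_predictor_single(/*left=*/100, /*top=*/60, /*top_left=*/80);
+}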
+
+static INLINE void paeth_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint8_t *above,
+ const uint8_t *left) {
+ int r, c;
+ const uint8_t ytop_left = above[-1];
+
+ for (r = 0; r < bh; r++) {
+ for (c = 0; c < bw; c++)
+ dst[c] = (uint8_t)paeth_predictor_single(left[r], above[c], ytop_left);
+ dst += stride;
+ }
+}
+
+// Some basic checks on weights for smooth predictor.
+#define sm_weights_sanity_checks(weights_w, weights_h, weights_scale, \
+ pred_scale) \
+ assert(weights_w[0] < weights_scale); \
+ assert(weights_h[0] < weights_scale); \
+ assert(weights_scale - weights_w[bw - 1] < weights_scale); \
+ assert(weights_scale - weights_h[bh - 1] < weights_scale); \
+ assert(pred_scale < 31) // ensures no overflow when calculating predictor.
+
+#define divide_round(value, bits) (((value) + (1 << ((bits)-1))) >> (bits))
+
+static INLINE void smooth_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint8_t *above,
+ const uint8_t *left) {
+ const uint8_t below_pred = left[bh - 1]; // estimated by bottom-left pixel
+ const uint8_t right_pred = above[bw - 1]; // estimated by top-right pixel
+ const uint8_t *const sm_weights_w = smooth_weights + bw - 4;
+ const uint8_t *const sm_weights_h = smooth_weights + bh - 4;
+ // scale = 2 * 2^SMOOTH_WEIGHT_LOG2_SCALE
+ const int log2_scale = 1 + SMOOTH_WEIGHT_LOG2_SCALE;
+ const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ sm_weights_sanity_checks(sm_weights_w, sm_weights_h, scale,
+ log2_scale + sizeof(*dst));
+ int r;
+ for (r = 0; r < bh; ++r) {
+ int c;
+ for (c = 0; c < bw; ++c) {
+ const uint8_t pixels[] = { above[c], below_pred, left[r], right_pred };
+ const uint8_t weights[] = { sm_weights_h[r], scale - sm_weights_h[r],
+ sm_weights_w[c], scale - sm_weights_w[c] };
+ uint32_t this_pred = 0;
+ int i;
+ assert(scale >= sm_weights_h[r] && scale >= sm_weights_w[c]);
+ for (i = 0; i < 4; ++i) {
+ this_pred += weights[i] * pixels[i];
+ }
+ dst[c] = divide_round(this_pred, log2_scale);
+ }
+ dst += stride;
+ }
+}
+
+static INLINE void smooth_v_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint8_t *above,
+ const uint8_t *left) {
+ const uint8_t below_pred = left[bh - 1]; // estimated by bottom-left pixel
+ const uint8_t *const sm_weights = smooth_weights + bh - 4;
+ // scale = 2^SMOOTH_WEIGHT_LOG2_SCALE
+ const int log2_scale = SMOOTH_WEIGHT_LOG2_SCALE;
+ const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ sm_weights_sanity_checks(sm_weights, sm_weights, scale,
+ log2_scale + sizeof(*dst));
+
+ int r;
+ for (r = 0; r < bh; r++) {
+ int c;
+ for (c = 0; c < bw; ++c) {
+ const uint8_t pixels[] = { above[c], below_pred };
+ const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r] };
+ uint32_t this_pred = 0;
+ assert(scale >= sm_weights[r]);
+ int i;
+ for (i = 0; i < 2; ++i) {
+ this_pred += weights[i] * pixels[i];
+ }
+ dst[c] = divide_round(this_pred, log2_scale);
+ }
+ dst += stride;
+ }
+}
+
+static INLINE void smooth_h_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint8_t *above,
+ const uint8_t *left) {
+ const uint8_t right_pred = above[bw - 1]; // estimated by top-right pixel
+ const uint8_t *const sm_weights = smooth_weights + bw - 4;
+ // scale = 2^SMOOTH_WEIGHT_LOG2_SCALE
+ const int log2_scale = SMOOTH_WEIGHT_LOG2_SCALE;
+ const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ sm_weights_sanity_checks(sm_weights, sm_weights, scale,
+ log2_scale + sizeof(*dst));
+
+ int r;
+ for (r = 0; r < bh; r++) {
+ int c;
+ for (c = 0; c < bw; ++c) {
+ const uint8_t pixels[] = { left[r], right_pred };
+ const uint8_t weights[] = { sm_weights[c], scale - sm_weights[c] };
+ uint32_t this_pred = 0;
+ assert(scale >= sm_weights[c]);
+ int i;
+ for (i = 0; i < 2; ++i) {
+ this_pred += weights[i] * pixels[i];
+ }
+ dst[c] = divide_round(this_pred, log2_scale);
+ }
+ dst += stride;
+ }
+}
+
+static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint8_t *above,
+ const uint8_t *left) {
+ int r;
+ (void)above;
+ (void)left;
+
+ for (r = 0; r < bh; r++) {
+ memset(dst, 128, bw);
+ dst += stride;
+ }
+}
+
+static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint8_t *above,
+ const uint8_t *left) {
+ int i, r, expected_dc, sum = 0;
+ (void)above;
+
+ for (i = 0; i < bh; i++) sum += left[i];
+ expected_dc = (sum + (bh >> 1)) / bh;
+
+ for (r = 0; r < bh; r++) {
+ memset(dst, expected_dc, bw);
+ dst += stride;
+ }
+}
+
+static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint8_t *above,
+ const uint8_t *left) {
+ int i, r, expected_dc, sum = 0;
+ (void)left;
+
+ for (i = 0; i < bw; i++) sum += above[i];
+ expected_dc = (sum + (bw >> 1)) / bw;
+
+ for (r = 0; r < bh; r++) {
+ memset(dst, expected_dc, bw);
+ dst += stride;
+ }
+}
+
+static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left) {
+ int i, r, expected_dc, sum = 0;
+ const int count = bw + bh;
+
+ for (i = 0; i < bw; i++) {
+ sum += above[i];
+ }
+ for (i = 0; i < bh; i++) {
+ sum += left[i];
+ }
+
+ expected_dc = (sum + (count >> 1)) / count;
+
+ for (r = 0; r < bh; r++) {
+ memset(dst, expected_dc, bw);
+ dst += stride;
+ }
+}
+
+static INLINE int divide_using_multiply_shift(int num, int shift1,
+ int multiplier, int shift2) {
+ const int interm = num >> shift1;
+ return interm * multiplier >> shift2;
+}
+
+// The constants (multiplier and shifts) for a given block size are obtained
+// as follows:
+// - Let sum_w_h = block width + block height.
+// - Shift 'sum_w_h' right until we reach an odd number. Let the number of
+// shifts for that block size be called 'shift1' (see the parameter in
+// dc_predictor_rect() function), and let the odd number be 'd'. [d has only 2
+// possible values: d = 3 for a 1:2 rect block and d = 5 for a 1:4 rect
+// block].
+// - Find multipliers for (i) dividing by 3, and (ii) dividing by 5,
+// using the "Algorithm 1" in:
+// http://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=1467632
+// by ensuring that m + n = 16 (in that algorithm). This ensures that our 2nd
+// shift will be 16, regardless of the block size.
+
+// Note: For low bitdepth, assembly code may be optimized by using smaller
+// constants for smaller block sizes, where the range of the 'sum' is
+// restricted to fewer bits.
+
+#define DC_MULTIPLIER_1X2 0x5556
+#define DC_MULTIPLIER_1X4 0x3334
+
+#define DC_SHIFT2 16
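+
+// Illustrative sketch (not part of the library): for the 1:2 rectangular
+// blocks the odd divisor after 'shift1' is d = 3, and DC_MULTIPLIER_1X2 is
+// the 16-bit multiplier derived as described above, so for the pixel sums
+// that can occur in this file
+//   ((num >> shift1) * DC_MULTIPLIER_1X2) >> DC_SHIFT2 == (num >> shift1) / 3
+// The 1:4 case works the same way with d = 5 and DC_MULTIPLIER_1X4.
+static INLINE int dc_divide_by_3_sketch(int num_after_shift1) {
+  return (num_after_shift1 * DC_MULTIPLIER_1X2) >> DC_SHIFT2;
+}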
+
+static INLINE void dc_predictor_rect(uint8_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint8_t *above,
+ const uint8_t *left, int shift1,
+ int multiplier) {
+ int sum = 0;
+
+ for (int i = 0; i < bw; i++) {
+ sum += above[i];
+ }
+ for (int i = 0; i < bh; i++) {
+ sum += left[i];
+ }
+
+ const int expected_dc = divide_using_multiply_shift(
+ sum + ((bw + bh) >> 1), shift1, multiplier, DC_SHIFT2);
+ assert(expected_dc < (1 << 8));
+
+ for (int r = 0; r < bh; r++) {
+ memset(dst, expected_dc, bw);
+ dst += stride;
+ }
+}
+
+#undef DC_SHIFT2
+
+void aom_dc_predictor_4x8_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 4, 8, above, left, 2, DC_MULTIPLIER_1X2);
+}
+
+void aom_dc_predictor_8x4_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 8, 4, above, left, 2, DC_MULTIPLIER_1X2);
+}
+
+void aom_dc_predictor_4x16_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 4, 16, above, left, 2, DC_MULTIPLIER_1X4);
+}
+
+void aom_dc_predictor_16x4_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 16, 4, above, left, 2, DC_MULTIPLIER_1X4);
+}
+
+void aom_dc_predictor_8x16_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 8, 16, above, left, 3, DC_MULTIPLIER_1X2);
+}
+
+void aom_dc_predictor_16x8_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 16, 8, above, left, 3, DC_MULTIPLIER_1X2);
+}
+
+void aom_dc_predictor_8x32_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 8, 32, above, left, 3, DC_MULTIPLIER_1X4);
+}
+
+void aom_dc_predictor_32x8_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 32, 8, above, left, 3, DC_MULTIPLIER_1X4);
+}
+
+void aom_dc_predictor_16x32_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 16, 32, above, left, 4, DC_MULTIPLIER_1X2);
+}
+
+void aom_dc_predictor_32x16_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 32, 16, above, left, 4, DC_MULTIPLIER_1X2);
+}
+
+void aom_dc_predictor_16x64_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 16, 64, above, left, 4, DC_MULTIPLIER_1X4);
+}
+
+void aom_dc_predictor_64x16_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 64, 16, above, left, 4, DC_MULTIPLIER_1X4);
+}
+
+void aom_dc_predictor_32x64_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 32, 64, above, left, 5, DC_MULTIPLIER_1X2);
+}
+
+void aom_dc_predictor_64x32_c(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ dc_predictor_rect(dst, stride, 64, 32, above, left, 5, DC_MULTIPLIER_1X2);
+}
+
+#undef DC_MULTIPLIER_1X2
+#undef DC_MULTIPLIER_1X4
+
+static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r;
+ (void)left;
+ (void)bd;
+ for (r = 0; r < bh; r++) {
+ memcpy(dst, above, bw * sizeof(uint16_t));
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r;
+ (void)above;
+ (void)bd;
+ for (r = 0; r < bh; r++) {
+ aom_memset16(dst, left[r], bw);
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_paeth_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bw, int bh, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r, c;
+ const uint16_t ytop_left = above[-1];
+ (void)bd;
+
+ for (r = 0; r < bh; r++) {
+ for (c = 0; c < bw; c++)
+ dst[c] = paeth_predictor_single(left[r], above[c], ytop_left);
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_smooth_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bw, int bh,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)bd;
+ const uint16_t below_pred = left[bh - 1]; // estimated by bottom-left pixel
+ const uint16_t right_pred = above[bw - 1]; // estimated by top-right pixel
+ const uint8_t *const sm_weights_w = smooth_weights + bw - 4;
+ const uint8_t *const sm_weights_h = smooth_weights + bh - 4;
+ // scale = 2 * 2^SMOOTH_WEIGHT_LOG2_SCALE
+ const int log2_scale = 1 + SMOOTH_WEIGHT_LOG2_SCALE;
+ const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ sm_weights_sanity_checks(sm_weights_w, sm_weights_h, scale,
+ log2_scale + sizeof(*dst));
+ int r;
+ for (r = 0; r < bh; ++r) {
+ int c;
+ for (c = 0; c < bw; ++c) {
+ const uint16_t pixels[] = { above[c], below_pred, left[r], right_pred };
+ const uint8_t weights[] = { sm_weights_h[r], scale - sm_weights_h[r],
+ sm_weights_w[c], scale - sm_weights_w[c] };
+ uint32_t this_pred = 0;
+ int i;
+ assert(scale >= sm_weights_h[r] && scale >= sm_weights_w[c]);
+ for (i = 0; i < 4; ++i) {
+ this_pred += weights[i] * pixels[i];
+ }
+ dst[c] = divide_round(this_pred, log2_scale);
+ }
+ dst += stride;
+ }
+}
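For a single output pixel the loop above reduces to a fixed four-tap blend of
the top, bottom-left, left and top-right reference samples. The sketch below
spells that out for the high-bitdepth path; it assumes divide_round() is the
usual round-to-nearest shift, (v + (1 << (n - 1))) >> n, which matches how it
is used here, and smooth_one_pixel is a hypothetical helper, not part of the
patch.

// Compute one SMOOTH-predicted pixel from the four reference samples and the
// quadratic weights for this block size.
static uint16_t smooth_one_pixel(uint16_t above_c, uint16_t left_r,
                                 uint16_t bottom_left, uint16_t top_right,
                                 uint8_t w_row, uint8_t w_col) {
  const uint32_t scale = 1 << SMOOTH_WEIGHT_LOG2_SCALE;  // 256
  const uint32_t pred = w_row * (uint32_t)above_c +
                        (scale - w_row) * (uint32_t)bottom_left +
                        w_col * (uint32_t)left_r +
                        (scale - w_col) * (uint32_t)top_right;
  // The four weights sum to 2 * scale, so round and shift by log2(2 * scale).
  return (uint16_t)((pred + scale) >> (1 + SMOOTH_WEIGHT_LOG2_SCALE));
}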
+
+static INLINE void highbd_smooth_v_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bw, int bh,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)bd;
+ const uint16_t below_pred = left[bh - 1]; // estimated by bottom-left pixel
+ const uint8_t *const sm_weights = smooth_weights + bh - 4;
+ // scale = 2^SMOOTH_WEIGHT_LOG2_SCALE
+ const int log2_scale = SMOOTH_WEIGHT_LOG2_SCALE;
+ const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ sm_weights_sanity_checks(sm_weights, sm_weights, scale,
+ log2_scale + sizeof(*dst));
+
+ int r;
+ for (r = 0; r < bh; r++) {
+ int c;
+ for (c = 0; c < bw; ++c) {
+ const uint16_t pixels[] = { above[c], below_pred };
+ const uint8_t weights[] = { sm_weights[r], scale - sm_weights[r] };
+ uint32_t this_pred = 0;
+ assert(scale >= sm_weights[r]);
+ int i;
+ for (i = 0; i < 2; ++i) {
+ this_pred += weights[i] * pixels[i];
+ }
+ dst[c] = divide_round(this_pred, log2_scale);
+ }
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_smooth_h_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bw, int bh,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)bd;
+ const uint16_t right_pred = above[bw - 1]; // estimated by top-right pixel
+ const uint8_t *const sm_weights = smooth_weights + bw - 4;
+ // scale = 2^SMOOTH_WEIGHT_LOG2_SCALE
+ const int log2_scale = SMOOTH_WEIGHT_LOG2_SCALE;
+ const uint16_t scale = (1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ sm_weights_sanity_checks(sm_weights, sm_weights, scale,
+ log2_scale + sizeof(*dst));
+
+ int r;
+ for (r = 0; r < bh; r++) {
+ int c;
+ for (c = 0; c < bw; ++c) {
+ const uint16_t pixels[] = { left[r], right_pred };
+ const uint8_t weights[] = { sm_weights[c], scale - sm_weights[c] };
+ uint32_t this_pred = 0;
+ assert(scale >= sm_weights[c]);
+ int i;
+ for (i = 0; i < 2; ++i) {
+ this_pred += weights[i] * pixels[i];
+ }
+ dst[c] = divide_round(this_pred, log2_scale);
+ }
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bw, int bh,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int r;
+ (void)above;
+ (void)left;
+
+ for (r = 0; r < bh; r++) {
+ aom_memset16(dst, 128 << (bd - 8), bw);
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bw, int bh,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i, r, expected_dc, sum = 0;
+ (void)above;
+ (void)bd;
+
+ for (i = 0; i < bh; i++) sum += left[i];
+ expected_dc = (sum + (bh >> 1)) / bh;
+
+ for (r = 0; r < bh; r++) {
+ aom_memset16(dst, expected_dc, bw);
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride,
+ int bw, int bh,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i, r, expected_dc, sum = 0;
+ (void)left;
+ (void)bd;
+
+ for (i = 0; i < bw; i++) sum += above[i];
+ expected_dc = (sum + (bw >> 1)) / bw;
+
+ for (r = 0; r < bh; r++) {
+ aom_memset16(dst, expected_dc, bw);
+ dst += stride;
+ }
+}
+
+static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i, r, expected_dc, sum = 0;
+ const int count = bw + bh;
+ (void)bd;
+
+ for (i = 0; i < bw; i++) {
+ sum += above[i];
+ }
+ for (i = 0; i < bh; i++) {
+ sum += left[i];
+ }
+
+ expected_dc = (sum + (count >> 1)) / count;
+
+ for (r = 0; r < bh; r++) {
+ aom_memset16(dst, expected_dc, bw);
+ dst += stride;
+ }
+}
+
+// Obtained similarly to DC_MULTIPLIER_1X2 and DC_MULTIPLIER_1X4 above, but
+// assuming a 2nd shift of 17 bits instead of 16.
+// Note: Strictly speaking, the 2nd shift needs to be 17 only when:
+// - bit depth == 12, and
+// - bw + bh is divisible by 5 (as opposed to divisible by 3).
+// All other cases can use half the multipliers with a shift of 16 instead.
+// This special optimization can be used when writing assembly code.
+#define HIGHBD_DC_MULTIPLIER_1X2 0xAAAB
+// Note: This constant is odd, but a smaller even constant (0x199a) with the
+// appropriate shift should work for neon in 8/10-bit.
+#define HIGHBD_DC_MULTIPLIER_1X4 0x6667
+
+#define HIGHBD_DC_SHIFT2 17
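One way to sanity-check these constants is to compare the multiply-and-shift
result against plain integer division over the full 12-bit input range. The
standalone sketch below assumes divide_using_multiply_shift() evaluates
((num >> shift1) * multiplier) >> shift2, as its name and the call sites
suggest, and uses the 16x8 block (width + height = 24, shift1 = 3) as an
example; the assert never fires for any valid sum.

#include <assert.h>
#include <stdint.h>

static uint32_t div_mul_shift(uint32_t num, int shift1, uint32_t mult,
                              int shift2) {
  return ((num >> shift1) * mult) >> shift2;
}

int main(void) {
  const int bw = 16, bh = 8, shift1 = 3;  // width + height = 24 = 3 << 3
  const uint32_t mult = 0xAAAB;           // HIGHBD_DC_MULTIPLIER_1X2
  const int max_sum = (bw + bh) * 4095;   // 12-bit samples
  for (int sum = 0; sum <= max_sum; ++sum) {
    const uint32_t num = sum + ((bw + bh) >> 1);
    assert(div_mul_shift(num, shift1, mult, 17) == num / (uint32_t)(bw + bh));
  }
  return 0;
}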
+
+static INLINE void highbd_dc_predictor_rect(uint16_t *dst, ptrdiff_t stride,
+ int bw, int bh,
+ const uint16_t *above,
+ const uint16_t *left, int bd,
+ int shift1, uint32_t multiplier) {
+ int sum = 0;
+ (void)bd;
+
+ for (int i = 0; i < bw; i++) {
+ sum += above[i];
+ }
+ for (int i = 0; i < bh; i++) {
+ sum += left[i];
+ }
+
+ const int expected_dc = divide_using_multiply_shift(
+ sum + ((bw + bh) >> 1), shift1, multiplier, HIGHBD_DC_SHIFT2);
+ assert(expected_dc < (1 << bd));
+
+ for (int r = 0; r < bh; r++) {
+ aom_memset16(dst, expected_dc, bw);
+ dst += stride;
+ }
+}
+
+#undef HIGHBD_DC_SHIFT2
+
+void aom_highbd_dc_predictor_4x8_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bd) {
+ highbd_dc_predictor_rect(dst, stride, 4, 8, above, left, bd, 2,
+ HIGHBD_DC_MULTIPLIER_1X2);
+}
+
+void aom_highbd_dc_predictor_8x4_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bd) {
+ highbd_dc_predictor_rect(dst, stride, 8, 4, above, left, bd, 2,
+ HIGHBD_DC_MULTIPLIER_1X2);
+}
+
+void aom_highbd_dc_predictor_4x16_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bd) {
+ highbd_dc_predictor_rect(dst, stride, 4, 16, above, left, bd, 2,
+ HIGHBD_DC_MULTIPLIER_1X4);
+}
+
+void aom_highbd_dc_predictor_16x4_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bd) {
+ highbd_dc_predictor_rect(dst, stride, 16, 4, above, left, bd, 2,
+ HIGHBD_DC_MULTIPLIER_1X4);
+}
+
+void aom_highbd_dc_predictor_8x16_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bd) {
+ highbd_dc_predictor_rect(dst, stride, 8, 16, above, left, bd, 3,
+ HIGHBD_DC_MULTIPLIER_1X2);
+}
+
+void aom_highbd_dc_predictor_16x8_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bd) {
+ highbd_dc_predictor_rect(dst, stride, 16, 8, above, left, bd, 3,
+ HIGHBD_DC_MULTIPLIER_1X2);
+}
+
+void aom_highbd_dc_predictor_8x32_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bd) {
+ highbd_dc_predictor_rect(dst, stride, 8, 32, above, left, bd, 3,
+ HIGHBD_DC_MULTIPLIER_1X4);
+}
+
+void aom_highbd_dc_predictor_32x8_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above, const uint16_t *left,
+ int bd) {
+ highbd_dc_predictor_rect(dst, stride, 32, 8, above, left, bd, 3,
+ HIGHBD_DC_MULTIPLIER_1X4);
+}
+
+void aom_highbd_dc_predictor_16x32_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ highbd_dc_predictor_rect(dst, stride, 16, 32, above, left, bd, 4,
+ HIGHBD_DC_MULTIPLIER_1X2);
+}
+
+void aom_highbd_dc_predictor_32x16_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ highbd_dc_predictor_rect(dst, stride, 32, 16, above, left, bd, 4,
+ HIGHBD_DC_MULTIPLIER_1X2);
+}
+
+void aom_highbd_dc_predictor_16x64_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ highbd_dc_predictor_rect(dst, stride, 16, 64, above, left, bd, 4,
+ HIGHBD_DC_MULTIPLIER_1X4);
+}
+
+void aom_highbd_dc_predictor_64x16_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ highbd_dc_predictor_rect(dst, stride, 64, 16, above, left, bd, 4,
+ HIGHBD_DC_MULTIPLIER_1X4);
+}
+
+void aom_highbd_dc_predictor_32x64_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ highbd_dc_predictor_rect(dst, stride, 32, 64, above, left, bd, 5,
+ HIGHBD_DC_MULTIPLIER_1X2);
+}
+
+void aom_highbd_dc_predictor_64x32_c(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ highbd_dc_predictor_rect(dst, stride, 64, 32, above, left, bd, 5,
+ HIGHBD_DC_MULTIPLIER_1X2);
+}
+
+#undef HIGHBD_DC_MULTIPLIER_1X2
+#undef HIGHBD_DC_MULTIPLIER_1X4
+
+// These macros serve as wrappers so that all the prediction functions can be
+// unified and accessed through a pointer array. Note that the above and left
+// boundaries are not necessarily used every time; an example expansion is
+// given after the size instantiations below.
+#define intra_pred_sized(type, width, height) \
+ void aom_##type##_predictor_##width##x##height##_c( \
+ uint8_t *dst, ptrdiff_t stride, const uint8_t *above, \
+ const uint8_t *left) { \
+ type##_predictor(dst, stride, width, height, above, left); \
+ }
+
+#define intra_pred_highbd_sized(type, width, height) \
+ void aom_highbd_##type##_predictor_##width##x##height##_c( \
+ uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
+ const uint16_t *left, int bd) { \
+ highbd_##type##_predictor(dst, stride, width, height, above, left, bd); \
+ }
+
+/* clang-format off */
+#define intra_pred_rectangular(type) \
+ intra_pred_sized(type, 4, 8) \
+ intra_pred_sized(type, 8, 4) \
+ intra_pred_sized(type, 8, 16) \
+ intra_pred_sized(type, 16, 8) \
+ intra_pred_sized(type, 16, 32) \
+ intra_pred_sized(type, 32, 16) \
+ intra_pred_sized(type, 32, 64) \
+ intra_pred_sized(type, 64, 32) \
+ intra_pred_sized(type, 4, 16) \
+ intra_pred_sized(type, 16, 4) \
+ intra_pred_sized(type, 8, 32) \
+ intra_pred_sized(type, 32, 8) \
+ intra_pred_sized(type, 16, 64) \
+ intra_pred_sized(type, 64, 16) \
+ intra_pred_highbd_sized(type, 4, 8) \
+ intra_pred_highbd_sized(type, 8, 4) \
+ intra_pred_highbd_sized(type, 8, 16) \
+ intra_pred_highbd_sized(type, 16, 8) \
+ intra_pred_highbd_sized(type, 16, 32) \
+ intra_pred_highbd_sized(type, 32, 16) \
+ intra_pred_highbd_sized(type, 32, 64) \
+ intra_pred_highbd_sized(type, 64, 32) \
+ intra_pred_highbd_sized(type, 4, 16) \
+ intra_pred_highbd_sized(type, 16, 4) \
+ intra_pred_highbd_sized(type, 8, 32) \
+ intra_pred_highbd_sized(type, 32, 8) \
+ intra_pred_highbd_sized(type, 16, 64) \
+ intra_pred_highbd_sized(type, 64, 16)
+
+#define intra_pred_above_4x4(type) \
+ intra_pred_sized(type, 8, 8) \
+ intra_pred_sized(type, 16, 16) \
+ intra_pred_sized(type, 32, 32) \
+ intra_pred_sized(type, 64, 64) \
+ intra_pred_highbd_sized(type, 4, 4) \
+ intra_pred_highbd_sized(type, 8, 8) \
+ intra_pred_highbd_sized(type, 16, 16) \
+ intra_pred_highbd_sized(type, 32, 32) \
+ intra_pred_highbd_sized(type, 64, 64) \
+ intra_pred_rectangular(type)
+#define intra_pred_allsizes(type) \
+ intra_pred_sized(type, 4, 4) \
+ intra_pred_above_4x4(type)
+#define intra_pred_square(type) \
+ intra_pred_sized(type, 4, 4) \
+ intra_pred_sized(type, 8, 8) \
+ intra_pred_sized(type, 16, 16) \
+ intra_pred_sized(type, 32, 32) \
+ intra_pred_sized(type, 64, 64) \
+ intra_pred_highbd_sized(type, 4, 4) \
+ intra_pred_highbd_sized(type, 8, 8) \
+ intra_pred_highbd_sized(type, 16, 16) \
+ intra_pred_highbd_sized(type, 32, 32) \
+ intra_pred_highbd_sized(type, 64, 64)
+
+intra_pred_allsizes(v)
+intra_pred_allsizes(h)
+intra_pred_allsizes(smooth)
+intra_pred_allsizes(smooth_v)
+intra_pred_allsizes(smooth_h)
+intra_pred_allsizes(paeth)
+intra_pred_allsizes(dc_128)
+intra_pred_allsizes(dc_left)
+intra_pred_allsizes(dc_top)
+intra_pred_square(dc)
+/* clang-format on */
+#undef intra_pred_allsizes
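As a concrete illustration of what these macros generate, expanding
intra_pred_sized(v, 4, 4) (part of intra_pred_allsizes(v) above) yields a thin
wrapper of the following shape, forwarding to the file-local v_predictor():

void aom_v_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride,
                           const uint8_t *above, const uint8_t *left) {
  v_predictor(dst, stride, 4, 4, above, left);
}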
diff --git a/third_party/aom/aom_dsp/intrapred_common.h b/third_party/aom/aom_dsp/intrapred_common.h
new file mode 100644
index 0000000000..6172224be1
--- /dev/null
+++ b/third_party/aom/aom_dsp/intrapred_common.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_INTRAPRED_COMMON_H_
+#define AOM_AOM_DSP_INTRAPRED_COMMON_H_
+
+#include "config/aom_config.h"
+
+// Weights are quadratic from '1' to '1 / block_size', scaled by
+// 2^SMOOTH_WEIGHT_LOG2_SCALE.
+#define SMOOTH_WEIGHT_LOG2_SCALE 8
+
+// Note these arrays are aligned to ensure NEON loads using a cast to uint32_t*
+// have sufficient alignment. Using 8 preserves the potential for an alignment
+// hint in load_weight_w8(). For that case, this could be increased to 16 to
+// allow an aligned load in x86.
+DECLARE_ALIGNED(8, static const uint8_t, smooth_weights[]) = {
+ // bs = 4
+ 255, 149, 85, 64,
+ // bs = 8
+ 255, 197, 146, 105, 73, 50, 37, 32,
+ // bs = 16
+ 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
+ // bs = 32
+ 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
+ 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
+ // bs = 64
+ 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
+ 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, 69,
+ 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15,
+ 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4
+};
+
+DECLARE_ALIGNED(8, static const uint16_t, smooth_weights_u16[]) = {
+ // block dimension = 4
+ 255, 149, 85, 64,
+ // block dimension = 8
+ 255, 197, 146, 105, 73, 50, 37, 32,
+ // block dimension = 16
+ 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
+ // block dimension = 32
+ 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
+ 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
+ // block dimension = 64
+ 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
+ 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73, 69,
+ 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16, 15,
+ 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4
+};
+
+#endif // AOM_AOM_DSP_INTRAPRED_COMMON_H_
diff --git a/third_party/aom/aom_dsp/loopfilter.c b/third_party/aom/aom_dsp/loopfilter.c
new file mode 100644
index 0000000000..075f13689c
--- /dev/null
+++ b/third_party/aom/aom_dsp/loopfilter.c
@@ -0,0 +1,997 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/mem.h"
+
+static INLINE int8_t signed_char_clamp(int t) {
+ return (int8_t)clamp(t, -128, 127);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE int16_t signed_char_clamp_high(int t, int bd) {
+ switch (bd) {
+ case 10: return (int16_t)clamp(t, -128 * 4, 128 * 4 - 1);
+ case 12: return (int16_t)clamp(t, -128 * 16, 128 * 16 - 1);
+ case 8:
+ default: return (int16_t)clamp(t, -128, 128 - 1);
+ }
+}
+#endif
+
+// Should we apply any filter at all: 11111111 yes, 00000000 no?
+static INLINE int8_t filter_mask2(uint8_t limit, uint8_t blimit, uint8_t p1,
+ uint8_t p0, uint8_t q0, uint8_t q1) {
+ int8_t mask = 0;
+ mask |= (abs(p1 - p0) > limit) * -1;
+ mask |= (abs(q1 - q0) > limit) * -1;
+ mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ return ~mask;
+}
+
+static INLINE int8_t filter_mask(uint8_t limit, uint8_t blimit, uint8_t p3,
+ uint8_t p2, uint8_t p1, uint8_t p0, uint8_t q0,
+ uint8_t q1, uint8_t q2, uint8_t q3) {
+ int8_t mask = 0;
+ mask |= (abs(p3 - p2) > limit) * -1;
+ mask |= (abs(p2 - p1) > limit) * -1;
+ mask |= (abs(p1 - p0) > limit) * -1;
+ mask |= (abs(q1 - q0) > limit) * -1;
+ mask |= (abs(q2 - q1) > limit) * -1;
+ mask |= (abs(q3 - q2) > limit) * -1;
+ mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ return ~mask;
+}
+
+static INLINE int8_t filter_mask3_chroma(uint8_t limit, uint8_t blimit,
+ uint8_t p2, uint8_t p1, uint8_t p0,
+ uint8_t q0, uint8_t q1, uint8_t q2) {
+ int8_t mask = 0;
+ mask |= (abs(p2 - p1) > limit) * -1;
+ mask |= (abs(p1 - p0) > limit) * -1;
+ mask |= (abs(q1 - q0) > limit) * -1;
+ mask |= (abs(q2 - q1) > limit) * -1;
+ mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ return ~mask;
+}
+
+static INLINE int8_t flat_mask3_chroma(uint8_t thresh, uint8_t p2, uint8_t p1,
+ uint8_t p0, uint8_t q0, uint8_t q1,
+ uint8_t q2) {
+ int8_t mask = 0;
+ mask |= (abs(p1 - p0) > thresh) * -1;
+ mask |= (abs(q1 - q0) > thresh) * -1;
+ mask |= (abs(p2 - p0) > thresh) * -1;
+ mask |= (abs(q2 - q0) > thresh) * -1;
+ return ~mask;
+}
+
+static INLINE int8_t flat_mask4(uint8_t thresh, uint8_t p3, uint8_t p2,
+ uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
+ uint8_t q2, uint8_t q3) {
+ int8_t mask = 0;
+ mask |= (abs(p1 - p0) > thresh) * -1;
+ mask |= (abs(q1 - q0) > thresh) * -1;
+ mask |= (abs(p2 - p0) > thresh) * -1;
+ mask |= (abs(q2 - q0) > thresh) * -1;
+ mask |= (abs(p3 - p0) > thresh) * -1;
+ mask |= (abs(q3 - q0) > thresh) * -1;
+ return ~mask;
+}
+
+// Is there high edge variance at the internal edge: 11111111 yes, 00000000 no?
+static INLINE int8_t hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0,
+ uint8_t q0, uint8_t q1) {
+ int8_t hev = 0;
+ hev |= (abs(p1 - p0) > thresh) * -1;
+ hev |= (abs(q1 - q0) > thresh) * -1;
+ return hev;
+}
+
+static INLINE void filter4(int8_t mask, uint8_t thresh, uint8_t *op1,
+ uint8_t *op0, uint8_t *oq0, uint8_t *oq1) {
+ int8_t filter1, filter2;
+
+ const int8_t ps1 = (int8_t)(*op1 ^ 0x80);
+ const int8_t ps0 = (int8_t)(*op0 ^ 0x80);
+ const int8_t qs0 = (int8_t)(*oq0 ^ 0x80);
+ const int8_t qs1 = (int8_t)(*oq1 ^ 0x80);
+ const int8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1);
+
+ // add outer taps if we have high edge variance
+ int8_t filter = signed_char_clamp(ps1 - qs1) & hev;
+
+ // inner taps
+ filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask;
+
+ // Save the bottom 3 bits so that we round one side with +4 and the other
+ // with +3; a raw value of magnitude 4 thus adjusts only one of q0/p0, since
+ // the other side would round the opposite way (worked values follow below).
+ filter1 = signed_char_clamp(filter + 4) >> 3;
+ filter2 = signed_char_clamp(filter + 3) >> 3;
+
+ *oq0 = (uint8_t)(signed_char_clamp(qs0 - filter1) ^ 0x80);
+ *op0 = (uint8_t)(signed_char_clamp(ps0 + filter2) ^ 0x80);
+
+ // outer tap adjustments
+ filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
+
+ *oq1 = (uint8_t)(signed_char_clamp(qs1 - filter) ^ 0x80);
+ *op1 = (uint8_t)(signed_char_clamp(ps1 + filter) ^ 0x80);
+}
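A few worked values for the two rounding shifts above, assuming an arithmetic
right shift of negative values (which this code relies on); note how a raw
filter value of magnitude 4 moves only one of the two center pixels:

#include <stdio.h>

int main(void) {
  const int samples[] = { 7, 4, 0, -4, -7 };
  for (int i = 0; i < 5; ++i) {
    const int f = samples[i];
    const int filter1 = (f + 4) >> 3;  // subtracted from q0
    const int filter2 = (f + 3) >> 3;  // added to p0
    printf("filter=%3d -> q0 -= %2d, p0 += %2d\n", f, filter1, filter2);
  }
  return 0;
}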
+
+void aom_lpf_horizontal_4_c(uint8_t *s, int p /* pitch */,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh) {
+ int i;
+ int count = 4;
+
+ // loop filter designed to work using chars so that we can make maximum use
+ // of 8 bit simd instructions.
+ for (i = 0; i < count; ++i) {
+ const uint8_t p1 = s[-2 * p], p0 = s[-p];
+ const uint8_t q0 = s[0 * p], q1 = s[1 * p];
+ const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1);
+ filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p);
+ ++s;
+ }
+}
+
+void aom_lpf_horizontal_4_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0);
+ aom_lpf_horizontal_4_c(s + 4, p, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_horizontal_4_quad_c(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0) {
+ aom_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0);
+ aom_lpf_horizontal_4_c(s + 4, p, blimit0, limit0, thresh0);
+ aom_lpf_horizontal_4_c(s + 8, p, blimit0, limit0, thresh0);
+ aom_lpf_horizontal_4_c(s + 12, p, blimit0, limit0, thresh0);
+}
+
+void aom_lpf_vertical_4_c(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ int i;
+ int count = 4;
+
+ // loop filter designed to work using chars so that we can make maximum use
+ // of 8 bit simd instructions.
+ for (i = 0; i < count; ++i) {
+ const uint8_t p1 = s[-2], p0 = s[-1];
+ const uint8_t q0 = s[0], q1 = s[1];
+ const int8_t mask = filter_mask2(*limit, *blimit, p1, p0, q0, q1);
+ filter4(mask, *thresh, s - 2, s - 1, s, s + 1);
+ s += pitch;
+ }
+}
+
+void aom_lpf_vertical_4_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0);
+ aom_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_vertical_4_quad_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0) {
+ aom_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0);
+ aom_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit0, limit0, thresh0);
+ aom_lpf_vertical_4_c(s + 8 * pitch, pitch, blimit0, limit0, thresh0);
+ aom_lpf_vertical_4_c(s + 12 * pitch, pitch, blimit0, limit0, thresh0);
+}
+
+static INLINE void filter6(int8_t mask, uint8_t thresh, int8_t flat,
+ uint8_t *op2, uint8_t *op1, uint8_t *op0,
+ uint8_t *oq0, uint8_t *oq1, uint8_t *oq2) {
+ if (flat && mask) {
+ const uint8_t p2 = *op2, p1 = *op1, p0 = *op0;
+ const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2;
+
+ // 5-tap filter [1, 2, 2, 2, 1]
+ *op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3);
+ *op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3);
+ *oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3);
+ *oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3);
+ } else {
+ filter4(mask, thresh, op1, op0, oq0, oq1);
+ }
+}
+
+static INLINE void filter8(int8_t mask, uint8_t thresh, int8_t flat,
+ uint8_t *op3, uint8_t *op2, uint8_t *op1,
+ uint8_t *op0, uint8_t *oq0, uint8_t *oq1,
+ uint8_t *oq2, uint8_t *oq3) {
+ if (flat && mask) {
+ const uint8_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+ const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
+
+ // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
+ *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
+ *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
+ *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
+ *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
+ *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
+ *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
+ } else {
+ filter4(mask, thresh, op1, op0, oq0, oq1);
+ }
+}
+
+void aom_lpf_horizontal_6_c(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ int i;
+ int count = 4;
+
+ // loop filter designed to work using chars so that we can make maximum use
+ // of 8 bit simd instructions.
+ for (i = 0; i < count; ++i) {
+ const uint8_t p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+ const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p];
+
+ const int8_t mask =
+ filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2);
+ const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2);
+ filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p,
+ s + 2 * p);
+ ++s;
+ }
+}
+
+void aom_lpf_horizontal_6_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0);
+ aom_lpf_horizontal_6_c(s + 4, p, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_horizontal_6_quad_c(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0) {
+ aom_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0);
+ aom_lpf_horizontal_6_c(s + 4, p, blimit0, limit0, thresh0);
+ aom_lpf_horizontal_6_c(s + 8, p, blimit0, limit0, thresh0);
+ aom_lpf_horizontal_6_c(s + 12, p, blimit0, limit0, thresh0);
+}
+
+void aom_lpf_horizontal_8_c(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ int i;
+ int count = 4;
+
+ // loop filter designed to work using chars so that we can make maximum use
+ // of 8 bit simd instructions.
+ for (i = 0; i < count; ++i) {
+ const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+ const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+
+ const int8_t mask =
+ filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+ filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s,
+ s + 1 * p, s + 2 * p, s + 3 * p);
+ ++s;
+ }
+}
+
+void aom_lpf_horizontal_8_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0);
+ aom_lpf_horizontal_8_c(s + 4, p, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_horizontal_8_quad_c(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0) {
+ aom_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0);
+ aom_lpf_horizontal_8_c(s + 4, p, blimit0, limit0, thresh0);
+ aom_lpf_horizontal_8_c(s + 8, p, blimit0, limit0, thresh0);
+ aom_lpf_horizontal_8_c(s + 12, p, blimit0, limit0, thresh0);
+}
+
+void aom_lpf_vertical_6_c(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ int i;
+ int count = 4;
+
+ for (i = 0; i < count; ++i) {
+ const uint8_t p2 = s[-3], p1 = s[-2], p0 = s[-1];
+ const uint8_t q0 = s[0], q1 = s[1], q2 = s[2];
+ const int8_t mask =
+ filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2);
+ const int8_t flat = flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2);
+ filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2);
+ s += pitch;
+ }
+}
+
+void aom_lpf_vertical_6_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0);
+ aom_lpf_vertical_6_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_vertical_6_quad_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0) {
+ aom_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0);
+ aom_lpf_vertical_6_c(s + 4 * pitch, pitch, blimit0, limit0, thresh0);
+ aom_lpf_vertical_6_c(s + 8 * pitch, pitch, blimit0, limit0, thresh0);
+ aom_lpf_vertical_6_c(s + 12 * pitch, pitch, blimit0, limit0, thresh0);
+}
+
+void aom_lpf_vertical_8_c(uint8_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ int i;
+ int count = 4;
+
+ for (i = 0; i < count; ++i) {
+ const uint8_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+ const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+ const int8_t mask =
+ filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+ filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1, s + 2,
+ s + 3);
+ s += pitch;
+ }
+}
+
+void aom_lpf_vertical_8_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ aom_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0);
+ aom_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1);
+}
+
+void aom_lpf_vertical_8_quad_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0) {
+ aom_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0);
+ aom_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit0, limit0, thresh0);
+ aom_lpf_vertical_8_c(s + 8 * pitch, pitch, blimit0, limit0, thresh0);
+ aom_lpf_vertical_8_c(s + 12 * pitch, pitch, blimit0, limit0, thresh0);
+}
+
+static INLINE void filter14(int8_t mask, uint8_t thresh, int8_t flat,
+ int8_t flat2, uint8_t *op6, uint8_t *op5,
+ uint8_t *op4, uint8_t *op3, uint8_t *op2,
+ uint8_t *op1, uint8_t *op0, uint8_t *oq0,
+ uint8_t *oq1, uint8_t *oq2, uint8_t *oq3,
+ uint8_t *oq4, uint8_t *oq5, uint8_t *oq6) {
+ if (flat2 && flat && mask) {
+ const uint8_t p6 = *op6, p5 = *op5, p4 = *op4, p3 = *op3, p2 = *op2,
+ p1 = *op1, p0 = *op0;
+ const uint8_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3, q4 = *oq4,
+ q5 = *oq5, q6 = *oq6;
+
+ // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1]
+ *op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0,
+ 4);
+ *op4 = ROUND_POWER_OF_TWO(
+ p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4);
+ *op3 = ROUND_POWER_OF_TWO(
+ p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4);
+ *op2 = ROUND_POWER_OF_TWO(
+ p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3,
+ 4);
+ *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 +
+ q0 + q1 + q2 + q3 + q4,
+ 4);
+ *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
+ q0 * 2 + q1 + q2 + q3 + q4 + q5,
+ 4);
+ *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
+ q1 * 2 + q2 + q3 + q4 + q5 + q6,
+ 4);
+ *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
+ q2 * 2 + q3 + q4 + q5 + q6 * 2,
+ 4);
+ *oq2 = ROUND_POWER_OF_TWO(
+ p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3,
+ 4);
+ *oq3 = ROUND_POWER_OF_TWO(
+ p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4);
+ *oq4 = ROUND_POWER_OF_TWO(
+ p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4);
+ *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7,
+ 4);
+ } else {
+ filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3);
+ }
+}
+
+static void mb_lpf_horizontal_edge_w(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int count) {
+ int i;
+ int step = 4;
+
+ // loop filter designed to work using chars so that we can make maximum use
+ // of 8 bit simd instructions.
+ for (i = 0; i < step * count; ++i) {
+ const uint8_t p6 = s[-7 * p], p5 = s[-6 * p], p4 = s[-5 * p],
+ p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+ const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p],
+ q4 = s[4 * p], q5 = s[5 * p], q6 = s[6 * p];
+ const int8_t mask =
+ filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6);
+
+ filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p,
+ s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p,
+ s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p);
+ ++s;
+ }
+}
+
+void aom_lpf_horizontal_14_c(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ mb_lpf_horizontal_edge_w(s, p, blimit, limit, thresh, 1);
+}
+
+void aom_lpf_horizontal_14_dual_c(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1);
+ mb_lpf_horizontal_edge_w(s + 4, p, blimit1, limit1, thresh1, 1);
+}
+
+void aom_lpf_horizontal_14_quad_c(uint8_t *s, int p, const uint8_t *blimit0,
+ const uint8_t *limit0,
+ const uint8_t *thresh0) {
+ mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1);
+ mb_lpf_horizontal_edge_w(s + 4, p, blimit0, limit0, thresh0, 1);
+ mb_lpf_horizontal_edge_w(s + 8, p, blimit0, limit0, thresh0, 1);
+ mb_lpf_horizontal_edge_w(s + 12, p, blimit0, limit0, thresh0, 1);
+}
+
+static void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int count) {
+ int i;
+
+ for (i = 0; i < count; ++i) {
+ const uint8_t p6 = s[-7], p5 = s[-6], p4 = s[-5], p3 = s[-4], p2 = s[-3],
+ p1 = s[-2], p0 = s[-1];
+ const uint8_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3], q4 = s[4],
+ q5 = s[5], q6 = s[6];
+ const int8_t mask =
+ filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+ const int8_t flat2 = flat_mask4(1, p6, p5, p4, p0, q0, q4, q5, q6);
+
+ filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4, s - 3,
+ s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5, s + 6);
+ s += p;
+ }
+}
+
+void aom_lpf_vertical_14_c(uint8_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4);
+}
+
+void aom_lpf_vertical_14_dual_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0,
+ const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1) {
+ mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4);
+ mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1, 4);
+}
+
+void aom_lpf_vertical_14_quad_c(uint8_t *s, int pitch, const uint8_t *blimit0,
+ const uint8_t *limit0, const uint8_t *thresh0) {
+ mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4);
+ mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit0, limit0, thresh0, 4);
+ mb_lpf_vertical_edge_w(s + 8 * pitch, pitch, blimit0, limit0, thresh0, 4);
+ mb_lpf_vertical_edge_w(s + 12 * pitch, pitch, blimit0, limit0, thresh0, 4);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// Should we apply any filter at all: 11111111 yes, 00000000 no?
+static INLINE int8_t highbd_filter_mask2(uint8_t limit, uint8_t blimit,
+ uint16_t p1, uint16_t p0, uint16_t q0,
+ uint16_t q1, int bd) {
+ int8_t mask = 0;
+ int16_t limit16 = (uint16_t)limit << (bd - 8);
+ int16_t blimit16 = (uint16_t)blimit << (bd - 8);
+ mask |= (abs(p1 - p0) > limit16) * -1;
+ mask |= (abs(q1 - q0) > limit16) * -1;
+ mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
+ return ~mask;
+}
+
+// Should we apply any filter at all: 11111111 yes, 00000000 no?
+static INLINE int8_t highbd_filter_mask(uint8_t limit, uint8_t blimit,
+ uint16_t p3, uint16_t p2, uint16_t p1,
+ uint16_t p0, uint16_t q0, uint16_t q1,
+ uint16_t q2, uint16_t q3, int bd) {
+ int8_t mask = 0;
+ int16_t limit16 = (uint16_t)limit << (bd - 8);
+ int16_t blimit16 = (uint16_t)blimit << (bd - 8);
+ mask |= (abs(p3 - p2) > limit16) * -1;
+ mask |= (abs(p2 - p1) > limit16) * -1;
+ mask |= (abs(p1 - p0) > limit16) * -1;
+ mask |= (abs(q1 - q0) > limit16) * -1;
+ mask |= (abs(q2 - q1) > limit16) * -1;
+ mask |= (abs(q3 - q2) > limit16) * -1;
+ mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
+ return ~mask;
+}
+
+static INLINE int8_t highbd_filter_mask3_chroma(uint8_t limit, uint8_t blimit,
+ uint16_t p2, uint16_t p1,
+ uint16_t p0, uint16_t q0,
+ uint16_t q1, uint16_t q2,
+ int bd) {
+ int8_t mask = 0;
+ int16_t limit16 = (uint16_t)limit << (bd - 8);
+ int16_t blimit16 = (uint16_t)blimit << (bd - 8);
+ mask |= (abs(p2 - p1) > limit16) * -1;
+ mask |= (abs(p1 - p0) > limit16) * -1;
+ mask |= (abs(q1 - q0) > limit16) * -1;
+ mask |= (abs(q2 - q1) > limit16) * -1;
+ mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit16) * -1;
+ return ~mask;
+}
+
+static INLINE int8_t highbd_flat_mask3_chroma(uint8_t thresh, uint16_t p2,
+ uint16_t p1, uint16_t p0,
+ uint16_t q0, uint16_t q1,
+ uint16_t q2, int bd) {
+ int8_t mask = 0;
+ int16_t thresh16 = (uint16_t)thresh << (bd - 8);
+ mask |= (abs(p1 - p0) > thresh16) * -1;
+ mask |= (abs(q1 - q0) > thresh16) * -1;
+ mask |= (abs(p2 - p0) > thresh16) * -1;
+ mask |= (abs(q2 - q0) > thresh16) * -1;
+ return ~mask;
+}
+
+static INLINE int8_t highbd_flat_mask4(uint8_t thresh, uint16_t p3, uint16_t p2,
+ uint16_t p1, uint16_t p0, uint16_t q0,
+ uint16_t q1, uint16_t q2, uint16_t q3,
+ int bd) {
+ int8_t mask = 0;
+ int16_t thresh16 = (uint16_t)thresh << (bd - 8);
+ mask |= (abs(p1 - p0) > thresh16) * -1;
+ mask |= (abs(q1 - q0) > thresh16) * -1;
+ mask |= (abs(p2 - p0) > thresh16) * -1;
+ mask |= (abs(q2 - q0) > thresh16) * -1;
+ mask |= (abs(p3 - p0) > thresh16) * -1;
+ mask |= (abs(q3 - q0) > thresh16) * -1;
+ return ~mask;
+}
+
+// Is there high edge variance at the internal edge:
+// 11111111_11111111 yes, 00000000_00000000 no?
+static INLINE int16_t highbd_hev_mask(uint8_t thresh, uint16_t p1, uint16_t p0,
+ uint16_t q0, uint16_t q1, int bd) {
+ int16_t hev = 0;
+ int16_t thresh16 = (uint16_t)thresh << (bd - 8);
+ hev |= (abs(p1 - p0) > thresh16) * -1;
+ hev |= (abs(q1 - q0) > thresh16) * -1;
+ return hev;
+}
+
+static INLINE void highbd_filter4(int8_t mask, uint8_t thresh, uint16_t *op1,
+ uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
+ int bd) {
+ int16_t filter1, filter2;
+ // Subtracting (0x80 << shift) maps the samples into a signed range, the
+ // high-bitdepth analogue of the ^0x80 trick used in the 8-bit filter4().
+ int shift = bd - 8;
+ const int16_t ps1 = (int16_t)*op1 - (0x80 << shift);
+ const int16_t ps0 = (int16_t)*op0 - (0x80 << shift);
+ const int16_t qs0 = (int16_t)*oq0 - (0x80 << shift);
+ const int16_t qs1 = (int16_t)*oq1 - (0x80 << shift);
+ const int16_t hev = highbd_hev_mask(thresh, *op1, *op0, *oq0, *oq1, bd);
+
+ // Add outer taps if we have high edge variance.
+ int16_t filter = signed_char_clamp_high(ps1 - qs1, bd) & hev;
+
+ // Inner taps.
+ filter = signed_char_clamp_high(filter + 3 * (qs0 - ps0), bd) & mask;
+
+ // Save the bottom 3 bits so that we round one side with +4 and the other
+ // with +3; a raw value of magnitude 4 thus adjusts only one of q0/p0, since
+ // the other side would round the opposite way.
+ filter1 = signed_char_clamp_high(filter + 4, bd) >> 3;
+ filter2 = signed_char_clamp_high(filter + 3, bd) >> 3;
+
+ *oq0 = signed_char_clamp_high(qs0 - filter1, bd) + (0x80 << shift);
+ *op0 = signed_char_clamp_high(ps0 + filter2, bd) + (0x80 << shift);
+
+ // Outer tap adjustments.
+ filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev;
+
+ *oq1 = signed_char_clamp_high(qs1 - filter, bd) + (0x80 << shift);
+ *op1 = signed_char_clamp_high(ps1 + filter, bd) + (0x80 << shift);
+}
+
+void aom_highbd_lpf_horizontal_4_c(uint16_t *s, int p /* pitch */,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ int i;
+ int count = 4;
+
+ // Mirrors the structure of the 8-bit loop filter, but operates on 16-bit
+ // samples.
+ for (i = 0; i < count; ++i) {
+ const uint16_t p1 = s[-2 * p];
+ const uint16_t p0 = s[-p];
+ const uint16_t q0 = s[0 * p];
+ const uint16_t q1 = s[1 * p];
+ const int8_t mask =
+ highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);
+ highbd_filter4(mask, *thresh, s - 2 * p, s - 1 * p, s, s + 1 * p, bd);
+ ++s;
+ }
+}
+
+void aom_highbd_lpf_horizontal_4_dual_c(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_horizontal_4_c(s, p, blimit0, limit0, thresh0, bd);
+ aom_highbd_lpf_horizontal_4_c(s + 4, p, blimit1, limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_vertical_4_c(uint16_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ int i;
+ int count = 4;
+
+ // Mirrors the structure of the 8-bit loop filter, but operates on 16-bit
+ // samples.
+ for (i = 0; i < count; ++i) {
+ const uint16_t p1 = s[-2], p0 = s[-1];
+ const uint16_t q0 = s[0], q1 = s[1];
+ const int8_t mask =
+ highbd_filter_mask2(*limit, *blimit, p1, p0, q0, q1, bd);
+ highbd_filter4(mask, *thresh, s - 2, s - 1, s, s + 1, bd);
+ s += pitch;
+ }
+}
+
+void aom_highbd_lpf_vertical_4_dual_c(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_vertical_4_c(s, pitch, blimit0, limit0, thresh0, bd);
+ aom_highbd_lpf_vertical_4_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
+ bd);
+}
+
+static INLINE void highbd_filter6(int8_t mask, uint8_t thresh, int8_t flat,
+ uint16_t *op2, uint16_t *op1, uint16_t *op0,
+ uint16_t *oq0, uint16_t *oq1, uint16_t *oq2,
+ int bd) {
+ if (flat && mask) {
+ const uint16_t p2 = *op2, p1 = *op1, p0 = *op0;
+ const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2;
+
+ // 5-tap filter [1, 2, 2, 2, 1]
+ *op1 = ROUND_POWER_OF_TWO(p2 * 3 + p1 * 2 + p0 * 2 + q0, 3);
+ *op0 = ROUND_POWER_OF_TWO(p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1, 3);
+ *oq0 = ROUND_POWER_OF_TWO(p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2, 3);
+ *oq1 = ROUND_POWER_OF_TWO(p0 + q0 * 2 + q1 * 2 + q2 * 3, 3);
+ } else {
+ highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
+ }
+}
+
+static INLINE void highbd_filter8(int8_t mask, uint8_t thresh, int8_t flat,
+ uint16_t *op3, uint16_t *op2, uint16_t *op1,
+ uint16_t *op0, uint16_t *oq0, uint16_t *oq1,
+ uint16_t *oq2, uint16_t *oq3, int bd) {
+ if (flat && mask) {
+ const uint16_t p3 = *op3, p2 = *op2, p1 = *op1, p0 = *op0;
+ const uint16_t q0 = *oq0, q1 = *oq1, q2 = *oq2, q3 = *oq3;
+
+ // 7-tap filter [1, 1, 1, 2, 1, 1, 1]
+ *op2 = ROUND_POWER_OF_TWO(p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0, 3);
+ *op1 = ROUND_POWER_OF_TWO(p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1, 3);
+ *op0 = ROUND_POWER_OF_TWO(p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2, 3);
+ *oq0 = ROUND_POWER_OF_TWO(p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3, 3);
+ *oq1 = ROUND_POWER_OF_TWO(p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3, 3);
+ *oq2 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3, 3);
+ } else {
+ highbd_filter4(mask, thresh, op1, op0, oq0, oq1, bd);
+ }
+}
+
+void aom_highbd_lpf_horizontal_8_c(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ int i;
+ int count = 4;
+
+ // Mirrors the structure of the 8-bit loop filter, but operates on 16-bit
+ // samples.
+ for (i = 0; i < count; ++i) {
+ const uint16_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+ const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+
+ const int8_t mask =
+ highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat =
+ highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ highbd_filter8(mask, *thresh, flat, s - 4 * p, s - 3 * p, s - 2 * p,
+ s - 1 * p, s, s + 1 * p, s + 2 * p, s + 3 * p, bd);
+ ++s;
+ }
+}
+
+void aom_highbd_lpf_horizontal_6_c(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ int i;
+ int count = 4;
+
+ // Mirrors the structure of the 8-bit loop filter, but operates on 16-bit
+ // samples.
+ for (i = 0; i < count; ++i) {
+ const uint16_t p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+ const uint16_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p];
+
+ const int8_t mask =
+ highbd_filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2, bd);
+ const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd);
+ highbd_filter6(mask, *thresh, flat, s - 3 * p, s - 2 * p, s - 1 * p, s,
+ s + 1 * p, s + 2 * p, bd);
+ ++s;
+ }
+}
+
+void aom_highbd_lpf_horizontal_6_dual_c(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_horizontal_6_c(s, p, blimit0, limit0, thresh0, bd);
+ aom_highbd_lpf_horizontal_6_c(s + 4, p, blimit1, limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_horizontal_8_dual_c(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_horizontal_8_c(s, p, blimit0, limit0, thresh0, bd);
+ aom_highbd_lpf_horizontal_8_c(s + 4, p, blimit1, limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_vertical_6_c(uint16_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ int i;
+ int count = 4;
+
+ for (i = 0; i < count; ++i) {
+ const uint16_t p2 = s[-3], p1 = s[-2], p0 = s[-1];
+ const uint16_t q0 = s[0], q1 = s[1], q2 = s[2];
+ const int8_t mask =
+ highbd_filter_mask3_chroma(*limit, *blimit, p2, p1, p0, q0, q1, q2, bd);
+ const int8_t flat = highbd_flat_mask3_chroma(1, p2, p1, p0, q0, q1, q2, bd);
+ highbd_filter6(mask, *thresh, flat, s - 3, s - 2, s - 1, s, s + 1, s + 2,
+ bd);
+ s += pitch;
+ }
+}
+
+void aom_highbd_lpf_vertical_6_dual_c(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_vertical_6_c(s, pitch, blimit0, limit0, thresh0, bd);
+ aom_highbd_lpf_vertical_6_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
+ bd);
+}
+
+void aom_highbd_lpf_vertical_8_c(uint16_t *s, int pitch, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ int i;
+ int count = 4;
+
+ for (i = 0; i < count; ++i) {
+ const uint16_t p3 = s[-4], p2 = s[-3], p1 = s[-2], p0 = s[-1];
+ const uint16_t q0 = s[0], q1 = s[1], q2 = s[2], q3 = s[3];
+ const int8_t mask =
+ highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat =
+ highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ highbd_filter8(mask, *thresh, flat, s - 4, s - 3, s - 2, s - 1, s, s + 1,
+ s + 2, s + 3, bd);
+ s += pitch;
+ }
+}
+
+void aom_highbd_lpf_vertical_8_dual_c(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_vertical_8_c(s, pitch, blimit0, limit0, thresh0, bd);
+ aom_highbd_lpf_vertical_8_c(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
+ bd);
+}
+
+static INLINE void highbd_filter14(int8_t mask, uint8_t thresh, int8_t flat,
+ int8_t flat2, uint16_t *op6, uint16_t *op5,
+ uint16_t *op4, uint16_t *op3, uint16_t *op2,
+ uint16_t *op1, uint16_t *op0, uint16_t *oq0,
+ uint16_t *oq1, uint16_t *oq2, uint16_t *oq3,
+ uint16_t *oq4, uint16_t *oq5, uint16_t *oq6,
+ int bd) {
+ if (flat2 && flat && mask) {
+ const uint16_t p6 = *op6;
+ const uint16_t p5 = *op5;
+ const uint16_t p4 = *op4;
+ const uint16_t p3 = *op3;
+ const uint16_t p2 = *op2;
+ const uint16_t p1 = *op1;
+ const uint16_t p0 = *op0;
+ const uint16_t q0 = *oq0;
+ const uint16_t q1 = *oq1;
+ const uint16_t q2 = *oq2;
+ const uint16_t q3 = *oq3;
+ const uint16_t q4 = *oq4;
+ const uint16_t q5 = *oq5;
+ const uint16_t q6 = *oq6;
+
+ // 13-tap filter [1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 1, 1, 1]
+ *op5 = ROUND_POWER_OF_TWO(p6 * 7 + p5 * 2 + p4 * 2 + p3 + p2 + p1 + p0 + q0,
+ 4);
+ *op4 = ROUND_POWER_OF_TWO(
+ p6 * 5 + p5 * 2 + p4 * 2 + p3 * 2 + p2 + p1 + p0 + q0 + q1, 4);
+ *op3 = ROUND_POWER_OF_TWO(
+ p6 * 4 + p5 + p4 * 2 + p3 * 2 + p2 * 2 + p1 + p0 + q0 + q1 + q2, 4);
+ *op2 = ROUND_POWER_OF_TWO(
+ p6 * 3 + p5 + p4 + p3 * 2 + p2 * 2 + p1 * 2 + p0 + q0 + q1 + q2 + q3,
+ 4);
+ *op1 = ROUND_POWER_OF_TWO(p6 * 2 + p5 + p4 + p3 + p2 * 2 + p1 * 2 + p0 * 2 +
+ q0 + q1 + q2 + q3 + q4,
+ 4);
+ *op0 = ROUND_POWER_OF_TWO(p6 + p5 + p4 + p3 + p2 + p1 * 2 + p0 * 2 +
+ q0 * 2 + q1 + q2 + q3 + q4 + q5,
+ 4);
+ *oq0 = ROUND_POWER_OF_TWO(p5 + p4 + p3 + p2 + p1 + p0 * 2 + q0 * 2 +
+ q1 * 2 + q2 + q3 + q4 + q5 + q6,
+ 4);
+ *oq1 = ROUND_POWER_OF_TWO(p4 + p3 + p2 + p1 + p0 + q0 * 2 + q1 * 2 +
+ q2 * 2 + q3 + q4 + q5 + q6 * 2,
+ 4);
+ *oq2 = ROUND_POWER_OF_TWO(
+ p3 + p2 + p1 + p0 + q0 + q1 * 2 + q2 * 2 + q3 * 2 + q4 + q5 + q6 * 3,
+ 4);
+ *oq3 = ROUND_POWER_OF_TWO(
+ p2 + p1 + p0 + q0 + q1 + q2 * 2 + q3 * 2 + q4 * 2 + q5 + q6 * 4, 4);
+ *oq4 = ROUND_POWER_OF_TWO(
+ p1 + p0 + q0 + q1 + q2 + q3 * 2 + q4 * 2 + q5 * 2 + q6 * 5, 4);
+ *oq5 = ROUND_POWER_OF_TWO(p0 + q0 + q1 + q2 + q3 + q4 * 2 + q5 * 2 + q6 * 7,
+ 4);
+ } else {
+ highbd_filter8(mask, thresh, flat, op3, op2, op1, op0, oq0, oq1, oq2, oq3,
+ bd);
+ }
+}
+
+static void highbd_mb_lpf_horizontal_edge_w(uint16_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int count,
+ int bd) {
+ int i;
+ int step = 4;
+
+ // Mirrors the structure of the 8-bit loop filter, but operates on 16-bit
+ // samples.
+ for (i = 0; i < step * count; ++i) {
+ const uint16_t p3 = s[-4 * p];
+ const uint16_t p2 = s[-3 * p];
+ const uint16_t p1 = s[-2 * p];
+ const uint16_t p0 = s[-p];
+ const uint16_t q0 = s[0 * p];
+ const uint16_t q1 = s[1 * p];
+ const uint16_t q2 = s[2 * p];
+ const uint16_t q3 = s[3 * p];
+ const int8_t mask =
+ highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat =
+ highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+
+ const int8_t flat2 =
+ highbd_flat_mask4(1, s[-7 * p], s[-6 * p], s[-5 * p], p0, q0, s[4 * p],
+ s[5 * p], s[6 * p], bd);
+
+ highbd_filter14(mask, *thresh, flat, flat2, s - 7 * p, s - 6 * p, s - 5 * p,
+ s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, s, s + 1 * p,
+ s + 2 * p, s + 3 * p, s + 4 * p, s + 5 * p, s + 6 * p, bd);
+ ++s;
+ }
+}
+
+void aom_highbd_lpf_horizontal_14_c(uint16_t *s, int pitch,
+ const uint8_t *blimit, const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ highbd_mb_lpf_horizontal_edge_w(s, pitch, blimit, limit, thresh, 1, bd);
+}
+
+void aom_highbd_lpf_horizontal_14_dual_c(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ highbd_mb_lpf_horizontal_edge_w(s, p, blimit0, limit0, thresh0, 1, bd);
+ highbd_mb_lpf_horizontal_edge_w(s + 4, p, blimit1, limit1, thresh1, 1, bd);
+}
+
+static void highbd_mb_lpf_vertical_edge_w(uint16_t *s, int p,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int count,
+ int bd) {
+ int i;
+
+ for (i = 0; i < count; ++i) {
+ const uint16_t p3 = s[-4];
+ const uint16_t p2 = s[-3];
+ const uint16_t p1 = s[-2];
+ const uint16_t p0 = s[-1];
+ const uint16_t q0 = s[0];
+ const uint16_t q1 = s[1];
+ const uint16_t q2 = s[2];
+ const uint16_t q3 = s[3];
+ const int8_t mask =
+ highbd_filter_mask(*limit, *blimit, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat =
+ highbd_flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3, bd);
+ const int8_t flat2 =
+ highbd_flat_mask4(1, s[-7], s[-6], s[-5], p0, q0, s[4], s[5], s[6], bd);
+
+ highbd_filter14(mask, *thresh, flat, flat2, s - 7, s - 6, s - 5, s - 4,
+ s - 3, s - 2, s - 1, s, s + 1, s + 2, s + 3, s + 4, s + 5,
+ s + 6, bd);
+ s += p;
+ }
+}
+
+void aom_highbd_lpf_vertical_14_c(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ highbd_mb_lpf_vertical_edge_w(s, p, blimit, limit, thresh, 4, bd);
+}
+
+void aom_highbd_lpf_vertical_14_dual_c(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ highbd_mb_lpf_vertical_edge_w(s, pitch, blimit0, limit0, thresh0, 4, bd);
+ highbd_mb_lpf_vertical_edge_w(s + 4 * pitch, pitch, blimit1, limit1, thresh1,
+ 4, bd);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/mathutils.h b/third_party/aom/aom_dsp/mathutils.h
new file mode 100644
index 0000000000..cbb6cf491f
--- /dev/null
+++ b/third_party/aom/aom_dsp/mathutils.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_MATHUTILS_H_
+#define AOM_AOM_DSP_MATHUTILS_H_
+
+#include <assert.h>
+#include <math.h>
+#include <string.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+
+static const double TINY_NEAR_ZERO = 1.0E-16;
+
+// Solves Ax = b, where x and b are column vectors of size nx1 and A is nxn
+static INLINE int linsolve(int n, double *A, int stride, double *b, double *x) {
+ int i, j, k;
+ double c;
+ // Forward elimination
+ for (k = 0; k < n - 1; k++) {
+ // Bring the largest magnitude to the diagonal position
+ for (i = n - 1; i > k; i--) {
+ if (fabs(A[(i - 1) * stride + k]) < fabs(A[i * stride + k])) {
+ for (j = 0; j < n; j++) {
+ c = A[i * stride + j];
+ A[i * stride + j] = A[(i - 1) * stride + j];
+ A[(i - 1) * stride + j] = c;
+ }
+ c = b[i];
+ b[i] = b[i - 1];
+ b[i - 1] = c;
+ }
+ }
+ for (i = k; i < n - 1; i++) {
+ if (fabs(A[k * stride + k]) < TINY_NEAR_ZERO) return 0;
+ c = A[(i + 1) * stride + k] / A[k * stride + k];
+ for (j = 0; j < n; j++) A[(i + 1) * stride + j] -= c * A[k * stride + j];
+ b[i + 1] -= c * b[k];
+ }
+ }
+ // Backward substitution
+ for (i = n - 1; i >= 0; i--) {
+ if (fabs(A[i * stride + i]) < TINY_NEAR_ZERO) return 0;
+ c = 0;
+ for (j = i + 1; j <= n - 1; j++) c += A[i * stride + j] * x[j];
+ x[i] = (b[i] - c) / A[i * stride + i];
+ }
+
+ return 1;
+}
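A minimal usage example, assuming it is built inside the libaom tree so the
include below resolves; note that linsolve() performs the elimination in
place, so A and b are clobbered.

#include <stdio.h>

#include "aom_dsp/mathutils.h"

int main(void) {
  // Solve { 2x + y = 5, x + 3y = 10 }; the solution is x = 1, y = 3.
  double A[4] = { 2.0, 1.0,
                  1.0, 3.0 };
  double b[2] = { 5.0, 10.0 };
  double x[2];
  if (linsolve(2, A, /*stride=*/2, b, x))
    printf("x = %f, y = %f\n", x[0], x[1]);
  return 0;
}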
+
+////////////////////////////////////////////////////////////////////////////////
+// Least-squares
+// Solves for the n-dim vector x that minimizes |Ax - b|^2 in a least-squares
+// sense. The solution is x = (A'A)^-1 A'b, i.e. the solution of the normal
+// equations A'A x = A'b.
+//
+// This process is split into three steps in order to avoid needing to
+// explicitly allocate the A matrix, which may be very large if there
+// are many equations to solve.
+//
+// The process for using this is (in pseudocode):
+//
+// Allocate mat (size n*n), y (size n), a (size n), x (size n)
+// least_squares_init(mat, y, n)
+// for each equation a . x = b {
+// least_squares_accumulate(mat, y, a, b, n)
+// }
+// least_squares_solve(mat, y, x, n)
+//
+// where:
+// * mat, y are accumulators for the values A'A and A'b respectively,
+// * a, b are the coefficients of each individual equation,
+// * x is the result vector
+// * and n is the problem size
+static INLINE void least_squares_init(double *mat, double *y, int n) {
+ memset(mat, 0, n * n * sizeof(double));
+ memset(y, 0, n * sizeof(double));
+}
+
+// Round the given positive value to the nearest integer.
+static AOM_FORCE_INLINE int iroundpf(float x) {
+ assert(x >= 0.0);
+ return (int)(x + 0.5f);
+}
+
+static INLINE void least_squares_accumulate(double *mat, double *y,
+ const double *a, double b, int n) {
+ for (int i = 0; i < n; i++) {
+ for (int j = 0; j < n; j++) {
+ mat[i * n + j] += a[i] * a[j];
+ }
+ }
+ for (int i = 0; i < n; i++) {
+ y[i] += a[i] * b;
+ }
+}
+
+static INLINE int least_squares_solve(double *mat, double *y, double *x,
+ int n) {
+ return linsolve(n, mat, n, y, x);
+}
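Following the pseudocode in the comment above, here is a small, self-contained
example (again assuming the libaom include paths) that fits a line
y = m * x + c to three points; the exact answer is m = 2, c = 1.

#include <stdio.h>

#include "aom_dsp/mathutils.h"

int main(void) {
  const double xs[3] = { 0.0, 1.0, 2.0 };
  const double ys[3] = { 1.0, 3.0, 5.0 };
  double mat[4], y[2], x[2];  // n = 2 unknowns: m and c
  least_squares_init(mat, y, 2);
  for (int i = 0; i < 3; ++i) {
    const double a[2] = { xs[i], 1.0 };  // one equation: m * xs[i] + c = ys[i]
    least_squares_accumulate(mat, y, a, ys[i], 2);
  }
  if (least_squares_solve(mat, y, x, 2))
    printf("m = %f, c = %f\n", x[0], x[1]);
  return 0;
}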
+
+// Matrix multiply
+static INLINE void multiply_mat(const double *m1, const double *m2, double *res,
+ const int m1_rows, const int inner_dim,
+ const int m2_cols) {
+ double sum;
+
+ int row, col, inner;
+ for (row = 0; row < m1_rows; ++row) {
+ for (col = 0; col < m2_cols; ++col) {
+ sum = 0;
+ for (inner = 0; inner < inner_dim; ++inner)
+ sum += m1[row * inner_dim + inner] * m2[inner * m2_cols + col];
+ *(res++) = sum;
+ }
+ }
+}
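Usage note: the matrices are flat, row-major arrays, and res must already hold
m1_rows * m2_cols doubles. A small sketch:

#include "aom_dsp/mathutils.h"

static void multiply_mat_example(void) {
  const double m1[6] = { 1, 2, 3,
                         4, 5, 6 };  // 2x3
  const double m2[6] = { 7, 8,
                         9, 10,
                         11, 12 };   // 3x2
  double res[4];                     // 2x2 result, row-major
  multiply_mat(m1, m2, res, /*m1_rows=*/2, /*inner_dim=*/3, /*m2_cols=*/2);
  // res now holds { 58, 64, 139, 154 }.
}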
+
+static AOM_INLINE float approx_exp(float y) {
+#define A ((1 << 23) / 0.69314718056f) // (1 << 23) / ln(2)
+#define B \
+ 127 // Offset for the exponent according to IEEE floating point standard.
+#define C 60801 // Magic number controls the accuracy of approximation
+ union {
+ float as_float;
+ int32_t as_int32;
+ } container;
+ container.as_int32 = ((int32_t)(y * A)) + ((B << 23) - C);
+ return container.as_float;
+#undef A
+#undef B
+#undef C
+}
+#endif // AOM_AOM_DSP_MATHUTILS_H_
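approx_exp() works by scaling y so that it lands directly in the exponent
field of an IEEE-754 single (A = 2^23 / ln 2), adding the biased exponent
(B << 23), and reinterpreting the integer bits as a float; the constant C
nudges the result to reduce the approximation error. A quick way to gauge the
accuracy is to compare it with expf() for a few inputs (assuming the libaom
include paths):

#include <math.h>
#include <stdio.h>

#include "aom_dsp/mathutils.h"

int main(void) {
  const float inputs[] = { -4.0f, -1.0f, 0.0f, 1.0f, 4.0f };
  for (int i = 0; i < 5; ++i) {
    const float y = inputs[i];
    printf("y = %5.1f  approx_exp = %10.5f  expf = %10.5f\n", y,
           approx_exp(y), expf(y));
  }
  return 0;
}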
diff --git a/third_party/aom/aom_dsp/noise_model.c b/third_party/aom/aom_dsp/noise_model.c
new file mode 100644
index 0000000000..065ec9a106
--- /dev/null
+++ b/third_party/aom/aom_dsp/noise_model.c
@@ -0,0 +1,1692 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/mathutils.h"
+#include "aom_dsp/noise_model.h"
+#include "aom_dsp/noise_util.h"
+#include "aom_mem/aom_mem.h"
+
+#define kLowPolyNumParams 3
+
+static const int kMaxLag = 4;
+
+// Defines a function that can be used to obtain the mean of a block for the
+// provided data type (uint8_t, or uint16_t)
+#define GET_BLOCK_MEAN(INT_TYPE, suffix) \
+ static double get_block_mean_##suffix(const INT_TYPE *data, int w, int h, \
+ int stride, int x_o, int y_o, \
+ int block_size) { \
+ const int max_h = AOMMIN(h - y_o, block_size); \
+ const int max_w = AOMMIN(w - x_o, block_size); \
+ double block_mean = 0; \
+ for (int y = 0; y < max_h; ++y) { \
+ for (int x = 0; x < max_w; ++x) { \
+ block_mean += data[(y_o + y) * stride + x_o + x]; \
+ } \
+ } \
+ return block_mean / (max_w * max_h); \
+ }
+
+GET_BLOCK_MEAN(uint8_t, lowbd)
+GET_BLOCK_MEAN(uint16_t, highbd)
+
+static INLINE double get_block_mean(const uint8_t *data, int w, int h,
+ int stride, int x_o, int y_o,
+ int block_size, int use_highbd) {
+ if (use_highbd)
+ return get_block_mean_highbd((const uint16_t *)data, w, h, stride, x_o, y_o,
+ block_size);
+ return get_block_mean_lowbd(data, w, h, stride, x_o, y_o, block_size);
+}
+
+// Defines a function that can be used to obtain the variance of a block
+// for the provided data type (uint8_t, or uint16_t)
+#define GET_NOISE_VAR(INT_TYPE, suffix) \
+ static double get_noise_var_##suffix( \
+ const INT_TYPE *data, const INT_TYPE *denoised, int stride, int w, \
+ int h, int x_o, int y_o, int block_size_x, int block_size_y) { \
+ const int max_h = AOMMIN(h - y_o, block_size_y); \
+ const int max_w = AOMMIN(w - x_o, block_size_x); \
+ double noise_var = 0; \
+ double noise_mean = 0; \
+ for (int y = 0; y < max_h; ++y) { \
+ for (int x = 0; x < max_w; ++x) { \
+ double noise = (double)data[(y_o + y) * stride + x_o + x] - \
+ denoised[(y_o + y) * stride + x_o + x]; \
+ noise_mean += noise; \
+ noise_var += noise * noise; \
+ } \
+ } \
+ noise_mean /= (max_w * max_h); \
+ return noise_var / (max_w * max_h) - noise_mean * noise_mean; \
+ }
+
+GET_NOISE_VAR(uint8_t, lowbd)
+GET_NOISE_VAR(uint16_t, highbd)
+
+static INLINE double get_noise_var(const uint8_t *data, const uint8_t *denoised,
+ int w, int h, int stride, int x_o, int y_o,
+ int block_size_x, int block_size_y,
+ int use_highbd) {
+ if (use_highbd)
+ return get_noise_var_highbd((const uint16_t *)data,
+ (const uint16_t *)denoised, w, h, stride, x_o,
+ y_o, block_size_x, block_size_y);
+ return get_noise_var_lowbd(data, denoised, w, h, stride, x_o, y_o,
+ block_size_x, block_size_y);
+}
+
+static void equation_system_clear(aom_equation_system_t *eqns) {
+ const int n = eqns->n;
+ memset(eqns->A, 0, sizeof(*eqns->A) * n * n);
+ memset(eqns->x, 0, sizeof(*eqns->x) * n);
+ memset(eqns->b, 0, sizeof(*eqns->b) * n);
+}
+
+static void equation_system_copy(aom_equation_system_t *dst,
+ const aom_equation_system_t *src) {
+ const int n = dst->n;
+ memcpy(dst->A, src->A, sizeof(*dst->A) * n * n);
+ memcpy(dst->x, src->x, sizeof(*dst->x) * n);
+ memcpy(dst->b, src->b, sizeof(*dst->b) * n);
+}
+
+static int equation_system_init(aom_equation_system_t *eqns, int n) {
+ eqns->A = (double *)aom_malloc(sizeof(*eqns->A) * n * n);
+ eqns->b = (double *)aom_malloc(sizeof(*eqns->b) * n);
+ eqns->x = (double *)aom_malloc(sizeof(*eqns->x) * n);
+ eqns->n = n;
+ if (!eqns->A || !eqns->b || !eqns->x) {
+ fprintf(stderr, "Failed to allocate system of equations of size %d\n", n);
+ aom_free(eqns->A);
+ aom_free(eqns->b);
+ aom_free(eqns->x);
+ memset(eqns, 0, sizeof(*eqns));
+ return 0;
+ }
+ equation_system_clear(eqns);
+ return 1;
+}
+
+static int equation_system_solve(aom_equation_system_t *eqns) {
+ const int n = eqns->n;
+ double *b = (double *)aom_malloc(sizeof(*b) * n);
+ double *A = (double *)aom_malloc(sizeof(*A) * n * n);
+ int ret = 0;
+ if (A == NULL || b == NULL) {
+ fprintf(stderr, "Unable to allocate temp values of size %dx%d\n", n, n);
+ aom_free(b);
+ aom_free(A);
+ return 0;
+ }
+ memcpy(A, eqns->A, sizeof(*eqns->A) * n * n);
+ memcpy(b, eqns->b, sizeof(*eqns->b) * n);
+ ret = linsolve(n, A, eqns->n, b, eqns->x);
+ aom_free(b);
+ aom_free(A);
+
+ if (ret == 0) {
+ return 0;
+ }
+ return 1;
+}
+
+static void equation_system_add(aom_equation_system_t *dest,
+ aom_equation_system_t *src) {
+ const int n = dest->n;
+ int i, j;
+ for (i = 0; i < n; ++i) {
+ for (j = 0; j < n; ++j) {
+ dest->A[i * n + j] += src->A[i * n + j];
+ }
+ dest->b[i] += src->b[i];
+ }
+}
+
+static void equation_system_free(aom_equation_system_t *eqns) {
+ if (!eqns) return;
+ aom_free(eqns->A);
+ aom_free(eqns->b);
+ aom_free(eqns->x);
+ memset(eqns, 0, sizeof(*eqns));
+}
+
+static void noise_strength_solver_clear(aom_noise_strength_solver_t *solver) {
+ equation_system_clear(&solver->eqns);
+ solver->num_equations = 0;
+ solver->total = 0;
+}
+
+static void noise_strength_solver_add(aom_noise_strength_solver_t *dest,
+ aom_noise_strength_solver_t *src) {
+ equation_system_add(&dest->eqns, &src->eqns);
+ dest->num_equations += src->num_equations;
+ dest->total += src->total;
+}
+
+// Return the number of coefficients required for the given parameters
+static int num_coeffs(const aom_noise_model_params_t params) {
+ const int n = 2 * params.lag + 1;
+ switch (params.shape) {
+ case AOM_NOISE_SHAPE_DIAMOND: return params.lag * (params.lag + 1);
+ case AOM_NOISE_SHAPE_SQUARE: return (n * n) / 2;
+ }
+ return 0;
+}
+
+static int noise_state_init(aom_noise_state_t *state, int n, int bit_depth) {
+ const int kNumBins = 20;
+ if (!equation_system_init(&state->eqns, n)) {
+ fprintf(stderr, "Failed initialization noise state with size %d\n", n);
+ return 0;
+ }
+ state->ar_gain = 1.0;
+ state->num_observations = 0;
+ return aom_noise_strength_solver_init(&state->strength_solver, kNumBins,
+ bit_depth);
+}
+
+static void set_chroma_coefficient_fallback_soln(aom_equation_system_t *eqns) {
+ const double kTolerance = 1e-6;
+ const int last = eqns->n - 1;
+ // Set all of the AR coefficients to zero, but try to solve for correlation
+ // with the luma channel
+ memset(eqns->x, 0, sizeof(*eqns->x) * eqns->n);
+ if (fabs(eqns->A[last * eqns->n + last]) > kTolerance) {
+ eqns->x[last] = eqns->b[last] / eqns->A[last * eqns->n + last];
+ }
+}
+
+int aom_noise_strength_lut_init(aom_noise_strength_lut_t *lut, int num_points) {
+ if (!lut) return 0;
+ if (num_points <= 0) return 0;
+ lut->num_points = 0;
+ lut->points = (double(*)[2])aom_malloc(num_points * sizeof(*lut->points));
+ if (!lut->points) return 0;
+ lut->num_points = num_points;
+ memset(lut->points, 0, sizeof(*lut->points) * num_points);
+ return 1;
+}
+
+void aom_noise_strength_lut_free(aom_noise_strength_lut_t *lut) {
+ if (!lut) return;
+ aom_free(lut->points);
+ memset(lut, 0, sizeof(*lut));
+}
+
+double aom_noise_strength_lut_eval(const aom_noise_strength_lut_t *lut,
+ double x) {
+ int i = 0;
+ // Constant extrapolation for x < x_0.
+ if (x < lut->points[0][0]) return lut->points[0][1];
+ for (i = 0; i < lut->num_points - 1; ++i) {
+ if (x >= lut->points[i][0] && x <= lut->points[i + 1][0]) {
+ const double a =
+ (x - lut->points[i][0]) / (lut->points[i + 1][0] - lut->points[i][0]);
+ return lut->points[i + 1][1] * a + lut->points[i][1] * (1.0 - a);
+ }
+ }
+ // Constant extrapolation for x > x_{n-1}
+ return lut->points[lut->num_points - 1][1];
+}
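+
+// Worked example (a small sketch): for a two-point lut {{0, 5}, {128, 10}},
+// aom_noise_strength_lut_eval(lut, 64) returns 7.5 (linear interpolation),
+// while inputs below 0 or above 128 return the end-point values 5 and 10.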
+
+static double noise_strength_solver_get_bin_index(
+ const aom_noise_strength_solver_t *solver, double value) {
+ const double val =
+ fclamp(value, solver->min_intensity, solver->max_intensity);
+ const double range = solver->max_intensity - solver->min_intensity;
+ return (solver->num_bins - 1) * (val - solver->min_intensity) / range;
+}
+
+static double noise_strength_solver_get_value(
+ const aom_noise_strength_solver_t *solver, double x) {
+ const double bin = noise_strength_solver_get_bin_index(solver, x);
+ const int bin_i0 = (int)floor(bin);
+ const int bin_i1 = AOMMIN(solver->num_bins - 1, bin_i0 + 1);
+ const double a = bin - bin_i0;
+ return (1.0 - a) * solver->eqns.x[bin_i0] + a * solver->eqns.x[bin_i1];
+}
+
+void aom_noise_strength_solver_add_measurement(
+ aom_noise_strength_solver_t *solver, double block_mean, double noise_std) {
+ const double bin = noise_strength_solver_get_bin_index(solver, block_mean);
+ const int bin_i0 = (int)floor(bin);
+ const int bin_i1 = AOMMIN(solver->num_bins - 1, bin_i0 + 1);
+ const double a = bin - bin_i0;
+ const int n = solver->num_bins;
+ solver->eqns.A[bin_i0 * n + bin_i0] += (1.0 - a) * (1.0 - a);
+ solver->eqns.A[bin_i1 * n + bin_i0] += a * (1.0 - a);
+ solver->eqns.A[bin_i1 * n + bin_i1] += a * a;
+ solver->eqns.A[bin_i0 * n + bin_i1] += a * (1.0 - a);
+ solver->eqns.b[bin_i0] += (1.0 - a) * noise_std;
+ solver->eqns.b[bin_i1] += a * noise_std;
+ solver->total += noise_std;
+ solver->num_equations++;
+}
+
+int aom_noise_strength_solver_solve(aom_noise_strength_solver_t *solver) {
+ // Add regularization proportional to the number of constraints
+ const int n = solver->num_bins;
+ const double kAlpha = 2.0 * (double)(solver->num_equations) / n;
+ int result = 0;
+ double mean = 0;
+
+ // Do this in a non-destructive manner so it is not confusing to the caller
+ double *old_A = solver->eqns.A;
+ double *A = (double *)aom_malloc(sizeof(*A) * n * n);
+ if (!A) {
+ fprintf(stderr, "Unable to allocate copy of A\n");
+ return 0;
+ }
+ memcpy(A, old_A, sizeof(*A) * n * n);
+
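+  // The update below adds kAlpha times a second-difference (path-graph
+  // Laplacian) matrix to A. In the normal equations this acts as a smoothness
+  // penalty of roughly kAlpha * sum_i (x[i + 1] - x[i])^2, discouraging large
+  // jumps between neighboring bins of the fitted strength curve.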
+ for (int i = 0; i < n; ++i) {
+ const int i_lo = AOMMAX(0, i - 1);
+ const int i_hi = AOMMIN(n - 1, i + 1);
+ A[i * n + i_lo] -= kAlpha;
+ A[i * n + i] += 2 * kAlpha;
+ A[i * n + i_hi] -= kAlpha;
+ }
+
+  // Small regularization to bias each bin towards the average noise strength
+ mean = solver->total / solver->num_equations;
+ for (int i = 0; i < n; ++i) {
+ A[i * n + i] += 1.0 / 8192.;
+ solver->eqns.b[i] += mean / 8192.;
+ }
+ solver->eqns.A = A;
+ result = equation_system_solve(&solver->eqns);
+ solver->eqns.A = old_A;
+
+ aom_free(A);
+ return result;
+}
+
+int aom_noise_strength_solver_init(aom_noise_strength_solver_t *solver,
+ int num_bins, int bit_depth) {
+ if (!solver) return 0;
+ memset(solver, 0, sizeof(*solver));
+ solver->num_bins = num_bins;
+ solver->min_intensity = 0;
+ solver->max_intensity = (1 << bit_depth) - 1;
+ solver->total = 0;
+ solver->num_equations = 0;
+ return equation_system_init(&solver->eqns, num_bins);
+}
+
+void aom_noise_strength_solver_free(aom_noise_strength_solver_t *solver) {
+ if (!solver) return;
+ equation_system_free(&solver->eqns);
+}
+
+double aom_noise_strength_solver_get_center(
+ const aom_noise_strength_solver_t *solver, int i) {
+ const double range = solver->max_intensity - solver->min_intensity;
+ const int n = solver->num_bins;
+ return ((double)i) / (n - 1) * range + solver->min_intensity;
+}
+
+// Computes the residual if a point were to be removed from the lut. This is
+// calculated as the area between the output of the solver and the line segment
+// that would be formed between [x_{i - 1}, x_{i + 1}).
+static void update_piecewise_linear_residual(
+ const aom_noise_strength_solver_t *solver,
+ const aom_noise_strength_lut_t *lut, double *residual, int start, int end) {
+ const double dx = 255. / solver->num_bins;
+ for (int i = AOMMAX(start, 1); i < AOMMIN(end, lut->num_points - 1); ++i) {
+ const int lower = AOMMAX(0, (int)floor(noise_strength_solver_get_bin_index(
+ solver, lut->points[i - 1][0])));
+ const int upper = AOMMIN(solver->num_bins - 1,
+ (int)ceil(noise_strength_solver_get_bin_index(
+ solver, lut->points[i + 1][0])));
+ double r = 0;
+ for (int j = lower; j <= upper; ++j) {
+ const double x = aom_noise_strength_solver_get_center(solver, j);
+ if (x < lut->points[i - 1][0]) continue;
+ if (x >= lut->points[i + 1][0]) continue;
+ const double y = solver->eqns.x[j];
+ const double a = (x - lut->points[i - 1][0]) /
+ (lut->points[i + 1][0] - lut->points[i - 1][0]);
+ const double estimate_y =
+ lut->points[i - 1][1] * (1.0 - a) + lut->points[i + 1][1] * a;
+ r += fabs(y - estimate_y);
+ }
+ residual[i] = r * dx;
+ }
+}
+
+int aom_noise_strength_solver_fit_piecewise(
+ const aom_noise_strength_solver_t *solver, int max_output_points,
+ aom_noise_strength_lut_t *lut) {
+  // The tolerance is normalized to give consistent results between
+  // different bit-depths.
+ const double kTolerance = solver->max_intensity * 0.00625 / 255.0;
+ if (!aom_noise_strength_lut_init(lut, solver->num_bins)) {
+ fprintf(stderr, "Failed to init lut\n");
+ return 0;
+ }
+ for (int i = 0; i < solver->num_bins; ++i) {
+ lut->points[i][0] = aom_noise_strength_solver_get_center(solver, i);
+ lut->points[i][1] = solver->eqns.x[i];
+ }
+ if (max_output_points < 0) {
+ max_output_points = solver->num_bins;
+ }
+
+ double *residual = (double *)aom_malloc(solver->num_bins * sizeof(*residual));
+ if (!residual) {
+ aom_noise_strength_lut_free(lut);
+ return 0;
+ }
+ memset(residual, 0, sizeof(*residual) * solver->num_bins);
+
+ update_piecewise_linear_residual(solver, lut, residual, 0, solver->num_bins);
+
+ // Greedily remove points if there are too many or if it doesn't hurt local
+ // approximation (never remove the end points)
+ while (lut->num_points > 2) {
+ int min_index = 1;
+ for (int j = 1; j < lut->num_points - 1; ++j) {
+ if (residual[j] < residual[min_index]) {
+ min_index = j;
+ }
+ }
+ const double dx =
+ lut->points[min_index + 1][0] - lut->points[min_index - 1][0];
+ const double avg_residual = residual[min_index] / dx;
+ if (lut->num_points <= max_output_points && avg_residual > kTolerance) {
+ break;
+ }
+
+ const int num_remaining = lut->num_points - min_index - 1;
+ memmove(lut->points + min_index, lut->points + min_index + 1,
+ sizeof(lut->points[0]) * num_remaining);
+ lut->num_points--;
+
+ update_piecewise_linear_residual(solver, lut, residual, min_index - 1,
+ min_index + 1);
+ }
+ aom_free(residual);
+ return 1;
+}
+
+int aom_flat_block_finder_init(aom_flat_block_finder_t *block_finder,
+ int block_size, int bit_depth, int use_highbd) {
+ const int n = block_size * block_size;
+ aom_equation_system_t eqns;
+ double *AtA_inv = 0;
+ double *A = 0;
+ int x = 0, y = 0, i = 0, j = 0;
+ block_finder->A = NULL;
+ block_finder->AtA_inv = NULL;
+
+ if (!equation_system_init(&eqns, kLowPolyNumParams)) {
+ fprintf(stderr, "Failed to init equation system for block_size=%d\n",
+ block_size);
+ return 0;
+ }
+
+ AtA_inv = (double *)aom_malloc(kLowPolyNumParams * kLowPolyNumParams *
+ sizeof(*AtA_inv));
+ A = (double *)aom_malloc(kLowPolyNumParams * n * sizeof(*A));
+ if (AtA_inv == NULL || A == NULL) {
+ fprintf(stderr, "Failed to alloc A or AtA_inv for block_size=%d\n",
+ block_size);
+ aom_free(AtA_inv);
+ aom_free(A);
+ equation_system_free(&eqns);
+ return 0;
+ }
+
+ block_finder->A = A;
+ block_finder->AtA_inv = AtA_inv;
+ block_finder->block_size = block_size;
+ block_finder->normalization = (1 << bit_depth) - 1;
+ block_finder->use_highbd = use_highbd;
+
+ for (y = 0; y < block_size; ++y) {
+ const double yd = ((double)y - block_size / 2.) / (block_size / 2.);
+ for (x = 0; x < block_size; ++x) {
+ const double xd = ((double)x - block_size / 2.) / (block_size / 2.);
+ const double coords[3] = { yd, xd, 1 };
+ const int row = y * block_size + x;
+ A[kLowPolyNumParams * row + 0] = yd;
+ A[kLowPolyNumParams * row + 1] = xd;
+ A[kLowPolyNumParams * row + 2] = 1;
+
+ for (i = 0; i < kLowPolyNumParams; ++i) {
+ for (j = 0; j < kLowPolyNumParams; ++j) {
+ eqns.A[kLowPolyNumParams * i + j] += coords[i] * coords[j];
+ }
+ }
+ }
+ }
+
+ // Lazy inverse using existing equation solver.
+ for (i = 0; i < kLowPolyNumParams; ++i) {
+ memset(eqns.b, 0, sizeof(*eqns.b) * kLowPolyNumParams);
+ eqns.b[i] = 1;
+ equation_system_solve(&eqns);
+
+ for (j = 0; j < kLowPolyNumParams; ++j) {
+ AtA_inv[j * kLowPolyNumParams + i] = eqns.x[j];
+ }
+ }
+ equation_system_free(&eqns);
+ return 1;
+}
+
+void aom_flat_block_finder_free(aom_flat_block_finder_t *block_finder) {
+ if (!block_finder) return;
+ aom_free(block_finder->A);
+ aom_free(block_finder->AtA_inv);
+ memset(block_finder, 0, sizeof(*block_finder));
+}
+
+void aom_flat_block_finder_extract_block(
+ const aom_flat_block_finder_t *block_finder, const uint8_t *const data,
+ int w, int h, int stride, int offsx, int offsy, double *plane,
+ double *block) {
+ const int block_size = block_finder->block_size;
+ const int n = block_size * block_size;
+ const double *A = block_finder->A;
+ const double *AtA_inv = block_finder->AtA_inv;
+ double plane_coords[kLowPolyNumParams];
+ double AtA_inv_b[kLowPolyNumParams];
+ int xi, yi, i;
+
+ if (block_finder->use_highbd) {
+ const uint16_t *const data16 = (const uint16_t *const)data;
+ for (yi = 0; yi < block_size; ++yi) {
+ const int y = clamp(offsy + yi, 0, h - 1);
+ for (xi = 0; xi < block_size; ++xi) {
+ const int x = clamp(offsx + xi, 0, w - 1);
+ block[yi * block_size + xi] =
+ ((double)data16[y * stride + x]) / block_finder->normalization;
+ }
+ }
+ } else {
+ for (yi = 0; yi < block_size; ++yi) {
+ const int y = clamp(offsy + yi, 0, h - 1);
+ for (xi = 0; xi < block_size; ++xi) {
+ const int x = clamp(offsx + xi, 0, w - 1);
+ block[yi * block_size + xi] =
+ ((double)data[y * stride + x]) / block_finder->normalization;
+ }
+ }
+ }
+ multiply_mat(block, A, AtA_inv_b, 1, n, kLowPolyNumParams);
+ multiply_mat(AtA_inv, AtA_inv_b, plane_coords, kLowPolyNumParams,
+ kLowPolyNumParams, 1);
+ multiply_mat(A, plane_coords, plane, n, kLowPolyNumParams, 1);
+
+ for (i = 0; i < n; ++i) {
+ block[i] -= plane[i];
+ }
+}
+
+typedef struct {
+ int index;
+ float score;
+} index_and_score_t;
+
+static int compare_scores(const void *a, const void *b) {
+ const float diff =
+ ((index_and_score_t *)a)->score - ((index_and_score_t *)b)->score;
+ if (diff < 0)
+ return -1;
+ else if (diff > 0)
+ return 1;
+ return 0;
+}
+
+int aom_flat_block_finder_run(const aom_flat_block_finder_t *block_finder,
+ const uint8_t *const data, int w, int h,
+ int stride, uint8_t *flat_blocks) {
+ // The gradient-based features used in this code are based on:
+ // A. Kokaram, D. Kelly, H. Denman and A. Crawford, "Measuring noise
+  //  correlation for improved video denoising," 2012 19th IEEE International
+  //  Conference on Image Processing (ICIP).
+  // The thresholds are more lenient to allow for correct grain modeling
+  // in extreme cases.
+ const int block_size = block_finder->block_size;
+ const int n = block_size * block_size;
+ const double kTraceThreshold = 0.15 / (32 * 32);
+ const double kRatioThreshold = 1.25;
+ const double kNormThreshold = 0.08 / (32 * 32);
+ const double kVarThreshold = 0.005 / (double)n;
+ const int num_blocks_w = (w + block_size - 1) / block_size;
+ const int num_blocks_h = (h + block_size - 1) / block_size;
+ int num_flat = 0;
+ double *plane = (double *)aom_malloc(n * sizeof(*plane));
+ double *block = (double *)aom_malloc(n * sizeof(*block));
+ index_and_score_t *scores = (index_and_score_t *)aom_malloc(
+ num_blocks_w * num_blocks_h * sizeof(*scores));
+ if (plane == NULL || block == NULL || scores == NULL) {
+ fprintf(stderr, "Failed to allocate memory for block of size %d\n", n);
+ aom_free(plane);
+ aom_free(block);
+ aom_free(scores);
+ return -1;
+ }
+
+#ifdef NOISE_MODEL_LOG_SCORE
+ fprintf(stderr, "score = [");
+#endif
+ for (int by = 0; by < num_blocks_h; ++by) {
+ for (int bx = 0; bx < num_blocks_w; ++bx) {
+ // Compute gradient covariance matrix.
+ aom_flat_block_finder_extract_block(block_finder, data, w, h, stride,
+ bx * block_size, by * block_size,
+ plane, block);
+ double Gxx = 0, Gxy = 0, Gyy = 0;
+ double mean = 0;
+ double var = 0;
+
+ for (int yi = 1; yi < block_size - 1; ++yi) {
+ for (int xi = 1; xi < block_size - 1; ++xi) {
+ const double gx = (block[yi * block_size + xi + 1] -
+ block[yi * block_size + xi - 1]) /
+ 2;
+ const double gy = (block[yi * block_size + xi + block_size] -
+ block[yi * block_size + xi - block_size]) /
+ 2;
+ Gxx += gx * gx;
+ Gxy += gx * gy;
+ Gyy += gy * gy;
+
+ const double value = block[yi * block_size + xi];
+ mean += value;
+ var += value * value;
+ }
+ }
+ mean /= (block_size - 2) * (block_size - 2);
+
+ // Normalize gradients by block_size.
+ Gxx /= ((block_size - 2) * (block_size - 2));
+ Gxy /= ((block_size - 2) * (block_size - 2));
+ Gyy /= ((block_size - 2) * (block_size - 2));
+ var = var / ((block_size - 2) * (block_size - 2)) - mean * mean;
+
+ {
+ const double trace = Gxx + Gyy;
+ const double det = Gxx * Gyy - Gxy * Gxy;
+ const double e1 = (trace + sqrt(trace * trace - 4 * det)) / 2.;
+ const double e2 = (trace - sqrt(trace * trace - 4 * det)) / 2.;
+ const double norm = e1; // Spectral norm
+ const double ratio = (e1 / AOMMAX(e2, 1e-6));
+ const int is_flat = (trace < kTraceThreshold) &&
+ (ratio < kRatioThreshold) &&
+ (norm < kNormThreshold) && (var > kVarThreshold);
+ // The following weights are used to combine the above features to give
+ // a sigmoid score for flatness. If the input was normalized to [0,100]
+ // the magnitude of these values would be close to 1 (e.g., weights
+ // corresponding to variance would be a factor of 10000x smaller).
+ // The weights are given in the following order:
+ // [{var}, {ratio}, {trace}, {norm}, offset]
+ // with one of the most discriminative being simply the variance.
+ const double weights[5] = { -6682, -0.2056, 13087, -12434, 2.5694 };
+ double sum_weights = weights[0] * var + weights[1] * ratio +
+ weights[2] * trace + weights[3] * norm +
+ weights[4];
+ // clamp the value to [-25.0, 100.0] to prevent overflow
+ sum_weights = fclamp(sum_weights, -25.0, 100.0);
+ const float score = (float)(1.0 / (1 + exp(-sum_weights)));
+ flat_blocks[by * num_blocks_w + bx] = is_flat ? 255 : 0;
+ scores[by * num_blocks_w + bx].score = var > kVarThreshold ? score : 0;
+ scores[by * num_blocks_w + bx].index = by * num_blocks_w + bx;
+#ifdef NOISE_MODEL_LOG_SCORE
+ fprintf(stderr, "%g %g %g %g %g %d ", score, var, ratio, trace, norm,
+ is_flat);
+#endif
+ num_flat += is_flat;
+ }
+ }
+#ifdef NOISE_MODEL_LOG_SCORE
+ fprintf(stderr, "\n");
+#endif
+ }
+#ifdef NOISE_MODEL_LOG_SCORE
+ fprintf(stderr, "];\n");
+#endif
+  // Find the top-scored blocks (most likely to be flat) and set the flat
+  // blocks to be the union of the thresholded results and the top 10th
+  // percentile of the scored results.
+ qsort(scores, num_blocks_w * num_blocks_h, sizeof(*scores), &compare_scores);
+ const int top_nth_percentile = num_blocks_w * num_blocks_h * 90 / 100;
+ const float score_threshold = scores[top_nth_percentile].score;
+ for (int i = 0; i < num_blocks_w * num_blocks_h; ++i) {
+ if (scores[i].score >= score_threshold) {
+ num_flat += flat_blocks[scores[i].index] == 0;
+ flat_blocks[scores[i].index] |= 1;
+ }
+ }
+ aom_free(block);
+ aom_free(plane);
+ aom_free(scores);
+ return num_flat;
+}
+
+int aom_noise_model_init(aom_noise_model_t *model,
+ const aom_noise_model_params_t params) {
+ const int n = num_coeffs(params);
+ const int lag = params.lag;
+ const int bit_depth = params.bit_depth;
+ int x = 0, y = 0, i = 0, c = 0;
+
+ memset(model, 0, sizeof(*model));
+ if (params.lag < 1) {
+ fprintf(stderr, "Invalid noise param: lag = %d must be >= 1\n", params.lag);
+ return 0;
+ }
+ if (params.lag > kMaxLag) {
+ fprintf(stderr, "Invalid noise param: lag = %d must be <= %d\n", params.lag,
+ kMaxLag);
+ return 0;
+ }
+ if (!(params.bit_depth == 8 || params.bit_depth == 10 ||
+ params.bit_depth == 12)) {
+ return 0;
+ }
+
+ memcpy(&model->params, &params, sizeof(params));
+ for (c = 0; c < 3; ++c) {
+ if (!noise_state_init(&model->combined_state[c], n + (c > 0), bit_depth)) {
+ fprintf(stderr, "Failed to allocate noise state for channel %d\n", c);
+ aom_noise_model_free(model);
+ return 0;
+ }
+ if (!noise_state_init(&model->latest_state[c], n + (c > 0), bit_depth)) {
+ fprintf(stderr, "Failed to allocate noise state for channel %d\n", c);
+ aom_noise_model_free(model);
+ return 0;
+ }
+ }
+ model->n = n;
+ model->coords = (int(*)[2])aom_malloc(sizeof(*model->coords) * n);
+ if (!model->coords) {
+ aom_noise_model_free(model);
+ return 0;
+ }
+
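+  // As a concrete example, lag == 1 produces the causal (x, y) neighborhood
+  // { (-1, -1), (0, -1), (1, -1), (-1, 0) } for AOM_NOISE_SHAPE_SQUARE
+  // (n == 4) and { (0, -1), (-1, 0) } for AOM_NOISE_SHAPE_DIAMOND (n == 2).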
+ for (y = -lag; y <= 0; ++y) {
+ const int max_x = y == 0 ? -1 : lag;
+ for (x = -lag; x <= max_x; ++x) {
+ switch (params.shape) {
+ case AOM_NOISE_SHAPE_DIAMOND:
+ if (abs(x) <= y + lag) {
+ model->coords[i][0] = x;
+ model->coords[i][1] = y;
+ ++i;
+ }
+ break;
+ case AOM_NOISE_SHAPE_SQUARE:
+ model->coords[i][0] = x;
+ model->coords[i][1] = y;
+ ++i;
+ break;
+ default:
+ fprintf(stderr, "Invalid shape\n");
+ aom_noise_model_free(model);
+ return 0;
+ }
+ }
+ }
+ assert(i == n);
+ return 1;
+}
+
+void aom_noise_model_free(aom_noise_model_t *model) {
+ int c = 0;
+ if (!model) return;
+
+ aom_free(model->coords);
+ for (c = 0; c < 3; ++c) {
+ equation_system_free(&model->latest_state[c].eqns);
+ equation_system_free(&model->combined_state[c].eqns);
+
+ equation_system_free(&model->latest_state[c].strength_solver.eqns);
+ equation_system_free(&model->combined_state[c].strength_solver.eqns);
+ }
+ memset(model, 0, sizeof(*model));
+}
+
+// Extracts the neighborhood defined by coords around point (x, y) from
+// the difference between the data and denoised images. Also extracts the
+// entry (possibly downsampled) for (x, y) in the alt_data (e.g., luma).
+#define EXTRACT_AR_ROW(INT_TYPE, suffix) \
+ static double extract_ar_row_##suffix( \
+ int(*coords)[2], int num_coords, const INT_TYPE *const data, \
+ const INT_TYPE *const denoised, int stride, int sub_log2[2], \
+ const INT_TYPE *const alt_data, const INT_TYPE *const alt_denoised, \
+ int alt_stride, int x, int y, double *buffer) { \
+ for (int i = 0; i < num_coords; ++i) { \
+ const int x_i = x + coords[i][0], y_i = y + coords[i][1]; \
+ buffer[i] = \
+ (double)data[y_i * stride + x_i] - denoised[y_i * stride + x_i]; \
+ } \
+ const double val = \
+ (double)data[y * stride + x] - denoised[y * stride + x]; \
+ \
+ if (alt_data && alt_denoised) { \
+ double avg_data = 0, avg_denoised = 0; \
+ int num_samples = 0; \
+ for (int dy_i = 0; dy_i < (1 << sub_log2[1]); dy_i++) { \
+ const int y_up = (y << sub_log2[1]) + dy_i; \
+ for (int dx_i = 0; dx_i < (1 << sub_log2[0]); dx_i++) { \
+ const int x_up = (x << sub_log2[0]) + dx_i; \
+ avg_data += alt_data[y_up * alt_stride + x_up]; \
+ avg_denoised += alt_denoised[y_up * alt_stride + x_up]; \
+ num_samples++; \
+ } \
+ } \
+ buffer[num_coords] = (avg_data - avg_denoised) / num_samples; \
+ } \
+ return val; \
+ }
+
+EXTRACT_AR_ROW(uint8_t, lowbd)
+EXTRACT_AR_ROW(uint16_t, highbd)
+
+static int add_block_observations(
+ aom_noise_model_t *noise_model, int c, const uint8_t *const data,
+ const uint8_t *const denoised, int w, int h, int stride, int sub_log2[2],
+ const uint8_t *const alt_data, const uint8_t *const alt_denoised,
+ int alt_stride, const uint8_t *const flat_blocks, int block_size,
+ int num_blocks_w, int num_blocks_h) {
+ const int lag = noise_model->params.lag;
+ const int num_coords = noise_model->n;
+ const double normalization = (1 << noise_model->params.bit_depth) - 1;
+ double *A = noise_model->latest_state[c].eqns.A;
+ double *b = noise_model->latest_state[c].eqns.b;
+ double *buffer = (double *)aom_malloc(sizeof(*buffer) * (num_coords + 1));
+ const int n = noise_model->latest_state[c].eqns.n;
+
+ if (!buffer) {
+ fprintf(stderr, "Unable to allocate buffer of size %d\n", num_coords + 1);
+ return 0;
+ }
+ for (int by = 0; by < num_blocks_h; ++by) {
+ const int y_o = by * (block_size >> sub_log2[1]);
+ for (int bx = 0; bx < num_blocks_w; ++bx) {
+ const int x_o = bx * (block_size >> sub_log2[0]);
+ if (!flat_blocks[by * num_blocks_w + bx]) {
+ continue;
+ }
+ int y_start =
+ (by > 0 && flat_blocks[(by - 1) * num_blocks_w + bx]) ? 0 : lag;
+ int x_start =
+ (bx > 0 && flat_blocks[by * num_blocks_w + bx - 1]) ? 0 : lag;
+ int y_end = AOMMIN((h >> sub_log2[1]) - by * (block_size >> sub_log2[1]),
+ block_size >> sub_log2[1]);
+ int x_end = AOMMIN(
+ (w >> sub_log2[0]) - bx * (block_size >> sub_log2[0]) - lag,
+ (bx + 1 < num_blocks_w && flat_blocks[by * num_blocks_w + bx + 1])
+ ? (block_size >> sub_log2[0])
+ : ((block_size >> sub_log2[0]) - lag));
+ for (int y = y_start; y < y_end; ++y) {
+ for (int x = x_start; x < x_end; ++x) {
+ const double val =
+ noise_model->params.use_highbd
+ ? extract_ar_row_highbd(noise_model->coords, num_coords,
+ (const uint16_t *const)data,
+ (const uint16_t *const)denoised,
+ stride, sub_log2,
+ (const uint16_t *const)alt_data,
+ (const uint16_t *const)alt_denoised,
+ alt_stride, x + x_o, y + y_o, buffer)
+ : extract_ar_row_lowbd(noise_model->coords, num_coords, data,
+ denoised, stride, sub_log2, alt_data,
+ alt_denoised, alt_stride, x + x_o,
+ y + y_o, buffer);
+ for (int i = 0; i < n; ++i) {
+ for (int j = 0; j < n; ++j) {
+ A[i * n + j] +=
+ (buffer[i] * buffer[j]) / (normalization * normalization);
+ }
+ b[i] += (buffer[i] * val) / (normalization * normalization);
+ }
+ noise_model->latest_state[c].num_observations++;
+ }
+ }
+ }
+ }
+ aom_free(buffer);
+ return 1;
+}
+
+static void add_noise_std_observations(
+ aom_noise_model_t *noise_model, int c, const double *coeffs,
+ const uint8_t *const data, const uint8_t *const denoised, int w, int h,
+ int stride, int sub_log2[2], const uint8_t *const alt_data, int alt_stride,
+ const uint8_t *const flat_blocks, int block_size, int num_blocks_w,
+ int num_blocks_h) {
+ const int num_coords = noise_model->n;
+ aom_noise_strength_solver_t *noise_strength_solver =
+ &noise_model->latest_state[c].strength_solver;
+
+ const aom_noise_strength_solver_t *noise_strength_luma =
+ &noise_model->latest_state[0].strength_solver;
+ const double luma_gain = noise_model->latest_state[0].ar_gain;
+ const double noise_gain = noise_model->latest_state[c].ar_gain;
+ for (int by = 0; by < num_blocks_h; ++by) {
+ const int y_o = by * (block_size >> sub_log2[1]);
+ for (int bx = 0; bx < num_blocks_w; ++bx) {
+ const int x_o = bx * (block_size >> sub_log2[0]);
+ if (!flat_blocks[by * num_blocks_w + bx]) {
+ continue;
+ }
+ const int num_samples_h =
+ AOMMIN((h >> sub_log2[1]) - by * (block_size >> sub_log2[1]),
+ block_size >> sub_log2[1]);
+ const int num_samples_w =
+ AOMMIN((w >> sub_log2[0]) - bx * (block_size >> sub_log2[0]),
+ (block_size >> sub_log2[0]));
+      // Make sure that we have a reasonable number of samples to consider the
+      // block.
+ if (num_samples_w * num_samples_h > block_size) {
+ const double block_mean = get_block_mean(
+ alt_data ? alt_data : data, w, h, alt_data ? alt_stride : stride,
+ x_o << sub_log2[0], y_o << sub_log2[1], block_size,
+ noise_model->params.use_highbd);
+ const double noise_var = get_noise_var(
+ data, denoised, stride, w >> sub_log2[0], h >> sub_log2[1], x_o,
+ y_o, block_size >> sub_log2[0], block_size >> sub_log2[1],
+ noise_model->params.use_highbd);
+ // We want to remove the part of the noise that came from being
+ // correlated with luma. Note that the noise solver for luma must
+ // have already been run.
+ const double luma_strength =
+ c > 0 ? luma_gain * noise_strength_solver_get_value(
+ noise_strength_luma, block_mean)
+ : 0;
+ const double corr = c > 0 ? coeffs[num_coords] : 0;
+ // Chroma noise:
+ // N(0, noise_var) = N(0, uncorr_var) + corr * N(0, luma_strength^2)
+ // The uncorrelated component:
+ // uncorr_var = noise_var - (corr * luma_strength)^2
+ // But don't allow fully correlated noise (hence the max), since the
+ // synthesis cannot model it.
+ const double uncorr_std = sqrt(
+ AOMMAX(noise_var / 16, noise_var - pow(corr * luma_strength, 2)));
+ // After we've removed correlation with luma, undo the gain that will
+ // come from running the IIR filter.
+ const double adjusted_strength = uncorr_std / noise_gain;
+ aom_noise_strength_solver_add_measurement(
+ noise_strength_solver, block_mean, adjusted_strength);
+ }
+ }
+ }
+}
+
+// Return true if the noise estimate appears to be different from the combined
+// (multi-frame) estimate. The difference is measured by checking whether the
+// AR coefficients have diverged (using a threshold on normalized cross
+// correlation), or whether the noise strength has changed.
+static int is_noise_model_different(aom_noise_model_t *const noise_model) {
+ // These thresholds are kind of arbitrary and will likely need further tuning
+  // (or to be exported as parameters). The threshold on noise strength is a
+  // weighted difference between the noise strength histograms.
+ const double kCoeffThreshold = 0.9;
+ const double kStrengthThreshold =
+ 0.005 * (1 << (noise_model->params.bit_depth - 8));
+ for (int c = 0; c < 1; ++c) {
+ const double corr =
+ aom_normalized_cross_correlation(noise_model->latest_state[c].eqns.x,
+ noise_model->combined_state[c].eqns.x,
+ noise_model->combined_state[c].eqns.n);
+ if (corr < kCoeffThreshold) return 1;
+
+ const double dx =
+ 1.0 / noise_model->latest_state[c].strength_solver.num_bins;
+
+ const aom_equation_system_t *latest_eqns =
+ &noise_model->latest_state[c].strength_solver.eqns;
+ const aom_equation_system_t *combined_eqns =
+ &noise_model->combined_state[c].strength_solver.eqns;
+ double diff = 0;
+ double total_weight = 0;
+ for (int j = 0; j < latest_eqns->n; ++j) {
+ double weight = 0;
+ for (int i = 0; i < latest_eqns->n; ++i) {
+ weight += latest_eqns->A[i * latest_eqns->n + j];
+ }
+ weight = sqrt(weight);
+ diff += weight * fabs(latest_eqns->x[j] - combined_eqns->x[j]);
+ total_weight += weight;
+ }
+ if (diff * dx / total_weight > kStrengthThreshold) return 1;
+ }
+ return 0;
+}
+
+static int ar_equation_system_solve(aom_noise_state_t *state, int is_chroma) {
+ const int ret = equation_system_solve(&state->eqns);
+ state->ar_gain = 1.0;
+ if (!ret) return ret;
+
+ // Update the AR gain from the equation system as it will be used to fit
+ // the noise strength as a function of intensity. In the Yule-Walker
+ // equations, the diagonal should be the variance of the correlated noise.
+ // In the case of the least squares estimate, there will be some variability
+ // in the diagonal. So use the mean of the diagonal as the estimate of
+ // overall variance (this works for least squares or Yule-Walker formulation).
+ double var = 0;
+ const int n = state->eqns.n;
+ for (int i = 0; i < (state->eqns.n - is_chroma); ++i) {
+ var += state->eqns.A[i * n + i] / state->num_observations;
+ }
+ var /= (n - is_chroma);
+
+ // Keep track of E(Y^2) = <b, x> + E(X^2)
+ // In the case that we are using chroma and have an estimate of correlation
+ // with luma we adjust that estimate slightly to remove the correlated bits by
+  // subtracting out the last column of A scaled by our correlation estimate
+ // from b. E(y^2) = <b - A(:, end)*x(end), x>
+ double sum_covar = 0;
+ for (int i = 0; i < state->eqns.n - is_chroma; ++i) {
+ double bi = state->eqns.b[i];
+ if (is_chroma) {
+ bi -= state->eqns.A[i * n + (n - 1)] * state->eqns.x[n - 1];
+ }
+ sum_covar += (bi * state->eqns.x[i]) / state->num_observations;
+ }
+ // Now, get an estimate of the variance of uncorrelated noise signal and use
+ // it to determine the gain of the AR filter.
+ const double noise_var = AOMMAX(var - sum_covar, 1e-6);
+ state->ar_gain = AOMMAX(1, sqrt(AOMMAX(var / noise_var, 1e-6)));
+ return ret;
+}
+
+aom_noise_status_t aom_noise_model_update(
+ aom_noise_model_t *const noise_model, const uint8_t *const data[3],
+ const uint8_t *const denoised[3], int w, int h, int stride[3],
+ int chroma_sub_log2[2], const uint8_t *const flat_blocks, int block_size) {
+ const int num_blocks_w = (w + block_size - 1) / block_size;
+ const int num_blocks_h = (h + block_size - 1) / block_size;
+ int y_model_different = 0;
+ int num_blocks = 0;
+ int i = 0, channel = 0;
+
+ if (block_size <= 1) {
+ fprintf(stderr, "block_size = %d must be > 1\n", block_size);
+ return AOM_NOISE_STATUS_INVALID_ARGUMENT;
+ }
+
+ if (block_size < noise_model->params.lag * 2 + 1) {
+ fprintf(stderr, "block_size = %d must be >= %d\n", block_size,
+ noise_model->params.lag * 2 + 1);
+ return AOM_NOISE_STATUS_INVALID_ARGUMENT;
+ }
+
+ // Clear the latest equation system
+ for (i = 0; i < 3; ++i) {
+ equation_system_clear(&noise_model->latest_state[i].eqns);
+ noise_model->latest_state[i].num_observations = 0;
+ noise_strength_solver_clear(&noise_model->latest_state[i].strength_solver);
+ }
+
+ // Check that we have enough flat blocks
+ for (i = 0; i < num_blocks_h * num_blocks_w; ++i) {
+ if (flat_blocks[i]) {
+ num_blocks++;
+ }
+ }
+
+ if (num_blocks <= 1) {
+ fprintf(stderr, "Not enough flat blocks to update noise estimate\n");
+ return AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS;
+ }
+
+ for (channel = 0; channel < 3; ++channel) {
+ int no_subsampling[2] = { 0, 0 };
+ const uint8_t *alt_data = channel > 0 ? data[0] : 0;
+ const uint8_t *alt_denoised = channel > 0 ? denoised[0] : 0;
+ int *sub = channel > 0 ? chroma_sub_log2 : no_subsampling;
+ const int is_chroma = channel != 0;
+ if (!data[channel] || !denoised[channel]) break;
+ if (!add_block_observations(noise_model, channel, data[channel],
+ denoised[channel], w, h, stride[channel], sub,
+ alt_data, alt_denoised, stride[0], flat_blocks,
+ block_size, num_blocks_w, num_blocks_h)) {
+ fprintf(stderr, "Adding block observation failed\n");
+ return AOM_NOISE_STATUS_INTERNAL_ERROR;
+ }
+
+ if (!ar_equation_system_solve(&noise_model->latest_state[channel],
+ is_chroma)) {
+ if (is_chroma) {
+ set_chroma_coefficient_fallback_soln(
+ &noise_model->latest_state[channel].eqns);
+ } else {
+ fprintf(stderr, "Solving latest noise equation system failed %d!\n",
+ channel);
+ return AOM_NOISE_STATUS_INTERNAL_ERROR;
+ }
+ }
+
+ add_noise_std_observations(
+ noise_model, channel, noise_model->latest_state[channel].eqns.x,
+ data[channel], denoised[channel], w, h, stride[channel], sub, alt_data,
+ stride[0], flat_blocks, block_size, num_blocks_w, num_blocks_h);
+
+ if (!aom_noise_strength_solver_solve(
+ &noise_model->latest_state[channel].strength_solver)) {
+ fprintf(stderr, "Solving latest noise strength failed!\n");
+ return AOM_NOISE_STATUS_INTERNAL_ERROR;
+ }
+
+    // Check whether this frame's noise characteristics differ from the
+    // combined (multi-frame) model.
+ if (channel == 0 &&
+ noise_model->combined_state[channel].strength_solver.num_equations >
+ 0 &&
+ is_noise_model_different(noise_model)) {
+ y_model_different = 1;
+ }
+
+ // Don't update the combined stats if the y model is different.
+ if (y_model_different) continue;
+
+ noise_model->combined_state[channel].num_observations +=
+ noise_model->latest_state[channel].num_observations;
+ equation_system_add(&noise_model->combined_state[channel].eqns,
+ &noise_model->latest_state[channel].eqns);
+ if (!ar_equation_system_solve(&noise_model->combined_state[channel],
+ is_chroma)) {
+ if (is_chroma) {
+ set_chroma_coefficient_fallback_soln(
+ &noise_model->combined_state[channel].eqns);
+ } else {
+ fprintf(stderr, "Solving combined noise equation system failed %d!\n",
+ channel);
+ return AOM_NOISE_STATUS_INTERNAL_ERROR;
+ }
+ }
+
+ noise_strength_solver_add(
+ &noise_model->combined_state[channel].strength_solver,
+ &noise_model->latest_state[channel].strength_solver);
+
+ if (!aom_noise_strength_solver_solve(
+ &noise_model->combined_state[channel].strength_solver)) {
+ fprintf(stderr, "Solving combined noise strength failed!\n");
+ return AOM_NOISE_STATUS_INTERNAL_ERROR;
+ }
+ }
+
+ return y_model_different ? AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE
+ : AOM_NOISE_STATUS_OK;
+}
+
+void aom_noise_model_save_latest(aom_noise_model_t *noise_model) {
+ for (int c = 0; c < 3; c++) {
+ equation_system_copy(&noise_model->combined_state[c].eqns,
+ &noise_model->latest_state[c].eqns);
+ equation_system_copy(&noise_model->combined_state[c].strength_solver.eqns,
+ &noise_model->latest_state[c].strength_solver.eqns);
+ noise_model->combined_state[c].strength_solver.num_equations =
+ noise_model->latest_state[c].strength_solver.num_equations;
+ noise_model->combined_state[c].num_observations =
+ noise_model->latest_state[c].num_observations;
+ noise_model->combined_state[c].ar_gain =
+ noise_model->latest_state[c].ar_gain;
+ }
+}
+
+int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model,
+ aom_film_grain_t *film_grain) {
+ if (noise_model->params.lag > 3) {
+ fprintf(stderr, "params.lag = %d > 3\n", noise_model->params.lag);
+ return 0;
+ }
+ uint16_t random_seed = film_grain->random_seed;
+ memset(film_grain, 0, sizeof(*film_grain));
+ film_grain->random_seed = random_seed;
+
+ film_grain->apply_grain = 1;
+ film_grain->update_parameters = 1;
+
+ film_grain->ar_coeff_lag = noise_model->params.lag;
+
+ // Convert the scaling functions to 8 bit values
+ aom_noise_strength_lut_t scaling_points[3];
+ if (!aom_noise_strength_solver_fit_piecewise(
+ &noise_model->combined_state[0].strength_solver, 14,
+ scaling_points + 0)) {
+ return 0;
+ }
+ if (!aom_noise_strength_solver_fit_piecewise(
+ &noise_model->combined_state[1].strength_solver, 10,
+ scaling_points + 1)) {
+ aom_noise_strength_lut_free(scaling_points + 0);
+ return 0;
+ }
+ if (!aom_noise_strength_solver_fit_piecewise(
+ &noise_model->combined_state[2].strength_solver, 10,
+ scaling_points + 2)) {
+ aom_noise_strength_lut_free(scaling_points + 0);
+ aom_noise_strength_lut_free(scaling_points + 1);
+ return 0;
+ }
+
+ // Both the domain and the range of the scaling functions in the film_grain
+ // are normalized to 8-bit (e.g., they are implicitly scaled during grain
+ // synthesis).
+ const double strength_divisor = 1 << (noise_model->params.bit_depth - 8);
+ double max_scaling_value = 1e-4;
+ for (int c = 0; c < 3; ++c) {
+ for (int i = 0; i < scaling_points[c].num_points; ++i) {
+ scaling_points[c].points[i][0] =
+ AOMMIN(255, scaling_points[c].points[i][0] / strength_divisor);
+ scaling_points[c].points[i][1] =
+ AOMMIN(255, scaling_points[c].points[i][1] / strength_divisor);
+ max_scaling_value =
+ AOMMAX(scaling_points[c].points[i][1], max_scaling_value);
+ }
+ }
+
+ // Scaling_shift values are in the range [8,11]
+ const int max_scaling_value_log2 =
+ clamp((int)floor(log2(max_scaling_value) + 1), 2, 5);
+ film_grain->scaling_shift = 5 + (8 - max_scaling_value_log2);
+
+ const double scale_factor = 1 << (8 - max_scaling_value_log2);
+ film_grain->num_y_points = scaling_points[0].num_points;
+ film_grain->num_cb_points = scaling_points[1].num_points;
+ film_grain->num_cr_points = scaling_points[2].num_points;
+
+ int(*film_grain_scaling[3])[2] = {
+ film_grain->scaling_points_y,
+ film_grain->scaling_points_cb,
+ film_grain->scaling_points_cr,
+ };
+ for (int c = 0; c < 3; c++) {
+ for (int i = 0; i < scaling_points[c].num_points; ++i) {
+ film_grain_scaling[c][i][0] = (int)(scaling_points[c].points[i][0] + 0.5);
+ film_grain_scaling[c][i][1] = clamp(
+ (int)(scale_factor * scaling_points[c].points[i][1] + 0.5), 0, 255);
+ }
+ }
+ aom_noise_strength_lut_free(scaling_points + 0);
+ aom_noise_strength_lut_free(scaling_points + 1);
+ aom_noise_strength_lut_free(scaling_points + 2);
+
+ // Convert the ar_coeffs into 8-bit values
+ const int n_coeff = noise_model->combined_state[0].eqns.n;
+ double max_coeff = 1e-4, min_coeff = -1e-4;
+ double y_corr[2] = { 0, 0 };
+ double avg_luma_strength = 0;
+ for (int c = 0; c < 3; c++) {
+ aom_equation_system_t *eqns = &noise_model->combined_state[c].eqns;
+ for (int i = 0; i < n_coeff; ++i) {
+ max_coeff = AOMMAX(max_coeff, eqns->x[i]);
+ min_coeff = AOMMIN(min_coeff, eqns->x[i]);
+ }
+ // Since the correlation between luma/chroma was computed in an already
+ // scaled space, we adjust it in the un-scaled space.
+ aom_noise_strength_solver_t *solver =
+ &noise_model->combined_state[c].strength_solver;
+ // Compute a weighted average of the strength for the channel.
+ double average_strength = 0, total_weight = 0;
+ for (int i = 0; i < solver->eqns.n; ++i) {
+ double w = 0;
+ for (int j = 0; j < solver->eqns.n; ++j) {
+ w += solver->eqns.A[i * solver->eqns.n + j];
+ }
+ w = sqrt(w);
+ average_strength += solver->eqns.x[i] * w;
+ total_weight += w;
+ }
+ if (total_weight == 0)
+ average_strength = 1;
+ else
+ average_strength /= total_weight;
+ if (c == 0) {
+ avg_luma_strength = average_strength;
+ } else {
+ y_corr[c - 1] = avg_luma_strength * eqns->x[n_coeff] / average_strength;
+ max_coeff = AOMMAX(max_coeff, y_corr[c - 1]);
+ min_coeff = AOMMIN(min_coeff, y_corr[c - 1]);
+ }
+ }
+ // Shift value: AR coeffs range (values 6-9)
+ // 6: [-2, 2), 7: [-1, 1), 8: [-0.5, 0.5), 9: [-0.25, 0.25)
+ film_grain->ar_coeff_shift =
+ clamp(7 - (int)AOMMAX(1 + floor(log2(max_coeff)), ceil(log2(-min_coeff))),
+ 6, 9);
+ double scale_ar_coeff = 1 << film_grain->ar_coeff_shift;
+ int *ar_coeffs[3] = {
+ film_grain->ar_coeffs_y,
+ film_grain->ar_coeffs_cb,
+ film_grain->ar_coeffs_cr,
+ };
+ for (int c = 0; c < 3; ++c) {
+ aom_equation_system_t *eqns = &noise_model->combined_state[c].eqns;
+ for (int i = 0; i < n_coeff; ++i) {
+ ar_coeffs[c][i] =
+ clamp((int)round(scale_ar_coeff * eqns->x[i]), -128, 127);
+ }
+ if (c > 0) {
+ ar_coeffs[c][n_coeff] =
+ clamp((int)round(scale_ar_coeff * y_corr[c - 1]), -128, 127);
+ }
+ }
+
+ // At the moment, the noise modeling code assumes that the chroma scaling
+ // functions are a function of luma.
+ film_grain->cb_mult = 128; // 8 bits
+ film_grain->cb_luma_mult = 192; // 8 bits
+ film_grain->cb_offset = 256; // 9 bits
+
+ film_grain->cr_mult = 128; // 8 bits
+ film_grain->cr_luma_mult = 192; // 8 bits
+ film_grain->cr_offset = 256; // 9 bits
+
+ film_grain->chroma_scaling_from_luma = 0;
+ film_grain->grain_scale_shift = 0;
+ film_grain->overlap_flag = 1;
+ return 1;
+}
+
+static void pointwise_multiply(const float *a, float *b, int n) {
+ for (int i = 0; i < n; ++i) {
+ b[i] *= a[i];
+ }
+}
+
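+// The half-cosine window below equals sin((i + 0.5) * PI / block_size) along
+// each dimension. With 50% overlap the squared windows of adjacent blocks sum
+// to one (sin^2 + cos^2 == 1), so applying the window once before the
+// transform and once more when compositing the overlapped results keeps the
+// overall gain at unity.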
+static float *get_half_cos_window(int block_size) {
+ float *window_function =
+ (float *)aom_malloc(block_size * block_size * sizeof(*window_function));
+ if (!window_function) return NULL;
+ for (int y = 0; y < block_size; ++y) {
+ const double cos_yd = cos((.5 + y) * PI / block_size - PI / 2);
+ for (int x = 0; x < block_size; ++x) {
+ const double cos_xd = cos((.5 + x) * PI / block_size - PI / 2);
+ window_function[y * block_size + x] = (float)(cos_yd * cos_xd);
+ }
+ }
+ return window_function;
+}
+
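+// Quantizes the floating-point result back to integer pixel values, diffusing
+// the rounding error to not-yet-visited neighbors with the classic
+// Floyd-Steinberg weights (7/16, 3/16, 5/16 and 1/16).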
+#define DITHER_AND_QUANTIZE(INT_TYPE, suffix) \
+ static void dither_and_quantize_##suffix( \
+ float *result, int result_stride, INT_TYPE *denoised, int w, int h, \
+ int stride, int chroma_sub_w, int chroma_sub_h, int block_size, \
+ float block_normalization) { \
+ for (int y = 0; y < (h >> chroma_sub_h); ++y) { \
+ for (int x = 0; x < (w >> chroma_sub_w); ++x) { \
+ const int result_idx = \
+ (y + (block_size >> chroma_sub_h)) * result_stride + x + \
+ (block_size >> chroma_sub_w); \
+ INT_TYPE new_val = (INT_TYPE)AOMMIN( \
+ AOMMAX(result[result_idx] * block_normalization + 0.5f, 0), \
+ block_normalization); \
+ const float err = \
+ -(((float)new_val) / block_normalization - result[result_idx]); \
+ denoised[y * stride + x] = new_val; \
+ if (x + 1 < (w >> chroma_sub_w)) { \
+ result[result_idx + 1] += err * 7.0f / 16.0f; \
+ } \
+ if (y + 1 < (h >> chroma_sub_h)) { \
+ if (x > 0) { \
+ result[result_idx + result_stride - 1] += err * 3.0f / 16.0f; \
+ } \
+ result[result_idx + result_stride] += err * 5.0f / 16.0f; \
+ if (x + 1 < (w >> chroma_sub_w)) { \
+ result[result_idx + result_stride + 1] += err * 1.0f / 16.0f; \
+ } \
+ } \
+ } \
+ } \
+ }
+
+DITHER_AND_QUANTIZE(uint8_t, lowbd)
+DITHER_AND_QUANTIZE(uint16_t, highbd)
+
+int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3],
+ int w, int h, int stride[3], int chroma_sub[2],
+ float *noise_psd[3], int block_size, int bit_depth,
+ int use_highbd) {
+ float *plane = NULL, *block = NULL, *window_full = NULL,
+ *window_chroma = NULL;
+ double *block_d = NULL, *plane_d = NULL;
+ struct aom_noise_tx_t *tx_full = NULL;
+ struct aom_noise_tx_t *tx_chroma = NULL;
+ const int num_blocks_w = (w + block_size - 1) / block_size;
+ const int num_blocks_h = (h + block_size - 1) / block_size;
+ const int result_stride = (num_blocks_w + 2) * block_size;
+ const int result_height = (num_blocks_h + 2) * block_size;
+ float *result = NULL;
+ int init_success = 1;
+ aom_flat_block_finder_t block_finder_full;
+ aom_flat_block_finder_t block_finder_chroma;
+ const float kBlockNormalization = (float)((1 << bit_depth) - 1);
+ if (chroma_sub[0] != chroma_sub[1]) {
+ fprintf(stderr,
+ "aom_wiener_denoise_2d doesn't handle different chroma "
+ "subsampling\n");
+ return 0;
+ }
+ init_success &= aom_flat_block_finder_init(&block_finder_full, block_size,
+ bit_depth, use_highbd);
+ result = (float *)aom_malloc((num_blocks_h + 2) * block_size * result_stride *
+ sizeof(*result));
+ plane = (float *)aom_malloc(block_size * block_size * sizeof(*plane));
+ block =
+ (float *)aom_memalign(32, 2 * block_size * block_size * sizeof(*block));
+ block_d = (double *)aom_malloc(block_size * block_size * sizeof(*block_d));
+ plane_d = (double *)aom_malloc(block_size * block_size * sizeof(*plane_d));
+ window_full = get_half_cos_window(block_size);
+ tx_full = aom_noise_tx_malloc(block_size);
+
+ if (chroma_sub[0] != 0) {
+ init_success &= aom_flat_block_finder_init(&block_finder_chroma,
+ block_size >> chroma_sub[0],
+ bit_depth, use_highbd);
+ window_chroma = get_half_cos_window(block_size >> chroma_sub[0]);
+ tx_chroma = aom_noise_tx_malloc(block_size >> chroma_sub[0]);
+ } else {
+ window_chroma = window_full;
+ tx_chroma = tx_full;
+ }
+
+ init_success &= (tx_full != NULL) && (tx_chroma != NULL) && (plane != NULL) &&
+ (plane_d != NULL) && (block != NULL) && (block_d != NULL) &&
+ (window_full != NULL) && (window_chroma != NULL) &&
+ (result != NULL);
+ for (int c = init_success ? 0 : 3; c < 3; ++c) {
+ float *window_function = c == 0 ? window_full : window_chroma;
+ aom_flat_block_finder_t *block_finder = &block_finder_full;
+ const int chroma_sub_h = c > 0 ? chroma_sub[1] : 0;
+ const int chroma_sub_w = c > 0 ? chroma_sub[0] : 0;
+ struct aom_noise_tx_t *tx =
+ (c > 0 && chroma_sub[0] > 0) ? tx_chroma : tx_full;
+ if (!data[c] || !denoised[c]) continue;
+ if (c > 0 && chroma_sub[0] != 0) {
+ block_finder = &block_finder_chroma;
+ }
+ memset(result, 0, sizeof(*result) * result_stride * result_height);
+ // Do overlapped block processing (half overlapped). The block rows can
+ // easily be done in parallel
+ for (int offsy = 0; offsy < (block_size >> chroma_sub_h);
+ offsy += (block_size >> chroma_sub_h) / 2) {
+ for (int offsx = 0; offsx < (block_size >> chroma_sub_w);
+ offsx += (block_size >> chroma_sub_w) / 2) {
+ // Pad the boundary when processing each block-set.
+ for (int by = -1; by < num_blocks_h; ++by) {
+ for (int bx = -1; bx < num_blocks_w; ++bx) {
+ const int pixels_per_block =
+ (block_size >> chroma_sub_w) * (block_size >> chroma_sub_h);
+ aom_flat_block_finder_extract_block(
+ block_finder, data[c], w >> chroma_sub_w, h >> chroma_sub_h,
+ stride[c], bx * (block_size >> chroma_sub_w) + offsx,
+ by * (block_size >> chroma_sub_h) + offsy, plane_d, block_d);
+ for (int j = 0; j < pixels_per_block; ++j) {
+ block[j] = (float)block_d[j];
+ plane[j] = (float)plane_d[j];
+ }
+ pointwise_multiply(window_function, block, pixels_per_block);
+ aom_noise_tx_forward(tx, block);
+ aom_noise_tx_filter(tx, noise_psd[c]);
+ aom_noise_tx_inverse(tx, block);
+
+ // Apply window function to the plane approximation (we will apply
+ // it to the sum of plane + block when composing the results).
+ pointwise_multiply(window_function, plane, pixels_per_block);
+
+ for (int y = 0; y < (block_size >> chroma_sub_h); ++y) {
+ const int y_result =
+ y + (by + 1) * (block_size >> chroma_sub_h) + offsy;
+ for (int x = 0; x < (block_size >> chroma_sub_w); ++x) {
+ const int x_result =
+ x + (bx + 1) * (block_size >> chroma_sub_w) + offsx;
+ result[y_result * result_stride + x_result] +=
+ (block[y * (block_size >> chroma_sub_w) + x] +
+ plane[y * (block_size >> chroma_sub_w) + x]) *
+ window_function[y * (block_size >> chroma_sub_w) + x];
+ }
+ }
+ }
+ }
+ }
+ }
+ if (use_highbd) {
+ dither_and_quantize_highbd(result, result_stride, (uint16_t *)denoised[c],
+ w, h, stride[c], chroma_sub_w, chroma_sub_h,
+ block_size, kBlockNormalization);
+ } else {
+ dither_and_quantize_lowbd(result, result_stride, denoised[c], w, h,
+ stride[c], chroma_sub_w, chroma_sub_h,
+ block_size, kBlockNormalization);
+ }
+ }
+ aom_free(result);
+ aom_free(plane);
+ aom_free(block);
+ aom_free(plane_d);
+ aom_free(block_d);
+ aom_free(window_full);
+
+ aom_noise_tx_free(tx_full);
+
+ aom_flat_block_finder_free(&block_finder_full);
+ if (chroma_sub[0] != 0) {
+ aom_flat_block_finder_free(&block_finder_chroma);
+ aom_free(window_chroma);
+ aom_noise_tx_free(tx_chroma);
+ }
+ return init_success;
+}
+
+struct aom_denoise_and_model_t {
+ int block_size;
+ int bit_depth;
+ float noise_level;
+
+ // Size of current denoised buffer and flat_block buffer
+ int width;
+ int height;
+ int y_stride;
+ int uv_stride;
+ int num_blocks_w;
+ int num_blocks_h;
+
+ // Buffers for image and noise_psd allocated on the fly
+ float *noise_psd[3];
+ uint8_t *denoised[3];
+ uint8_t *flat_blocks;
+
+ aom_flat_block_finder_t flat_block_finder;
+ aom_noise_model_t noise_model;
+};
+
+struct aom_denoise_and_model_t *aom_denoise_and_model_alloc(int bit_depth,
+ int block_size,
+ float noise_level) {
+ struct aom_denoise_and_model_t *ctx =
+ (struct aom_denoise_and_model_t *)aom_malloc(
+ sizeof(struct aom_denoise_and_model_t));
+ if (!ctx) {
+ fprintf(stderr, "Unable to allocate denoise_and_model struct\n");
+ return NULL;
+ }
+ memset(ctx, 0, sizeof(*ctx));
+
+ ctx->block_size = block_size;
+ ctx->noise_level = noise_level;
+ ctx->bit_depth = bit_depth;
+
+ ctx->noise_psd[0] =
+ (float *)aom_malloc(sizeof(*ctx->noise_psd[0]) * block_size * block_size);
+ ctx->noise_psd[1] =
+ (float *)aom_malloc(sizeof(*ctx->noise_psd[1]) * block_size * block_size);
+ ctx->noise_psd[2] =
+ (float *)aom_malloc(sizeof(*ctx->noise_psd[2]) * block_size * block_size);
+ if (!ctx->noise_psd[0] || !ctx->noise_psd[1] || !ctx->noise_psd[2]) {
+ fprintf(stderr, "Unable to allocate noise PSD buffers\n");
+ aom_denoise_and_model_free(ctx);
+ return NULL;
+ }
+ return ctx;
+}
+
+void aom_denoise_and_model_free(struct aom_denoise_and_model_t *ctx) {
+ aom_free(ctx->flat_blocks);
+ for (int i = 0; i < 3; ++i) {
+ aom_free(ctx->denoised[i]);
+ aom_free(ctx->noise_psd[i]);
+ }
+ aom_noise_model_free(&ctx->noise_model);
+ aom_flat_block_finder_free(&ctx->flat_block_finder);
+ aom_free(ctx);
+}
+
+static int denoise_and_model_realloc_if_necessary(
+ struct aom_denoise_and_model_t *ctx, YV12_BUFFER_CONFIG *sd) {
+ if (ctx->width == sd->y_width && ctx->height == sd->y_height &&
+ ctx->y_stride == sd->y_stride && ctx->uv_stride == sd->uv_stride)
+ return 1;
+ const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+ const int block_size = ctx->block_size;
+
+ ctx->width = sd->y_width;
+ ctx->height = sd->y_height;
+ ctx->y_stride = sd->y_stride;
+ ctx->uv_stride = sd->uv_stride;
+
+ for (int i = 0; i < 3; ++i) {
+ aom_free(ctx->denoised[i]);
+ ctx->denoised[i] = NULL;
+ }
+ aom_free(ctx->flat_blocks);
+ ctx->flat_blocks = NULL;
+
+ ctx->denoised[0] =
+ (uint8_t *)aom_malloc((sd->y_stride * sd->y_height) << use_highbd);
+ ctx->denoised[1] =
+ (uint8_t *)aom_malloc((sd->uv_stride * sd->uv_height) << use_highbd);
+ ctx->denoised[2] =
+ (uint8_t *)aom_malloc((sd->uv_stride * sd->uv_height) << use_highbd);
+ if (!ctx->denoised[0] || !ctx->denoised[1] || !ctx->denoised[2]) {
+ fprintf(stderr, "Unable to allocate denoise buffers\n");
+ return 0;
+ }
+ ctx->num_blocks_w = (sd->y_width + ctx->block_size - 1) / ctx->block_size;
+ ctx->num_blocks_h = (sd->y_height + ctx->block_size - 1) / ctx->block_size;
+ ctx->flat_blocks =
+ (uint8_t *)aom_malloc(ctx->num_blocks_w * ctx->num_blocks_h);
+ if (!ctx->flat_blocks) {
+ fprintf(stderr, "Unable to allocate flat_blocks buffer\n");
+ return 0;
+ }
+
+ aom_flat_block_finder_free(&ctx->flat_block_finder);
+ if (!aom_flat_block_finder_init(&ctx->flat_block_finder, ctx->block_size,
+ ctx->bit_depth, use_highbd)) {
+ fprintf(stderr, "Unable to init flat block finder\n");
+ return 0;
+ }
+
+ const aom_noise_model_params_t params = { AOM_NOISE_SHAPE_SQUARE, 3,
+ ctx->bit_depth, use_highbd };
+ aom_noise_model_free(&ctx->noise_model);
+ if (!aom_noise_model_init(&ctx->noise_model, params)) {
+ fprintf(stderr, "Unable to init noise model\n");
+ return 0;
+ }
+
+  // Simply use a flat PSD (although we could use the flat blocks to estimate
+  // an actual noise PSD).
+ const float y_noise_level =
+ aom_noise_psd_get_default_value(ctx->block_size, ctx->noise_level);
+ const float uv_noise_level = aom_noise_psd_get_default_value(
+ ctx->block_size >> sd->subsampling_x, ctx->noise_level);
+ for (int i = 0; i < block_size * block_size; ++i) {
+ ctx->noise_psd[0][i] = y_noise_level;
+ ctx->noise_psd[1][i] = ctx->noise_psd[2][i] = uv_noise_level;
+ }
+ return 1;
+}
+
+// TODO(aomedia:3151): Handle a monochrome image (sd->u_buffer and sd->v_buffer
+// are null pointers) correctly.
+int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx,
+ YV12_BUFFER_CONFIG *sd,
+ aom_film_grain_t *film_grain, int apply_denoise) {
+ const int block_size = ctx->block_size;
+ const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0;
+ uint8_t *raw_data[3] = {
+ use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->y_buffer) : sd->y_buffer,
+ use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->u_buffer) : sd->u_buffer,
+ use_highbd ? (uint8_t *)CONVERT_TO_SHORTPTR(sd->v_buffer) : sd->v_buffer,
+ };
+ const uint8_t *const data[3] = { raw_data[0], raw_data[1], raw_data[2] };
+ int strides[3] = { sd->y_stride, sd->uv_stride, sd->uv_stride };
+ int chroma_sub_log2[2] = { sd->subsampling_x, sd->subsampling_y };
+
+ if (!denoise_and_model_realloc_if_necessary(ctx, sd)) {
+ fprintf(stderr, "Unable to realloc buffers\n");
+ return 0;
+ }
+
+ aom_flat_block_finder_run(&ctx->flat_block_finder, data[0], sd->y_width,
+ sd->y_height, strides[0], ctx->flat_blocks);
+
+ if (!aom_wiener_denoise_2d(data, ctx->denoised, sd->y_width, sd->y_height,
+ strides, chroma_sub_log2, ctx->noise_psd,
+ block_size, ctx->bit_depth, use_highbd)) {
+ fprintf(stderr, "Unable to denoise image\n");
+ return 0;
+ }
+
+ const aom_noise_status_t status = aom_noise_model_update(
+ &ctx->noise_model, data, (const uint8_t *const *)ctx->denoised,
+ sd->y_width, sd->y_height, strides, chroma_sub_log2, ctx->flat_blocks,
+ block_size);
+ int have_noise_estimate = 0;
+ if (status == AOM_NOISE_STATUS_OK) {
+ have_noise_estimate = 1;
+ } else if (status == AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE) {
+ aom_noise_model_save_latest(&ctx->noise_model);
+ have_noise_estimate = 1;
+ } else {
+ // Unable to update noise model; proceed if we have a previous estimate.
+ have_noise_estimate =
+ (ctx->noise_model.combined_state[0].strength_solver.num_equations > 0);
+ }
+
+ film_grain->apply_grain = 0;
+ if (have_noise_estimate) {
+ if (!aom_noise_model_get_grain_parameters(&ctx->noise_model, film_grain)) {
+ fprintf(stderr, "Unable to get grain parameters.\n");
+ return 0;
+ }
+ if (!film_grain->random_seed) {
+ film_grain->random_seed = 7391;
+ }
+ if (apply_denoise) {
+ memcpy(raw_data[0], ctx->denoised[0],
+ (strides[0] * sd->y_height) << use_highbd);
+ if (!sd->monochrome) {
+ memcpy(raw_data[1], ctx->denoised[1],
+ (strides[1] * sd->uv_height) << use_highbd);
+ memcpy(raw_data[2], ctx->denoised[2],
+ (strides[2] * sd->uv_height) << use_highbd);
+ }
+ }
+ }
+ return 1;
+}
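
A minimal usage sketch of the API implemented above (illustrative only, not
part of the patch): it assumes the caller already owns a populated 8-bit
YV12_BUFFER_CONFIG and simply wires together the alloc/run/free calls declared
in noise_model.h.

    #include "aom_dsp/noise_model.h"

    // Estimate film grain for one frame and optionally denoise it in place.
    static int estimate_grain(YV12_BUFFER_CONFIG *frame,
                              aom_film_grain_t *grain) {
      // 32x32 modeling blocks, noise_level 2.5 (moderate noise per the docs).
      struct aom_denoise_and_model_t *ctx =
          aom_denoise_and_model_alloc(/*bit_depth=*/8, /*block_size=*/32,
                                      /*noise_level=*/2.5f);
      if (!ctx) return 0;
      const int ok =
          aom_denoise_and_model_run(ctx, frame, grain, /*apply_denoise=*/1);
      aom_denoise_and_model_free(ctx);
      return ok;
    }
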
diff --git a/third_party/aom/aom_dsp/noise_model.h b/third_party/aom/aom_dsp/noise_model.h
new file mode 100644
index 0000000000..8228aeacfc
--- /dev/null
+++ b/third_party/aom/aom_dsp/noise_model.h
@@ -0,0 +1,328 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_NOISE_MODEL_H_
+#define AOM_AOM_DSP_NOISE_MODEL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+#include <stdint.h>
+#include "aom_dsp/grain_params.h"
+#include "aom_ports/mem.h"
+#include "aom_scale/yv12config.h"
+
+/*!\brief Wrapper of data required to represent linear system of eqns and soln.
+ */
+typedef struct {
+ double *A;
+ double *b;
+ double *x;
+ int n;
+} aom_equation_system_t;
+
+/*!\brief Representation of a piecewise linear curve
+ *
+ * Holds num_points (x, y) pairs that define the curve.
+ */
+typedef struct {
+ double (*points)[2];
+ int num_points;
+} aom_noise_strength_lut_t;
+
+/*!\brief Init the noise strength lut with the given number of points*/
+int aom_noise_strength_lut_init(aom_noise_strength_lut_t *lut, int num_points);
+
+/*!\brief Frees the noise strength lut. */
+void aom_noise_strength_lut_free(aom_noise_strength_lut_t *lut);
+
+/*!\brief Evaluate the lut at the point x.
+ *
+ * \param[in] lut The lut data.
+ * \param[in] x The coordinate to evaluate the lut.
+ */
+double aom_noise_strength_lut_eval(const aom_noise_strength_lut_t *lut,
+ double x);
+
+/*!\brief Helper struct to model noise strength as a function of intensity.
+ *
+ * Internally, this structure holds a representation of a linear system
+ * of equations that models noise strength (standard deviation) as a
+ * function of intensity. The mapping is initially stored using a
+ * piecewise representation with evenly spaced bins that cover the entire
+ * domain from [min_intensity, max_intensity]. Each observation (x,y) gives a
+ * constraint of the form:
+ * y_{i} (1 - a) + y_{i+1} a = y
+ * where y_{i} is the value of bin i, x_{i} <= x <= x_{i+1}, and
+ * a = (x - x_{i}) / (x_{i+1} - x_{i}). The equation system holds the
+ * corresponding normal equations.
+ *
+ * As there may be missing data, the solution is regularized to get a
+ * complete set of values for the bins. A reduced representation after
+ * solving can be obtained by getting the corresponding noise_strength_lut_t.
+ */
+typedef struct {
+ aom_equation_system_t eqns;
+ double min_intensity;
+ double max_intensity;
+ int num_bins;
+ int num_equations;
+ double total;
+} aom_noise_strength_solver_t;
+
+/*!\brief Initializes the noise solver with the given number of bins.
+ *
+ * Returns 0 if initialization fails.
+ *
+ * \param[in] solver The noise solver to be initialized.
+ * \param[in] num_bins Number of bins to use in the internal representation.
+ * \param[in] bit_depth The bit depth used to derive {min,max}_intensity.
+ */
+int aom_noise_strength_solver_init(aom_noise_strength_solver_t *solver,
+ int num_bins, int bit_depth);
+void aom_noise_strength_solver_free(aom_noise_strength_solver_t *solver);
+
+/*!\brief Gets the x coordinate of bin i.
+ *
+ * \param[in] i The bin whose coordinate to query.
+ */
+double aom_noise_strength_solver_get_center(
+ const aom_noise_strength_solver_t *solver, int i);
+
+/*!\brief Add an observation of the block mean intensity to its noise strength.
+ *
+ * \param[in]  block_mean The average block intensity.
+ * \param[in] noise_std The observed noise strength.
+ */
+void aom_noise_strength_solver_add_measurement(
+ aom_noise_strength_solver_t *solver, double block_mean, double noise_std);
+
+/*!\brief Solves the current set of equations for the noise strength. */
+int aom_noise_strength_solver_solve(aom_noise_strength_solver_t *solver);
+
+/*!\brief Fits a reduced piecewise linear lut to the internal solution
+ *
+ * \param[in] max_num_points The maximum number of output points
+ * \param[out] lut The output piecewise linear lut.
+ */
+int aom_noise_strength_solver_fit_piecewise(
+ const aom_noise_strength_solver_t *solver, int max_num_points,
+ aom_noise_strength_lut_t *lut);
+
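A short sketch tying the solver calls above together (illustrative only; the
bin count and the measurement values are arbitrary, not library defaults):

    // Feed (block mean, noise std) observations, solve, and reduce to a LUT.
    static void strength_solver_example(void) {
      aom_noise_strength_solver_t solver;
      if (!aom_noise_strength_solver_init(&solver, /*num_bins=*/20,
                                          /*bit_depth=*/8))
        return;
      aom_noise_strength_solver_add_measurement(&solver, 64.0, 2.0);
      aom_noise_strength_solver_add_measurement(&solver, 192.0, 3.5);
      if (aom_noise_strength_solver_solve(&solver)) {
        aom_noise_strength_lut_t lut;
        if (aom_noise_strength_solver_fit_piecewise(&solver,
                                                    /*max_num_points=*/8,
                                                    &lut)) {
          // Query the fitted noise strength at a mid-gray intensity.
          const double s = aom_noise_strength_lut_eval(&lut, 128.0);
          (void)s;
          aom_noise_strength_lut_free(&lut);
        }
      }
      aom_noise_strength_solver_free(&solver);
    }
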
+/*!\brief Helper for holding precomputed data for finding flat blocks.
+ *
+ * Internally a block is modeled with a low-order polynomial model. A
+ * planar model would be a bunch of equations like:
+ * <[y_i x_i 1], [a_1, a_2, a_3]> = b_i
+ * for each point in the block. The system matrix A with row i as [y_i x_i 1]
+ * is maintained as is the inverse, inv(A'*A), so that the plane parameters
+ * can be fit for each block.
+ */
+typedef struct {
+ double *AtA_inv;
+ double *A;
+ int num_params; // The number of parameters used for internal low-order model
+ int block_size; // The block size the finder was initialized with
+ double normalization; // Normalization factor (1 / (2^(bit_depth) - 1))
+ int use_highbd; // Whether input data should be interpreted as uint16
+} aom_flat_block_finder_t;
+
+/*!\brief Init the block_finder with the given block size, bit_depth */
+int aom_flat_block_finder_init(aom_flat_block_finder_t *block_finder,
+ int block_size, int bit_depth, int use_highbd);
+void aom_flat_block_finder_free(aom_flat_block_finder_t *block_finder);
+
+/*!\brief Helper to extract a block and low order "planar" model. */
+void aom_flat_block_finder_extract_block(
+ const aom_flat_block_finder_t *block_finder, const uint8_t *const data,
+ int w, int h, int stride, int offsx, int offsy, double *plane,
+ double *block);
+
+/*!\brief Runs the flat block finder on the input data.
+ *
+ * Find flat blocks in the input image data. Returns a map of
+ * flat_blocks, where the value of the flat_blocks map is non-zero
+ * when a block is determined to be flat. A higher value indicates greater
+ * confidence in the decision.
+ */
+int aom_flat_block_finder_run(const aom_flat_block_finder_t *block_finder,
+ const uint8_t *const data, int w, int h,
+ int stride, uint8_t *flat_blocks);
+
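The flat block finder is typically used as below (illustrative sketch; luma,
width, height and stride describe a caller-owned 8-bit plane, and
aom_malloc/aom_free come from aom_mem/aom_mem.h):

    static void flat_block_example(const uint8_t *luma, int width, int height,
                                   int stride) {
      const int block_size = 32;
      aom_flat_block_finder_t finder;
      if (!aom_flat_block_finder_init(&finder, block_size, /*bit_depth=*/8,
                                      /*use_highbd=*/0))
        return;
      const int blocks_w = (width + block_size - 1) / block_size;
      const int blocks_h = (height + block_size - 1) / block_size;
      uint8_t *flat_blocks = (uint8_t *)aom_malloc(blocks_w * blocks_h);
      if (flat_blocks) {
        // flat_blocks[by * blocks_w + bx] is non-zero for blocks judged flat.
        aom_flat_block_finder_run(&finder, luma, width, height, stride,
                                  flat_blocks);
        aom_free(flat_blocks);
      }
      aom_flat_block_finder_free(&finder);
    }
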
+// The noise shape indicates the allowed coefficients in the AR model.
+enum {
+ AOM_NOISE_SHAPE_DIAMOND = 0,
+ AOM_NOISE_SHAPE_SQUARE = 1
+} UENUM1BYTE(aom_noise_shape);
+
+// The parameters of the noise model include the shape type, lag, the
+// bit depth of the input images provided, and whether the input images
+// will be using uint16 (or uint8) representation.
+typedef struct {
+ aom_noise_shape shape;
+ int lag;
+ int bit_depth;
+ int use_highbd;
+} aom_noise_model_params_t;
+
+/*!\brief State of a noise model estimate for a single channel.
+ *
+ * This contains a system of equations that can be used to solve
+ * for the auto-regressive coefficients as well as a noise strength
+ * solver that can be used to model noise strength as a function of
+ * intensity.
+ */
+typedef struct {
+ aom_equation_system_t eqns;
+ aom_noise_strength_solver_t strength_solver;
+ int num_observations; // The number of observations in the eqn system
+ double ar_gain; // The gain of the current AR filter
+} aom_noise_state_t;
+
+/*!\brief Complete model of noise for a planar video
+ *
+ * This includes a noise model for the latest frame and an aggregated
+ * estimate over all previous frames that had similar parameters.
+ */
+typedef struct {
+ aom_noise_model_params_t params;
+ aom_noise_state_t combined_state[3]; // Combined state per channel
+ aom_noise_state_t latest_state[3]; // Latest state per channel
+ int (*coords)[2]; // Offsets (x,y) of the coefficient samples
+ int n; // Number of parameters (size of coords)
+ int bit_depth;
+} aom_noise_model_t;
+
+/*!\brief Result of a noise model update. */
+enum {
+ AOM_NOISE_STATUS_OK = 0,
+ AOM_NOISE_STATUS_INVALID_ARGUMENT,
+ AOM_NOISE_STATUS_INSUFFICIENT_FLAT_BLOCKS,
+ AOM_NOISE_STATUS_DIFFERENT_NOISE_TYPE,
+ AOM_NOISE_STATUS_INTERNAL_ERROR,
+} UENUM1BYTE(aom_noise_status_t);
+
+/*!\brief Initializes a noise model with the given parameters.
+ *
+ * Returns 0 on failure.
+ */
+int aom_noise_model_init(aom_noise_model_t *model,
+ const aom_noise_model_params_t params);
+void aom_noise_model_free(aom_noise_model_t *model);
+
+/*!\brief Updates the noise model with a new frame observation.
+ *
+ * Updates the noise model with measurements from the given input frame and a
+ * denoised variant of it. Noise is sampled from flat blocks using the flat
+ * block map.
+ *
+ * Returns a noise_status indicating if the update was successful. If the
+ * update was successful, the combined_state is updated with measurements from
+ * the provided frame. If status is OK or DIFFERENT_NOISE_TYPE, the latest noise
+ * state will be updated with measurements from the provided frame.
+ *
+ * \param[in,out] noise_model The noise model to be updated
+ * \param[in] data Raw frame data
+ * \param[in] denoised Denoised frame data.
+ * \param[in] w Frame width
+ * \param[in] h Frame height
+ * \param[in] strides Stride of the planes
+ * \param[in] chroma_sub_log2 Chroma subsampling for planes != 0.
+ * \param[in] flat_blocks A map to blocks that have been determined flat
+ * \param[in] block_size The size of blocks.
+ */
+aom_noise_status_t aom_noise_model_update(
+ aom_noise_model_t *const noise_model, const uint8_t *const data[3],
+ const uint8_t *const denoised[3], int w, int h, int strides[3],
+ int chroma_sub_log2[2], const uint8_t *const flat_blocks, int block_size);
+
+/*!\brief Save the "latest" estimate into the "combined" estimate.
+ *
+ * This is meant to be called when the noise modeling detected a change
+ * in parameters (or for example, if a user wanted to reset estimation at
+ * a shot boundary).
+ */
+void aom_noise_model_save_latest(aom_noise_model_t *noise_model);
+
+/*!\brief Converts the noise_model parameters to the corresponding
+ * grain_parameters.
+ *
+ * The noise structs in this file are suitable for estimation (e.g., using
+ * floats), but the grain parameters in the bitstream are quantized. This
+ * function does the conversion by selecting the correct quantization levels.
+ */
+int aom_noise_model_get_grain_parameters(aom_noise_model_t *const noise_model,
+ aom_film_grain_t *film_grain);
+
+/*!\brief Perform a Wiener filter denoising in 2D using the provided noise psd.
+ *
+ * \param[in] data Raw frame data
+ * \param[out] denoised Denoised frame data
+ * \param[in] w Frame width
+ * \param[in] h Frame height
+ * \param[in] stride Stride of the planes
+ * \param[in] chroma_sub_log2 Chroma subsampling for planes != 0.
+ * \param[in] noise_psd The power spectral density of the noise
+ * \param[in] block_size The size of blocks
+ * \param[in] bit_depth Bit depth of the image
+ * \param[in] use_highbd If true, uint8 pointers are interpreted as
+ * uint16 and stride is measured in uint16.
+ * This must be true when bit_depth >= 10.
+ */
+int aom_wiener_denoise_2d(const uint8_t *const data[3], uint8_t *denoised[3],
+ int w, int h, int stride[3], int chroma_sub_log2[2],
+ float *noise_psd[3], int block_size, int bit_depth,
+ int use_highbd);
+
+struct aom_denoise_and_model_t;
+
+/*!\brief Denoise the buffer and model the residual noise.
+ *
+ * This is meant to be called sequentially on input frames. The input buffer
+ * is denoised and the residual noise is modelled. The current noise estimate
+ * is populated in film_grain. Returns true on success. The grain.apply_grain
+ * parameter will be true when the input buffer was successfully denoised and
+ * grain was modelled. Returns false on error.
+ *
+ * \param[in] ctx Struct allocated with
+ * aom_denoise_and_model_alloc that holds some
+ * buffers for denoising and the current noise
+ * estimate.
+ * \param[in,out] buf The raw input buffer to be denoised.
+ * \param[out] grain Output film grain parameters
+ * \param[in] apply_denoise Whether or not to apply the denoising to the
+ * frame that will be encoded
+ */
+int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx,
+ YV12_BUFFER_CONFIG *buf, aom_film_grain_t *grain,
+ int apply_denoise);
+
+/*!\brief Allocates a context that can be used for denoising and noise modeling.
+ *
+ * \param[in] bit_depth Bit depth of buffers this will be run on.
+ * \param[in] block_size Block size for noise modeling and flat block
+ * estimation
+ * \param[in] noise_level The noise_level (2.5 for moderate noise, and 5 for
+ * higher levels of noise)
+ */
+struct aom_denoise_and_model_t *aom_denoise_and_model_alloc(int bit_depth,
+ int block_size,
+ float noise_level);
+
+/*!\brief Frees the denoise context allocated with aom_denoise_and_model_alloc
+ */
+void aom_denoise_and_model_free(struct aom_denoise_and_model_t *denoise_model);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+#endif // AOM_AOM_DSP_NOISE_MODEL_H_
diff --git a/third_party/aom/aom_dsp/noise_util.c b/third_party/aom/aom_dsp/noise_util.c
new file mode 100644
index 0000000000..3ded8cb099
--- /dev/null
+++ b/third_party/aom/aom_dsp/noise_util.c
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <math.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/noise_util.h"
+#include "aom_dsp/fft_common.h"
+#include "aom_mem/aom_mem.h"
+#include "config/aom_dsp_rtcd.h"
+
+float aom_noise_psd_get_default_value(int block_size, float factor) {
+ return (factor * factor / 10000) * block_size * block_size / 8;
+}
+
+// Internal representation of noise transform. It keeps track of the
+// transformed data and a temporary working buffer to use during the
+// transform.
+struct aom_noise_tx_t {
+ float *tx_block;
+ float *temp;
+ int block_size;
+ void (*fft)(const float *, float *, float *);
+ void (*ifft)(const float *, float *, float *);
+};
+
+struct aom_noise_tx_t *aom_noise_tx_malloc(int block_size) {
+ struct aom_noise_tx_t *noise_tx =
+ (struct aom_noise_tx_t *)aom_malloc(sizeof(struct aom_noise_tx_t));
+ if (!noise_tx) return NULL;
+ memset(noise_tx, 0, sizeof(*noise_tx));
+ switch (block_size) {
+ case 2:
+ noise_tx->fft = aom_fft2x2_float;
+ noise_tx->ifft = aom_ifft2x2_float;
+ break;
+ case 4:
+ noise_tx->fft = aom_fft4x4_float;
+ noise_tx->ifft = aom_ifft4x4_float;
+ break;
+ case 8:
+ noise_tx->fft = aom_fft8x8_float;
+ noise_tx->ifft = aom_ifft8x8_float;
+ break;
+ case 16:
+ noise_tx->fft = aom_fft16x16_float;
+ noise_tx->ifft = aom_ifft16x16_float;
+ break;
+ case 32:
+ noise_tx->fft = aom_fft32x32_float;
+ noise_tx->ifft = aom_ifft32x32_float;
+ break;
+ default:
+ aom_free(noise_tx);
+ fprintf(stderr, "Unsupported block size %d\n", block_size);
+ return NULL;
+ }
+ noise_tx->block_size = block_size;
+ noise_tx->tx_block = (float *)aom_memalign(
+ 32, 2 * sizeof(*noise_tx->tx_block) * block_size * block_size);
+ noise_tx->temp = (float *)aom_memalign(
+ 32, 2 * sizeof(*noise_tx->temp) * block_size * block_size);
+ if (!noise_tx->tx_block || !noise_tx->temp) {
+ aom_noise_tx_free(noise_tx);
+ return NULL;
+ }
+ // Clear the buffers up front. Some outputs of the forward transform are
+ // real only (the imaginary component will never be touched)
+ memset(noise_tx->tx_block, 0,
+ 2 * sizeof(*noise_tx->tx_block) * block_size * block_size);
+ memset(noise_tx->temp, 0,
+ 2 * sizeof(*noise_tx->temp) * block_size * block_size);
+ return noise_tx;
+}
+
+void aom_noise_tx_forward(struct aom_noise_tx_t *noise_tx, const float *data) {
+ noise_tx->fft(data, noise_tx->temp, noise_tx->tx_block);
+}
+
+void aom_noise_tx_filter(struct aom_noise_tx_t *noise_tx, const float *psd) {
+ const int block_size = noise_tx->block_size;
+ const float kBeta = 1.1f;
+ const float kEps = 1e-6f;
+ for (int y = 0; y < block_size; ++y) {
+ for (int x = 0; x < block_size; ++x) {
+ int i = y * block_size + x;
+ float *c = noise_tx->tx_block + 2 * i;
+ const float c0 = AOMMAX((float)fabs(c[0]), 1e-8f);
+ const float c1 = AOMMAX((float)fabs(c[1]), 1e-8f);
+ const float p = c0 * c0 + c1 * c1;
+ if (p > kBeta * psd[i] && p > 1e-6) {
+ noise_tx->tx_block[2 * i + 0] *= (p - psd[i]) / AOMMAX(p, kEps);
+ noise_tx->tx_block[2 * i + 1] *= (p - psd[i]) / AOMMAX(p, kEps);
+ } else {
+ noise_tx->tx_block[2 * i + 0] *= (kBeta - 1.0f) / kBeta;
+ noise_tx->tx_block[2 * i + 1] *= (kBeta - 1.0f) / kBeta;
+ }
+ }
+ }
+}
+
+void aom_noise_tx_inverse(struct aom_noise_tx_t *noise_tx, float *data) {
+ const int n = noise_tx->block_size * noise_tx->block_size;
+ noise_tx->ifft(noise_tx->tx_block, noise_tx->temp, data);
+ for (int i = 0; i < n; ++i) {
+ data[i] /= n;
+ }
+}
+
+void aom_noise_tx_add_energy(const struct aom_noise_tx_t *noise_tx,
+ float *psd) {
+ const int block_size = noise_tx->block_size;
+ for (int yb = 0; yb < block_size; ++yb) {
+ for (int xb = 0; xb <= block_size / 2; ++xb) {
+ float *c = noise_tx->tx_block + 2 * (yb * block_size + xb);
+ psd[yb * block_size + xb] += c[0] * c[0] + c[1] * c[1];
+ }
+ }
+}
+
+void aom_noise_tx_free(struct aom_noise_tx_t *noise_tx) {
+ if (!noise_tx) return;
+ aom_free(noise_tx->tx_block);
+ aom_free(noise_tx->temp);
+ aom_free(noise_tx);
+}
+
+double aom_normalized_cross_correlation(const double *a, const double *b,
+ int n) {
+ double c = 0;
+ double a_len = 0;
+ double b_len = 0;
+ for (int i = 0; i < n; ++i) {
+ a_len += a[i] * a[i];
+ b_len += b[i] * b[i];
+ c += a[i] * b[i];
+ }
+ return c / (sqrt(a_len) * sqrt(b_len));
+}
+
+int aom_noise_data_validate(const double *data, int w, int h) {
+ const double kVarianceThreshold = 2;
+ const double kMeanThreshold = 2;
+
+ int x = 0, y = 0;
+ int ret_value = 1;
+ double var = 0, mean = 0;
+ double *mean_x, *mean_y, *var_x, *var_y;
+
+ // Check that noise variance is not increasing in x or y
+ // and that the data is zero mean.
+ mean_x = (double *)aom_calloc(w, sizeof(*mean_x));
+ var_x = (double *)aom_calloc(w, sizeof(*var_x));
+  mean_y = (double *)aom_calloc(h, sizeof(*mean_y));
+ var_y = (double *)aom_calloc(h, sizeof(*var_y));
+ if (!(mean_x && var_x && mean_y && var_y)) {
+ aom_free(mean_x);
+ aom_free(mean_y);
+ aom_free(var_x);
+ aom_free(var_y);
+ return 0;
+ }
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ const double d = data[y * w + x];
+ var_x[x] += d * d;
+ var_y[y] += d * d;
+ mean_x[x] += d;
+ mean_y[y] += d;
+ var += d * d;
+ mean += d;
+ }
+ }
+ mean /= (w * h);
+ var = var / (w * h) - mean * mean;
+
+ for (y = 0; y < h; ++y) {
+ mean_y[y] /= h;
+ var_y[y] = var_y[y] / h - mean_y[y] * mean_y[y];
+ if (fabs(var_y[y] - var) >= kVarianceThreshold) {
+ fprintf(stderr, "Variance distance too large %f %f\n", var_y[y], var);
+ ret_value = 0;
+ break;
+ }
+ if (fabs(mean_y[y] - mean) >= kMeanThreshold) {
+ fprintf(stderr, "Mean distance too large %f %f\n", mean_y[y], mean);
+ ret_value = 0;
+ break;
+ }
+ }
+
+ for (x = 0; x < w; ++x) {
+ mean_x[x] /= w;
+ var_x[x] = var_x[x] / w - mean_x[x] * mean_x[x];
+ if (fabs(var_x[x] - var) >= kVarianceThreshold) {
+ fprintf(stderr, "Variance distance too large %f %f\n", var_x[x], var);
+ ret_value = 0;
+ break;
+ }
+ if (fabs(mean_x[x] - mean) >= kMeanThreshold) {
+ fprintf(stderr, "Mean distance too large %f %f\n", mean_x[x], mean);
+ ret_value = 0;
+ break;
+ }
+ }
+
+ aom_free(mean_x);
+ aom_free(mean_y);
+ aom_free(var_x);
+ aom_free(var_y);
+
+ return ret_value;
+}
diff --git a/third_party/aom/aom_dsp/noise_util.h b/third_party/aom/aom_dsp/noise_util.h
new file mode 100644
index 0000000000..2284a171a4
--- /dev/null
+++ b/third_party/aom/aom_dsp/noise_util.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_NOISE_UTIL_H_
+#define AOM_AOM_DSP_NOISE_UTIL_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif // __cplusplus
+
+// aom_noise_tx_t is an abstraction of a transform that is used for denoising.
+// It is meant to be lightweight and holds the transformed data internally
+// (the user should not manipulate the transformed data directly).
+struct aom_noise_tx_t;
+
+// Allocates and returns an aom_noise_tx_t useful for denoising the given
+// block_size. The resulting aom_noise_tx_t should be free'd with
+// aom_noise_tx_free.
+struct aom_noise_tx_t *aom_noise_tx_malloc(int block_size);
+void aom_noise_tx_free(struct aom_noise_tx_t *aom_noise_tx);
+
+// Transforms the given data and holds the result in the aom_noise_tx's internal
+// buffer. For compatibility with existing SIMD implementations, "data" must
+// be 32-byte aligned.
+void aom_noise_tx_forward(struct aom_noise_tx_t *aom_noise_tx,
+ const float *data);
+
+// Filters aom_noise_tx's internal data using the provided noise power spectral
+// density. The PSD must be at least block_size * block_size and should be
+// populated with a constant or via estimates taken from
+// aom_noise_tx_add_energy.
+void aom_noise_tx_filter(struct aom_noise_tx_t *aom_noise_tx, const float *psd);
+
+// Performs an inverse transform using the internal transform data.
+// For compatibility with existing SIMD implementations, "data" must be 32-byte
+// aligned.
+void aom_noise_tx_inverse(struct aom_noise_tx_t *aom_noise_tx, float *data);
+
+// Aggregates the power of the buffered transform data into the psd buffer.
+void aom_noise_tx_add_energy(const struct aom_noise_tx_t *aom_noise_tx,
+ float *psd);
+
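Putting these calls together, per-block denoising follows roughly this pattern
(illustrative sketch; block and psd are assumed to be caller-provided,
32-byte-aligned float arrays of block_size * block_size entries):

    static void denoise_block_example(float *block, const float *psd,
                                      int block_size) {
      struct aom_noise_tx_t *tx = aom_noise_tx_malloc(block_size);
      if (!tx) return;
      aom_noise_tx_forward(tx, block);  // transform the noisy block
      aom_noise_tx_filter(tx, psd);     // attenuate frequencies using the PSD
      aom_noise_tx_inverse(tx, block);  // write the denoised block back
      aom_noise_tx_free(tx);
    }
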
+// Returns a default value suitable for denoising a transform of the given
+// block_size. The noise "factor" determines the strength of the noise to
+// be removed. A value of about 2.5 can be used for moderate denoising,
+// while a value of 5.0 can be used for a high level of denoising.
+float aom_noise_psd_get_default_value(int block_size, float factor);
+
+// Computes normalized cross correlation of two vectors a and b of length n.
+double aom_normalized_cross_correlation(const double *a, const double *b,
+ int n);
+
+// Validates the correlated noise in the data buffer of size (w, h).
+int aom_noise_data_validate(const double *data, int w, int h);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif // __cplusplus
+
+#endif // AOM_AOM_DSP_NOISE_UTIL_H_
diff --git a/third_party/aom/aom_dsp/odintrin.c b/third_party/aom/aom_dsp/odintrin.c
new file mode 100644
index 0000000000..eb6d8d8771
--- /dev/null
+++ b/third_party/aom/aom_dsp/odintrin.c
@@ -0,0 +1,541 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#include "aom_dsp/odintrin.h"
+
+/*Constants for use with OD_DIVU_SMALL().
+ See \cite{Rob05} for details on computing these constants.
+ @INPROCEEDINGS{Rob05,
+ author="Arch D. Robison",
+ title="{N}-bit Unsigned Division via {N}-bit Multiply-Add",
+ booktitle="Proc. of the 17th IEEE Symposium on Computer Arithmetic
+ (ARITH'05)",
+ pages="131--139",
+ address="Cape Cod, MA",
+ month=Jun,
+ year=2005
+ }*/
+uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2] = {
+ { 0xFFFFFFFF, 0xFFFFFFFF }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xAAAAAAAB, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xCCCCCCCD, 0 }, { 0xAAAAAAAB, 0 },
+ { 0x92492492, 0x92492492 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xE38E38E4, 0 }, { 0xCCCCCCCD, 0 },
+ { 0xBA2E8BA3, 0 }, { 0xAAAAAAAB, 0 },
+ { 0x9D89D89E, 0 }, { 0x92492492, 0x92492492 },
+ { 0x88888889, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xF0F0F0F1, 0 }, { 0xE38E38E4, 0 },
+ { 0xD79435E5, 0xD79435E5 }, { 0xCCCCCCCD, 0 },
+ { 0xC30C30C3, 0xC30C30C3 }, { 0xBA2E8BA3, 0 },
+ { 0xB21642C9, 0 }, { 0xAAAAAAAB, 0 },
+ { 0xA3D70A3E, 0 }, { 0x9D89D89E, 0 },
+ { 0x97B425ED, 0x97B425ED }, { 0x92492492, 0x92492492 },
+ { 0x8D3DCB09, 0 }, { 0x88888889, 0 },
+ { 0x84210842, 0x84210842 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xF83E0F84, 0 }, { 0xF0F0F0F1, 0 },
+ { 0xEA0EA0EA, 0xEA0EA0EA }, { 0xE38E38E4, 0 },
+ { 0xDD67C8A6, 0xDD67C8A6 }, { 0xD79435E5, 0xD79435E5 },
+ { 0xD20D20D2, 0xD20D20D2 }, { 0xCCCCCCCD, 0 },
+ { 0xC7CE0C7D, 0 }, { 0xC30C30C3, 0xC30C30C3 },
+ { 0xBE82FA0C, 0 }, { 0xBA2E8BA3, 0 },
+ { 0xB60B60B6, 0xB60B60B6 }, { 0xB21642C9, 0 },
+ { 0xAE4C415D, 0 }, { 0xAAAAAAAB, 0 },
+ { 0xA72F053A, 0 }, { 0xA3D70A3E, 0 },
+ { 0xA0A0A0A1, 0 }, { 0x9D89D89E, 0 },
+ { 0x9A90E7D9, 0x9A90E7D9 }, { 0x97B425ED, 0x97B425ED },
+ { 0x94F2094F, 0x94F2094F }, { 0x92492492, 0x92492492 },
+ { 0x8FB823EE, 0x8FB823EE }, { 0x8D3DCB09, 0 },
+ { 0x8AD8F2FC, 0 }, { 0x88888889, 0 },
+ { 0x864B8A7E, 0 }, { 0x84210842, 0x84210842 },
+ { 0x82082082, 0x82082082 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFC0FC0FD, 0 }, { 0xF83E0F84, 0 },
+ { 0xF4898D60, 0 }, { 0xF0F0F0F1, 0 },
+ { 0xED7303B6, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA },
+ { 0xE6C2B449, 0 }, { 0xE38E38E4, 0 },
+ { 0xE070381C, 0xE070381C }, { 0xDD67C8A6, 0xDD67C8A6 },
+ { 0xDA740DA8, 0 }, { 0xD79435E5, 0xD79435E5 },
+ { 0xD4C77B04, 0 }, { 0xD20D20D2, 0xD20D20D2 },
+ { 0xCF6474A9, 0 }, { 0xCCCCCCCD, 0 },
+ { 0xCA4587E7, 0 }, { 0xC7CE0C7D, 0 },
+ { 0xC565C87C, 0 }, { 0xC30C30C3, 0xC30C30C3 },
+ { 0xC0C0C0C1, 0 }, { 0xBE82FA0C, 0 },
+ { 0xBC52640C, 0 }, { 0xBA2E8BA3, 0 },
+ { 0xB81702E1, 0 }, { 0xB60B60B6, 0xB60B60B6 },
+ { 0xB40B40B4, 0xB40B40B4 }, { 0xB21642C9, 0 },
+ { 0xB02C0B03, 0 }, { 0xAE4C415D, 0 },
+ { 0xAC769184, 0xAC769184 }, { 0xAAAAAAAB, 0 },
+ { 0xA8E83F57, 0xA8E83F57 }, { 0xA72F053A, 0 },
+ { 0xA57EB503, 0 }, { 0xA3D70A3E, 0 },
+ { 0xA237C32B, 0xA237C32B }, { 0xA0A0A0A1, 0 },
+ { 0x9F1165E7, 0x9F1165E7 }, { 0x9D89D89E, 0 },
+ { 0x9C09C09C, 0x9C09C09C }, { 0x9A90E7D9, 0x9A90E7D9 },
+ { 0x991F1A51, 0x991F1A51 }, { 0x97B425ED, 0x97B425ED },
+ { 0x964FDA6C, 0x964FDA6C }, { 0x94F2094F, 0x94F2094F },
+ { 0x939A85C4, 0x939A85C4 }, { 0x92492492, 0x92492492 },
+ { 0x90FDBC09, 0x90FDBC09 }, { 0x8FB823EE, 0x8FB823EE },
+ { 0x8E78356D, 0x8E78356D }, { 0x8D3DCB09, 0 },
+ { 0x8C08C08C, 0x8C08C08C }, { 0x8AD8F2FC, 0 },
+ { 0x89AE408A, 0 }, { 0x88888889, 0 },
+ { 0x8767AB5F, 0x8767AB5F }, { 0x864B8A7E, 0 },
+ { 0x85340853, 0x85340853 }, { 0x84210842, 0x84210842 },
+ { 0x83126E98, 0 }, { 0x82082082, 0x82082082 },
+ { 0x81020408, 0x81020408 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFE03F810, 0 }, { 0xFC0FC0FD, 0 },
+ { 0xFA232CF3, 0 }, { 0xF83E0F84, 0 },
+ { 0xF6603D99, 0 }, { 0xF4898D60, 0 },
+ { 0xF2B9D649, 0 }, { 0xF0F0F0F1, 0 },
+ { 0xEF2EB720, 0 }, { 0xED7303B6, 0 },
+ { 0xEBBDB2A6, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA },
+ { 0xE865AC7C, 0 }, { 0xE6C2B449, 0 },
+ { 0xE525982B, 0 }, { 0xE38E38E4, 0 },
+ { 0xE1FC780F, 0 }, { 0xE070381C, 0xE070381C },
+ { 0xDEE95C4D, 0 }, { 0xDD67C8A6, 0xDD67C8A6 },
+ { 0xDBEB61EF, 0 }, { 0xDA740DA8, 0 },
+ { 0xD901B204, 0 }, { 0xD79435E5, 0xD79435E5 },
+ { 0xD62B80D7, 0 }, { 0xD4C77B04, 0 },
+ { 0xD3680D37, 0 }, { 0xD20D20D2, 0xD20D20D2 },
+ { 0xD0B69FCC, 0 }, { 0xCF6474A9, 0 },
+ { 0xCE168A77, 0xCE168A77 }, { 0xCCCCCCCD, 0 },
+ { 0xCB8727C1, 0 }, { 0xCA4587E7, 0 },
+ { 0xC907DA4F, 0 }, { 0xC7CE0C7D, 0 },
+ { 0xC6980C6A, 0 }, { 0xC565C87C, 0 },
+ { 0xC4372F86, 0 }, { 0xC30C30C3, 0xC30C30C3 },
+ { 0xC1E4BBD6, 0 }, { 0xC0C0C0C1, 0 },
+ { 0xBFA02FE8, 0xBFA02FE8 }, { 0xBE82FA0C, 0 },
+ { 0xBD691047, 0xBD691047 }, { 0xBC52640C, 0 },
+ { 0xBB3EE722, 0 }, { 0xBA2E8BA3, 0 },
+ { 0xB92143FA, 0xB92143FA }, { 0xB81702E1, 0 },
+ { 0xB70FBB5A, 0xB70FBB5A }, { 0xB60B60B6, 0xB60B60B6 },
+ { 0xB509E68B, 0 }, { 0xB40B40B4, 0xB40B40B4 },
+ { 0xB30F6353, 0 }, { 0xB21642C9, 0 },
+ { 0xB11FD3B8, 0xB11FD3B8 }, { 0xB02C0B03, 0 },
+ { 0xAF3ADDC7, 0 }, { 0xAE4C415D, 0 },
+ { 0xAD602B58, 0xAD602B58 }, { 0xAC769184, 0xAC769184 },
+ { 0xAB8F69E3, 0 }, { 0xAAAAAAAB, 0 },
+ { 0xA9C84A48, 0 }, { 0xA8E83F57, 0xA8E83F57 },
+ { 0xA80A80A8, 0xA80A80A8 }, { 0xA72F053A, 0 },
+ { 0xA655C439, 0xA655C439 }, { 0xA57EB503, 0 },
+ { 0xA4A9CF1E, 0 }, { 0xA3D70A3E, 0 },
+ { 0xA3065E40, 0 }, { 0xA237C32B, 0xA237C32B },
+ { 0xA16B312F, 0 }, { 0xA0A0A0A1, 0 },
+ { 0x9FD809FE, 0 }, { 0x9F1165E7, 0x9F1165E7 },
+ { 0x9E4CAD24, 0 }, { 0x9D89D89E, 0 },
+ { 0x9CC8E161, 0 }, { 0x9C09C09C, 0x9C09C09C },
+ { 0x9B4C6F9F, 0 }, { 0x9A90E7D9, 0x9A90E7D9 },
+ { 0x99D722DB, 0 }, { 0x991F1A51, 0x991F1A51 },
+ { 0x9868C80A, 0 }, { 0x97B425ED, 0x97B425ED },
+ { 0x97012E02, 0x97012E02 }, { 0x964FDA6C, 0x964FDA6C },
+ { 0x95A02568, 0x95A02568 }, { 0x94F2094F, 0x94F2094F },
+ { 0x94458094, 0x94458094 }, { 0x939A85C4, 0x939A85C4 },
+ { 0x92F11384, 0x92F11384 }, { 0x92492492, 0x92492492 },
+ { 0x91A2B3C5, 0 }, { 0x90FDBC09, 0x90FDBC09 },
+ { 0x905A3863, 0x905A3863 }, { 0x8FB823EE, 0x8FB823EE },
+ { 0x8F1779DA, 0 }, { 0x8E78356D, 0x8E78356D },
+ { 0x8DDA5202, 0x8DDA5202 }, { 0x8D3DCB09, 0 },
+ { 0x8CA29C04, 0x8CA29C04 }, { 0x8C08C08C, 0x8C08C08C },
+ { 0x8B70344A, 0x8B70344A }, { 0x8AD8F2FC, 0 },
+ { 0x8A42F870, 0x8A42F870 }, { 0x89AE408A, 0 },
+ { 0x891AC73B, 0 }, { 0x88888889, 0 },
+ { 0x87F78088, 0 }, { 0x8767AB5F, 0x8767AB5F },
+ { 0x86D90545, 0 }, { 0x864B8A7E, 0 },
+ { 0x85BF3761, 0x85BF3761 }, { 0x85340853, 0x85340853 },
+ { 0x84A9F9C8, 0x84A9F9C8 }, { 0x84210842, 0x84210842 },
+ { 0x83993052, 0x83993052 }, { 0x83126E98, 0 },
+ { 0x828CBFBF, 0 }, { 0x82082082, 0x82082082 },
+ { 0x81848DA9, 0 }, { 0x81020408, 0x81020408 },
+ { 0x80808081, 0 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFF00FF01, 0 }, { 0xFE03F810, 0 },
+ { 0xFD08E551, 0 }, { 0xFC0FC0FD, 0 },
+ { 0xFB188566, 0 }, { 0xFA232CF3, 0 },
+ { 0xF92FB222, 0 }, { 0xF83E0F84, 0 },
+ { 0xF74E3FC3, 0 }, { 0xF6603D99, 0 },
+ { 0xF57403D6, 0 }, { 0xF4898D60, 0 },
+ { 0xF3A0D52D, 0 }, { 0xF2B9D649, 0 },
+ { 0xF1D48BCF, 0 }, { 0xF0F0F0F1, 0 },
+ { 0xF00F00F0, 0xF00F00F0 }, { 0xEF2EB720, 0 },
+ { 0xEE500EE5, 0xEE500EE5 }, { 0xED7303B6, 0 },
+ { 0xEC979119, 0 }, { 0xEBBDB2A6, 0 },
+ { 0xEAE56404, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA },
+ { 0xE9396520, 0 }, { 0xE865AC7C, 0 },
+ { 0xE79372E3, 0 }, { 0xE6C2B449, 0 },
+ { 0xE5F36CB0, 0xE5F36CB0 }, { 0xE525982B, 0 },
+ { 0xE45932D8, 0 }, { 0xE38E38E4, 0 },
+ { 0xE2C4A689, 0 }, { 0xE1FC780F, 0 },
+ { 0xE135A9CA, 0 }, { 0xE070381C, 0xE070381C },
+ { 0xDFAC1F75, 0 }, { 0xDEE95C4D, 0 },
+ { 0xDE27EB2D, 0 }, { 0xDD67C8A6, 0xDD67C8A6 },
+ { 0xDCA8F159, 0 }, { 0xDBEB61EF, 0 },
+ { 0xDB2F171E, 0 }, { 0xDA740DA8, 0 },
+ { 0xD9BA4257, 0 }, { 0xD901B204, 0 },
+ { 0xD84A598F, 0 }, { 0xD79435E5, 0xD79435E5 },
+ { 0xD6DF43FD, 0 }, { 0xD62B80D7, 0 },
+ { 0xD578E97D, 0 }, { 0xD4C77B04, 0 },
+ { 0xD417328A, 0 }, { 0xD3680D37, 0 },
+ { 0xD2BA083C, 0 }, { 0xD20D20D2, 0xD20D20D2 },
+ { 0xD161543E, 0xD161543E }, { 0xD0B69FCC, 0 },
+ { 0xD00D00D0, 0xD00D00D0 }, { 0xCF6474A9, 0 },
+ { 0xCEBCF8BC, 0 }, { 0xCE168A77, 0xCE168A77 },
+ { 0xCD712753, 0 }, { 0xCCCCCCCD, 0 },
+ { 0xCC29786D, 0 }, { 0xCB8727C1, 0 },
+ { 0xCAE5D85F, 0xCAE5D85F }, { 0xCA4587E7, 0 },
+ { 0xC9A633FD, 0 }, { 0xC907DA4F, 0 },
+ { 0xC86A7890, 0xC86A7890 }, { 0xC7CE0C7D, 0 },
+ { 0xC73293D8, 0 }, { 0xC6980C6A, 0 },
+ { 0xC5FE7403, 0xC5FE7403 }, { 0xC565C87C, 0 },
+ { 0xC4CE07B0, 0xC4CE07B0 }, { 0xC4372F86, 0 },
+ { 0xC3A13DE6, 0xC3A13DE6 }, { 0xC30C30C3, 0xC30C30C3 },
+ { 0xC2780614, 0 }, { 0xC1E4BBD6, 0 },
+ { 0xC152500C, 0xC152500C }, { 0xC0C0C0C1, 0 },
+ { 0xC0300C03, 0xC0300C03 }, { 0xBFA02FE8, 0xBFA02FE8 },
+ { 0xBF112A8B, 0 }, { 0xBE82FA0C, 0 },
+ { 0xBDF59C92, 0 }, { 0xBD691047, 0xBD691047 },
+ { 0xBCDD535E, 0 }, { 0xBC52640C, 0 },
+ { 0xBBC8408D, 0 }, { 0xBB3EE722, 0 },
+ { 0xBAB65610, 0xBAB65610 }, { 0xBA2E8BA3, 0 },
+ { 0xB9A7862A, 0xB9A7862A }, { 0xB92143FA, 0xB92143FA },
+ { 0xB89BC36D, 0 }, { 0xB81702E1, 0 },
+ { 0xB79300B8, 0 }, { 0xB70FBB5A, 0xB70FBB5A },
+ { 0xB68D3134, 0xB68D3134 }, { 0xB60B60B6, 0xB60B60B6 },
+ { 0xB58A4855, 0xB58A4855 }, { 0xB509E68B, 0 },
+ { 0xB48A39D4, 0xB48A39D4 }, { 0xB40B40B4, 0xB40B40B4 },
+ { 0xB38CF9B0, 0xB38CF9B0 }, { 0xB30F6353, 0 },
+ { 0xB2927C2A, 0 }, { 0xB21642C9, 0 },
+ { 0xB19AB5C5, 0 }, { 0xB11FD3B8, 0xB11FD3B8 },
+ { 0xB0A59B42, 0 }, { 0xB02C0B03, 0 },
+ { 0xAFB321A1, 0xAFB321A1 }, { 0xAF3ADDC7, 0 },
+ { 0xAEC33E20, 0 }, { 0xAE4C415D, 0 },
+ { 0xADD5E632, 0xADD5E632 }, { 0xAD602B58, 0xAD602B58 },
+ { 0xACEB0F89, 0xACEB0F89 }, { 0xAC769184, 0xAC769184 },
+ { 0xAC02B00B, 0 }, { 0xAB8F69E3, 0 },
+ { 0xAB1CBDD4, 0 }, { 0xAAAAAAAB, 0 },
+ { 0xAA392F36, 0 }, { 0xA9C84A48, 0 },
+ { 0xA957FAB5, 0xA957FAB5 }, { 0xA8E83F57, 0xA8E83F57 },
+ { 0xA8791709, 0 }, { 0xA80A80A8, 0xA80A80A8 },
+ { 0xA79C7B17, 0 }, { 0xA72F053A, 0 },
+ { 0xA6C21DF7, 0 }, { 0xA655C439, 0xA655C439 },
+ { 0xA5E9F6ED, 0xA5E9F6ED }, { 0xA57EB503, 0 },
+ { 0xA513FD6C, 0 }, { 0xA4A9CF1E, 0 },
+ { 0xA4402910, 0xA4402910 }, { 0xA3D70A3E, 0 },
+ { 0xA36E71A3, 0 }, { 0xA3065E40, 0 },
+ { 0xA29ECF16, 0xA29ECF16 }, { 0xA237C32B, 0xA237C32B },
+ { 0xA1D13986, 0 }, { 0xA16B312F, 0 },
+ { 0xA105A933, 0 }, { 0xA0A0A0A1, 0 },
+ { 0xA03C1689, 0 }, { 0x9FD809FE, 0 },
+ { 0x9F747A15, 0x9F747A15 }, { 0x9F1165E7, 0x9F1165E7 },
+ { 0x9EAECC8D, 0x9EAECC8D }, { 0x9E4CAD24, 0 },
+ { 0x9DEB06C9, 0x9DEB06C9 }, { 0x9D89D89E, 0 },
+ { 0x9D2921C4, 0 }, { 0x9CC8E161, 0 },
+ { 0x9C69169B, 0x9C69169B }, { 0x9C09C09C, 0x9C09C09C },
+ { 0x9BAADE8E, 0x9BAADE8E }, { 0x9B4C6F9F, 0 },
+ { 0x9AEE72FD, 0 }, { 0x9A90E7D9, 0x9A90E7D9 },
+ { 0x9A33CD67, 0x9A33CD67 }, { 0x99D722DB, 0 },
+ { 0x997AE76B, 0x997AE76B }, { 0x991F1A51, 0x991F1A51 },
+ { 0x98C3BAC7, 0x98C3BAC7 }, { 0x9868C80A, 0 },
+ { 0x980E4156, 0x980E4156 }, { 0x97B425ED, 0x97B425ED },
+ { 0x975A7510, 0 }, { 0x97012E02, 0x97012E02 },
+ { 0x96A8500A, 0 }, { 0x964FDA6C, 0x964FDA6C },
+ { 0x95F7CC73, 0 }, { 0x95A02568, 0x95A02568 },
+ { 0x9548E498, 0 }, { 0x94F2094F, 0x94F2094F },
+ { 0x949B92DE, 0 }, { 0x94458094, 0x94458094 },
+ { 0x93EFD1C5, 0x93EFD1C5 }, { 0x939A85C4, 0x939A85C4 },
+ { 0x93459BE7, 0 }, { 0x92F11384, 0x92F11384 },
+ { 0x929CEBF5, 0 }, { 0x92492492, 0x92492492 },
+ { 0x91F5BCB9, 0 }, { 0x91A2B3C5, 0 },
+ { 0x91500915, 0x91500915 }, { 0x90FDBC09, 0x90FDBC09 },
+ { 0x90ABCC02, 0x90ABCC02 }, { 0x905A3863, 0x905A3863 },
+ { 0x90090090, 0x90090090 }, { 0x8FB823EE, 0x8FB823EE },
+ { 0x8F67A1E4, 0 }, { 0x8F1779DA, 0 },
+ { 0x8EC7AB3A, 0 }, { 0x8E78356D, 0x8E78356D },
+ { 0x8E2917E1, 0 }, { 0x8DDA5202, 0x8DDA5202 },
+ { 0x8D8BE340, 0 }, { 0x8D3DCB09, 0 },
+ { 0x8CF008CF, 0x8CF008CF }, { 0x8CA29C04, 0x8CA29C04 },
+ { 0x8C55841D, 0 }, { 0x8C08C08C, 0x8C08C08C },
+ { 0x8BBC50C9, 0 }, { 0x8B70344A, 0x8B70344A },
+ { 0x8B246A88, 0 }, { 0x8AD8F2FC, 0 },
+ { 0x8A8DCD20, 0 }, { 0x8A42F870, 0x8A42F870 },
+ { 0x89F8746A, 0 }, { 0x89AE408A, 0 },
+ { 0x89645C4F, 0x89645C4F }, { 0x891AC73B, 0 },
+ { 0x88D180CD, 0x88D180CD }, { 0x88888889, 0 },
+ { 0x883FDDF0, 0x883FDDF0 }, { 0x87F78088, 0 },
+ { 0x87AF6FD6, 0 }, { 0x8767AB5F, 0x8767AB5F },
+ { 0x872032AC, 0x872032AC }, { 0x86D90545, 0 },
+ { 0x869222B2, 0 }, { 0x864B8A7E, 0 },
+ { 0x86053C34, 0x86053C34 }, { 0x85BF3761, 0x85BF3761 },
+ { 0x85797B91, 0x85797B91 }, { 0x85340853, 0x85340853 },
+ { 0x84EEDD36, 0 }, { 0x84A9F9C8, 0x84A9F9C8 },
+ { 0x84655D9C, 0 }, { 0x84210842, 0x84210842 },
+ { 0x83DCF94E, 0 }, { 0x83993052, 0x83993052 },
+ { 0x8355ACE4, 0 }, { 0x83126E98, 0 },
+ { 0x82CF7504, 0 }, { 0x828CBFBF, 0 },
+ { 0x824A4E61, 0 }, { 0x82082082, 0x82082082 },
+ { 0x81C635BC, 0x81C635BC }, { 0x81848DA9, 0 },
+ { 0x814327E4, 0 }, { 0x81020408, 0x81020408 },
+ { 0x80C121B3, 0 }, { 0x80808081, 0 },
+ { 0x80402010, 0x80402010 }, { 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFF803FE1, 0 }, { 0xFF00FF01, 0 },
+ { 0xFE823CA6, 0 }, { 0xFE03F810, 0 },
+ { 0xFD863087, 0 }, { 0xFD08E551, 0 },
+ { 0xFC8C15B5, 0 }, { 0xFC0FC0FD, 0 },
+ { 0xFB93E673, 0 }, { 0xFB188566, 0 },
+ { 0xFA9D9D20, 0 }, { 0xFA232CF3, 0 },
+ { 0xF9A9342D, 0 }, { 0xF92FB222, 0 },
+ { 0xF8B6A622, 0xF8B6A622 }, { 0xF83E0F84, 0 },
+ { 0xF7C5ED9D, 0 }, { 0xF74E3FC3, 0 },
+ { 0xF6D7054E, 0 }, { 0xF6603D99, 0 },
+ { 0xF5E9E7FD, 0 }, { 0xF57403D6, 0 },
+ { 0xF4FE9083, 0 }, { 0xF4898D60, 0 },
+ { 0xF414F9CE, 0 }, { 0xF3A0D52D, 0 },
+ { 0xF32D1EE0, 0 }, { 0xF2B9D649, 0 },
+ { 0xF246FACC, 0 }, { 0xF1D48BCF, 0 },
+ { 0xF16288B9, 0 }, { 0xF0F0F0F1, 0 },
+ { 0xF07FC3E0, 0xF07FC3E0 }, { 0xF00F00F0, 0xF00F00F0 },
+ { 0xEF9EA78C, 0 }, { 0xEF2EB720, 0 },
+ { 0xEEBF2F19, 0 }, { 0xEE500EE5, 0xEE500EE5 },
+ { 0xEDE155F4, 0 }, { 0xED7303B6, 0 },
+ { 0xED05179C, 0xED05179C }, { 0xEC979119, 0 },
+ { 0xEC2A6FA0, 0xEC2A6FA0 }, { 0xEBBDB2A6, 0 },
+ { 0xEB5159A0, 0 }, { 0xEAE56404, 0 },
+ { 0xEA79D14A, 0 }, { 0xEA0EA0EA, 0xEA0EA0EA },
+ { 0xE9A3D25E, 0xE9A3D25E }, { 0xE9396520, 0 },
+ { 0xE8CF58AB, 0 }, { 0xE865AC7C, 0 },
+ { 0xE7FC600F, 0 }, { 0xE79372E3, 0 },
+ { 0xE72AE476, 0 }, { 0xE6C2B449, 0 },
+ { 0xE65AE1DC, 0 }, { 0xE5F36CB0, 0xE5F36CB0 },
+ { 0xE58C544A, 0 }, { 0xE525982B, 0 },
+ { 0xE4BF37D9, 0 }, { 0xE45932D8, 0 },
+ { 0xE3F388AF, 0 }, { 0xE38E38E4, 0 },
+ { 0xE32942FF, 0 }, { 0xE2C4A689, 0 },
+ { 0xE260630B, 0 }, { 0xE1FC780F, 0 },
+ { 0xE198E520, 0 }, { 0xE135A9CA, 0 },
+ { 0xE0D2C59A, 0 }, { 0xE070381C, 0xE070381C },
+ { 0xE00E00E0, 0xE00E00E0 }, { 0xDFAC1F75, 0 },
+ { 0xDF4A9369, 0 }, { 0xDEE95C4D, 0 },
+ { 0xDE8879B3, 0 }, { 0xDE27EB2D, 0 },
+ { 0xDDC7B04D, 0 }, { 0xDD67C8A6, 0xDD67C8A6 },
+ { 0xDD0833CE, 0 }, { 0xDCA8F159, 0 },
+ { 0xDC4A00DD, 0 }, { 0xDBEB61EF, 0 },
+ { 0xDB8D1428, 0 }, { 0xDB2F171E, 0 },
+ { 0xDAD16A6B, 0 }, { 0xDA740DA8, 0 },
+ { 0xDA17006D, 0xDA17006D }, { 0xD9BA4257, 0 },
+ { 0xD95DD300, 0 }, { 0xD901B204, 0 },
+ { 0xD8A5DEFF, 0 }, { 0xD84A598F, 0 },
+ { 0xD7EF2152, 0 }, { 0xD79435E5, 0xD79435E5 },
+ { 0xD73996E9, 0 }, { 0xD6DF43FD, 0 },
+ { 0xD6853CC1, 0 }, { 0xD62B80D7, 0 },
+ { 0xD5D20FDF, 0 }, { 0xD578E97D, 0 },
+ { 0xD5200D52, 0xD5200D52 }, { 0xD4C77B04, 0 },
+ { 0xD46F3235, 0 }, { 0xD417328A, 0 },
+ { 0xD3BF7BA9, 0 }, { 0xD3680D37, 0 },
+ { 0xD310E6DB, 0 }, { 0xD2BA083C, 0 },
+ { 0xD2637101, 0 }, { 0xD20D20D2, 0xD20D20D2 },
+ { 0xD1B71759, 0 }, { 0xD161543E, 0xD161543E },
+ { 0xD10BD72C, 0 }, { 0xD0B69FCC, 0 },
+ { 0xD061ADCA, 0 }, { 0xD00D00D0, 0xD00D00D0 },
+ { 0xCFB8988C, 0 }, { 0xCF6474A9, 0 },
+ { 0xCF1094D4, 0 }, { 0xCEBCF8BC, 0 },
+ { 0xCE69A00D, 0 }, { 0xCE168A77, 0xCE168A77 },
+ { 0xCDC3B7A9, 0xCDC3B7A9 }, { 0xCD712753, 0 },
+ { 0xCD1ED924, 0 }, { 0xCCCCCCCD, 0 },
+ { 0xCC7B0200, 0 }, { 0xCC29786D, 0 },
+ { 0xCBD82FC7, 0 }, { 0xCB8727C1, 0 },
+ { 0xCB36600D, 0 }, { 0xCAE5D85F, 0xCAE5D85F },
+ { 0xCA95906C, 0 }, { 0xCA4587E7, 0 },
+ { 0xC9F5BE86, 0 }, { 0xC9A633FD, 0 },
+ { 0xC956E803, 0xC956E803 }, { 0xC907DA4F, 0 },
+ { 0xC8B90A96, 0 }, { 0xC86A7890, 0xC86A7890 },
+ { 0xC81C23F5, 0xC81C23F5 }, { 0xC7CE0C7D, 0 },
+ { 0xC78031E0, 0xC78031E0 }, { 0xC73293D8, 0 },
+ { 0xC6E5321D, 0 }, { 0xC6980C6A, 0 },
+ { 0xC64B2278, 0xC64B2278 }, { 0xC5FE7403, 0xC5FE7403 },
+ { 0xC5B200C6, 0 }, { 0xC565C87C, 0 },
+ { 0xC519CAE0, 0xC519CAE0 }, { 0xC4CE07B0, 0xC4CE07B0 },
+ { 0xC4827EA8, 0xC4827EA8 }, { 0xC4372F86, 0 },
+ { 0xC3EC1A06, 0 }, { 0xC3A13DE6, 0xC3A13DE6 },
+ { 0xC3569AE6, 0 }, { 0xC30C30C3, 0xC30C30C3 },
+ { 0xC2C1FF3E, 0 }, { 0xC2780614, 0 },
+ { 0xC22E4507, 0 }, { 0xC1E4BBD6, 0 },
+ { 0xC19B6A42, 0 }, { 0xC152500C, 0xC152500C },
+ { 0xC1096CF6, 0 }, { 0xC0C0C0C1, 0 },
+ { 0xC0784B2F, 0 }, { 0xC0300C03, 0xC0300C03 },
+ { 0xBFE80300, 0 }, { 0xBFA02FE8, 0xBFA02FE8 },
+ { 0xBF589280, 0 }, { 0xBF112A8B, 0 },
+ { 0xBEC9F7CE, 0 }, { 0xBE82FA0C, 0 },
+ { 0xBE3C310C, 0 }, { 0xBDF59C92, 0 },
+ { 0xBDAF3C64, 0 }, { 0xBD691047, 0xBD691047 },
+ { 0xBD231803, 0 }, { 0xBCDD535E, 0 },
+ { 0xBC97C21E, 0xBC97C21E }, { 0xBC52640C, 0 },
+ { 0xBC0D38EE, 0xBC0D38EE }, { 0xBBC8408D, 0 },
+ { 0xBB837AB1, 0 }, { 0xBB3EE722, 0 },
+ { 0xBAFA85A9, 0xBAFA85A9 }, { 0xBAB65610, 0xBAB65610 },
+ { 0xBA725820, 0xBA725820 }, { 0xBA2E8BA3, 0 },
+ { 0xB9EAF063, 0 }, { 0xB9A7862A, 0xB9A7862A },
+ { 0xB9644CC4, 0 }, { 0xB92143FA, 0xB92143FA },
+ { 0xB8DE6B9A, 0 }, { 0xB89BC36D, 0 },
+ { 0xB8594B41, 0 }, { 0xB81702E1, 0 },
+ { 0xB7D4EA19, 0xB7D4EA19 }, { 0xB79300B8, 0 },
+ { 0xB7514689, 0 }, { 0xB70FBB5A, 0xB70FBB5A },
+ { 0xB6CE5EF9, 0xB6CE5EF9 }, { 0xB68D3134, 0xB68D3134 },
+ { 0xB64C31D9, 0 }, { 0xB60B60B6, 0xB60B60B6 },
+ { 0xB5CABD9B, 0 }, { 0xB58A4855, 0xB58A4855 },
+ { 0xB54A00B5, 0xB54A00B5 }, { 0xB509E68B, 0 },
+ { 0xB4C9F9A5, 0 }, { 0xB48A39D4, 0xB48A39D4 },
+ { 0xB44AA6E9, 0xB44AA6E9 }, { 0xB40B40B4, 0xB40B40B4 },
+ { 0xB3CC0706, 0 }, { 0xB38CF9B0, 0xB38CF9B0 },
+ { 0xB34E1884, 0 }, { 0xB30F6353, 0 },
+ { 0xB2D0D9EF, 0 }, { 0xB2927C2A, 0 },
+ { 0xB25449D7, 0 }, { 0xB21642C9, 0 },
+ { 0xB1D866D1, 0xB1D866D1 }, { 0xB19AB5C5, 0 },
+ { 0xB15D2F76, 0 }, { 0xB11FD3B8, 0xB11FD3B8 },
+ { 0xB0E2A260, 0xB0E2A260 }, { 0xB0A59B42, 0 },
+ { 0xB068BE31, 0 }, { 0xB02C0B03, 0 },
+ { 0xAFEF818C, 0 }, { 0xAFB321A1, 0xAFB321A1 },
+ { 0xAF76EB19, 0 }, { 0xAF3ADDC7, 0 },
+ { 0xAEFEF982, 0 }, { 0xAEC33E20, 0 },
+ { 0xAE87AB76, 0xAE87AB76 }, { 0xAE4C415D, 0 },
+ { 0xAE10FFA9, 0 }, { 0xADD5E632, 0xADD5E632 },
+ { 0xAD9AF4D0, 0 }, { 0xAD602B58, 0xAD602B58 },
+ { 0xAD2589A4, 0 }, { 0xACEB0F89, 0xACEB0F89 },
+ { 0xACB0BCE1, 0xACB0BCE1 }, { 0xAC769184, 0xAC769184 },
+ { 0xAC3C8D4A, 0 }, { 0xAC02B00B, 0 },
+ { 0xABC8F9A0, 0xABC8F9A0 }, { 0xAB8F69E3, 0 },
+ { 0xAB5600AC, 0 }, { 0xAB1CBDD4, 0 },
+ { 0xAAE3A136, 0 }, { 0xAAAAAAAB, 0 },
+ { 0xAA71DA0D, 0 }, { 0xAA392F36, 0 },
+ { 0xAA00AA01, 0 }, { 0xA9C84A48, 0 },
+ { 0xA9900FE6, 0 }, { 0xA957FAB5, 0xA957FAB5 },
+ { 0xA9200A92, 0xA9200A92 }, { 0xA8E83F57, 0xA8E83F57 },
+ { 0xA8B098E0, 0xA8B098E0 }, { 0xA8791709, 0 },
+ { 0xA841B9AD, 0 }, { 0xA80A80A8, 0xA80A80A8 },
+ { 0xA7D36BD8, 0 }, { 0xA79C7B17, 0 },
+ { 0xA765AE44, 0 }, { 0xA72F053A, 0 },
+ { 0xA6F87FD6, 0xA6F87FD6 }, { 0xA6C21DF7, 0 },
+ { 0xA68BDF79, 0 }, { 0xA655C439, 0xA655C439 },
+ { 0xA61FCC16, 0xA61FCC16 }, { 0xA5E9F6ED, 0xA5E9F6ED },
+ { 0xA5B4449D, 0 }, { 0xA57EB503, 0 },
+ { 0xA54947FE, 0 }, { 0xA513FD6C, 0 },
+ { 0xA4DED52C, 0xA4DED52C }, { 0xA4A9CF1E, 0 },
+ { 0xA474EB1F, 0xA474EB1F }, { 0xA4402910, 0xA4402910 },
+ { 0xA40B88D0, 0 }, { 0xA3D70A3E, 0 },
+ { 0xA3A2AD39, 0xA3A2AD39 }, { 0xA36E71A3, 0 },
+ { 0xA33A575A, 0xA33A575A }, { 0xA3065E40, 0 },
+ { 0xA2D28634, 0 }, { 0xA29ECF16, 0xA29ECF16 },
+ { 0xA26B38C9, 0 }, { 0xA237C32B, 0xA237C32B },
+ { 0xA2046E1F, 0xA2046E1F }, { 0xA1D13986, 0 },
+ { 0xA19E2540, 0 }, { 0xA16B312F, 0 },
+ { 0xA1385D35, 0 }, { 0xA105A933, 0 },
+ { 0xA0D3150C, 0 }, { 0xA0A0A0A1, 0 },
+ { 0xA06E4BD4, 0xA06E4BD4 }, { 0xA03C1689, 0 },
+ { 0xA00A00A0, 0xA00A00A0 }, { 0x9FD809FE, 0 },
+ { 0x9FA63284, 0 }, { 0x9F747A15, 0x9F747A15 },
+ { 0x9F42E095, 0x9F42E095 }, { 0x9F1165E7, 0x9F1165E7 },
+ { 0x9EE009EE, 0x9EE009EE }, { 0x9EAECC8D, 0x9EAECC8D },
+ { 0x9E7DADA9, 0 }, { 0x9E4CAD24, 0 },
+ { 0x9E1BCAE3, 0 }, { 0x9DEB06C9, 0x9DEB06C9 },
+ { 0x9DBA60BB, 0x9DBA60BB }, { 0x9D89D89E, 0 },
+ { 0x9D596E54, 0x9D596E54 }, { 0x9D2921C4, 0 },
+ { 0x9CF8F2D1, 0x9CF8F2D1 }, { 0x9CC8E161, 0 },
+ { 0x9C98ED58, 0 }, { 0x9C69169B, 0x9C69169B },
+ { 0x9C395D10, 0x9C395D10 }, { 0x9C09C09C, 0x9C09C09C },
+ { 0x9BDA4124, 0x9BDA4124 }, { 0x9BAADE8E, 0x9BAADE8E },
+ { 0x9B7B98C0, 0 }, { 0x9B4C6F9F, 0 },
+ { 0x9B1D6311, 0x9B1D6311 }, { 0x9AEE72FD, 0 },
+ { 0x9ABF9F48, 0x9ABF9F48 }, { 0x9A90E7D9, 0x9A90E7D9 },
+ { 0x9A624C97, 0 }, { 0x9A33CD67, 0x9A33CD67 },
+ { 0x9A056A31, 0 }, { 0x99D722DB, 0 },
+ { 0x99A8F74C, 0 }, { 0x997AE76B, 0x997AE76B },
+ { 0x994CF320, 0x994CF320 }, { 0x991F1A51, 0x991F1A51 },
+ { 0x98F15CE7, 0 }, { 0x98C3BAC7, 0x98C3BAC7 },
+ { 0x989633DB, 0x989633DB }, { 0x9868C80A, 0 },
+ { 0x983B773B, 0 }, { 0x980E4156, 0x980E4156 },
+ { 0x97E12644, 0x97E12644 }, { 0x97B425ED, 0x97B425ED },
+ { 0x97874039, 0 }, { 0x975A7510, 0 },
+ { 0x972DC45B, 0 }, { 0x97012E02, 0x97012E02 },
+ { 0x96D4B1EF, 0 }, { 0x96A8500A, 0 },
+ { 0x967C083B, 0 }, { 0x964FDA6C, 0x964FDA6C },
+ { 0x9623C686, 0x9623C686 }, { 0x95F7CC73, 0 },
+ { 0x95CBEC1B, 0 }, { 0x95A02568, 0x95A02568 },
+ { 0x95747844, 0 }, { 0x9548E498, 0 },
+ { 0x951D6A4E, 0 }, { 0x94F2094F, 0x94F2094F },
+ { 0x94C6C187, 0 }, { 0x949B92DE, 0 },
+ { 0x94707D3F, 0 }, { 0x94458094, 0x94458094 },
+ { 0x941A9CC8, 0x941A9CC8 }, { 0x93EFD1C5, 0x93EFD1C5 },
+ { 0x93C51F76, 0 }, { 0x939A85C4, 0x939A85C4 },
+ { 0x9370049C, 0 }, { 0x93459BE7, 0 },
+ { 0x931B4B91, 0 }, { 0x92F11384, 0x92F11384 },
+ { 0x92C6F3AC, 0x92C6F3AC }, { 0x929CEBF5, 0 },
+ { 0x9272FC48, 0x9272FC48 }, { 0x92492492, 0x92492492 },
+ { 0x921F64BF, 0 }, { 0x91F5BCB9, 0 },
+ { 0x91CC2C6C, 0x91CC2C6C }, { 0x91A2B3C5, 0 },
+ { 0x917952AF, 0 }, { 0x91500915, 0x91500915 },
+ { 0x9126D6E5, 0 }, { 0x90FDBC09, 0x90FDBC09 },
+ { 0x90D4B86F, 0 }, { 0x90ABCC02, 0x90ABCC02 },
+ { 0x9082F6B0, 0 }, { 0x905A3863, 0x905A3863 },
+ { 0x9031910A, 0 }, { 0x90090090, 0x90090090 },
+ { 0x8FE086E3, 0 }, { 0x8FB823EE, 0x8FB823EE },
+ { 0x8F8FD7A0, 0 }, { 0x8F67A1E4, 0 },
+ { 0x8F3F82A8, 0x8F3F82A8 }, { 0x8F1779DA, 0 },
+ { 0x8EEF8766, 0 }, { 0x8EC7AB3A, 0 },
+ { 0x8E9FE542, 0x8E9FE542 }, { 0x8E78356D, 0x8E78356D },
+ { 0x8E509BA8, 0x8E509BA8 }, { 0x8E2917E1, 0 },
+ { 0x8E01AA05, 0 }, { 0x8DDA5202, 0x8DDA5202 },
+ { 0x8DB30FC6, 0x8DB30FC6 }, { 0x8D8BE340, 0 },
+ { 0x8D64CC5C, 0 }, { 0x8D3DCB09, 0 },
+ { 0x8D16DF35, 0x8D16DF35 }, { 0x8CF008CF, 0x8CF008CF },
+ { 0x8CC947C5, 0 }, { 0x8CA29C04, 0x8CA29C04 },
+ { 0x8C7C057D, 0 }, { 0x8C55841D, 0 },
+ { 0x8C2F17D2, 0x8C2F17D2 }, { 0x8C08C08C, 0x8C08C08C },
+ { 0x8BE27E39, 0x8BE27E39 }, { 0x8BBC50C9, 0 },
+ { 0x8B963829, 0x8B963829 }, { 0x8B70344A, 0x8B70344A },
+ { 0x8B4A451A, 0 }, { 0x8B246A88, 0 },
+ { 0x8AFEA483, 0x8AFEA483 }, { 0x8AD8F2FC, 0 },
+ { 0x8AB355E0, 0x8AB355E0 }, { 0x8A8DCD20, 0 },
+ { 0x8A6858AB, 0 }, { 0x8A42F870, 0x8A42F870 },
+ { 0x8A1DAC60, 0x8A1DAC60 }, { 0x89F8746A, 0 },
+ { 0x89D3507D, 0 }, { 0x89AE408A, 0 },
+ { 0x89894480, 0 }, { 0x89645C4F, 0x89645C4F },
+ { 0x893F87E8, 0x893F87E8 }, { 0x891AC73B, 0 },
+ { 0x88F61A37, 0x88F61A37 }, { 0x88D180CD, 0x88D180CD },
+ { 0x88ACFAEE, 0 }, { 0x88888889, 0 },
+ { 0x8864298F, 0 }, { 0x883FDDF0, 0x883FDDF0 },
+ { 0x881BA59E, 0 }, { 0x87F78088, 0 },
+ { 0x87D36EA0, 0 }, { 0x87AF6FD6, 0 },
+ { 0x878B841B, 0 }, { 0x8767AB5F, 0x8767AB5F },
+ { 0x8743E595, 0 }, { 0x872032AC, 0x872032AC },
+ { 0x86FC9296, 0x86FC9296 }, { 0x86D90545, 0 },
+ { 0x86B58AA8, 0 }, { 0x869222B2, 0 },
+ { 0x866ECD53, 0x866ECD53 }, { 0x864B8A7E, 0 },
+ { 0x86285A23, 0x86285A23 }, { 0x86053C34, 0x86053C34 },
+ { 0x85E230A3, 0x85E230A3 }, { 0x85BF3761, 0x85BF3761 },
+ { 0x859C5060, 0x859C5060 }, { 0x85797B91, 0x85797B91 },
+ { 0x8556B8E7, 0x8556B8E7 }, { 0x85340853, 0x85340853 },
+ { 0x851169C7, 0x851169C7 }, { 0x84EEDD36, 0 },
+ { 0x84CC6290, 0 }, { 0x84A9F9C8, 0x84A9F9C8 },
+ { 0x8487A2D1, 0 }, { 0x84655D9C, 0 },
+ { 0x84432A1B, 0x84432A1B }, { 0x84210842, 0x84210842 },
+ { 0x83FEF802, 0x83FEF802 }, { 0x83DCF94E, 0 },
+ { 0x83BB0C18, 0 }, { 0x83993052, 0x83993052 },
+ { 0x837765F0, 0x837765F0 }, { 0x8355ACE4, 0 },
+ { 0x83340520, 0x83340520 }, { 0x83126E98, 0 },
+ { 0x82F0E93D, 0x82F0E93D }, { 0x82CF7504, 0 },
+ { 0x82AE11DE, 0 }, { 0x828CBFBF, 0 },
+ { 0x826B7E99, 0x826B7E99 }, { 0x824A4E61, 0 },
+ { 0x82292F08, 0 }, { 0x82082082, 0x82082082 },
+ { 0x81E722C2, 0x81E722C2 }, { 0x81C635BC, 0x81C635BC },
+ { 0x81A55963, 0 }, { 0x81848DA9, 0 },
+ { 0x8163D283, 0 }, { 0x814327E4, 0 },
+ { 0x81228DBF, 0 }, { 0x81020408, 0x81020408 },
+ { 0x80E18AB3, 0 }, { 0x80C121B3, 0 },
+ { 0x80A0C8FB, 0x80A0C8FB }, { 0x80808081, 0 },
+ { 0x80604836, 0x80604836 }, { 0x80402010, 0x80402010 },
+ { 0x80200802, 0x80200802 }, { 0xFFFFFFFF, 0xFFFFFFFF }
+};
diff --git a/third_party/aom/aom_dsp/odintrin.h b/third_party/aom/aom_dsp/odintrin.h
new file mode 100644
index 0000000000..9e4ba5029a
--- /dev/null
+++ b/third_party/aom/aom_dsp/odintrin.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2001-2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/* clang-format off */
+
+#ifndef AOM_AOM_DSP_ODINTRIN_H_
+#define AOM_AOM_DSP_ODINTRIN_H_
+
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_ports/bitops.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int od_coeff;
+
+#define OD_DIVU_DMAX (1024)
+
+extern uint32_t OD_DIVU_SMALL_CONSTS[OD_DIVU_DMAX][2];
+
+/*Translate unsigned division by small divisors into multiplications.*/
+#define OD_DIVU_SMALL(_x, _d) \
+ ((uint32_t)((OD_DIVU_SMALL_CONSTS[(_d)-1][0] * (uint64_t)(_x) + \
+ OD_DIVU_SMALL_CONSTS[(_d)-1][1]) >> \
+ 32) >> \
+ (OD_ILOG_NZ(_d) - 1))
+
+#define OD_DIVU(_x, _d) \
+ (((_d) < OD_DIVU_DMAX) ? (OD_DIVU_SMALL((_x), (_d))) : ((_x) / (_d)))
+
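A quick self-check of the identity behind these macros (illustrative, not part
of the patch): for every divisor OD_DIVU_SMALL handles, OD_DIVU must agree
with ordinary unsigned division.

    #include <assert.h>
    #include "aom_dsp/odintrin.h"

    static void od_divu_sanity_check(void) {
      for (uint32_t d = 1; d < OD_DIVU_DMAX; d++) {
        for (uint32_t x = 0; x < 1000000; x += 997) {
          assert(OD_DIVU(x, d) == x / d);
        }
      }
    }
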
+#define OD_MINI AOMMIN
+#define OD_MAXI AOMMAX
+#define OD_CLAMPI(min, val, max) (OD_MAXI(min, OD_MINI(val, max)))
+
+/*Integer logarithm (base 2) of a nonzero unsigned 32-bit integer.
+ OD_ILOG_NZ(x) = (int)floor(log2(x)) + 1.*/
+#define OD_ILOG_NZ(x) (1 + get_msb(x))
+
+/*Enable special features for gcc and compatible compilers.*/
+#if defined(__GNUC__) && defined(__GNUC_MINOR__) && defined(__GNUC_PATCHLEVEL__)
+#define OD_GNUC_PREREQ(maj, min, pat) \
+ ((__GNUC__ << 16) + (__GNUC_MINOR__ << 8) + __GNUC_PATCHLEVEL__ >= \
+ ((maj) << 16) + ((min) << 8) + pat) // NOLINT
+#else
+#define OD_GNUC_PREREQ(maj, min, pat) (0)
+#endif
+
+#if OD_GNUC_PREREQ(3, 4, 0)
+#define OD_WARN_UNUSED_RESULT __attribute__((__warn_unused_result__))
+#else
+#define OD_WARN_UNUSED_RESULT
+#endif
+
+#if OD_GNUC_PREREQ(3, 4, 0)
+#define OD_ARG_NONNULL(x) __attribute__((__nonnull__(x)))
+#else
+#define OD_ARG_NONNULL(x)
+#endif
+
+/*All of these macros should expect floats as arguments.*/
+# define OD_SIGNMASK(a) (-((a) < 0))
+# define OD_FLIPSIGNI(a, b) (((a) + OD_SIGNMASK(b)) ^ OD_SIGNMASK(b))
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_ODINTRIN_H_
diff --git a/third_party/aom/aom_dsp/prob.h b/third_party/aom/aom_dsp/prob.h
new file mode 100644
index 0000000000..5711a40a40
--- /dev/null
+++ b/third_party/aom/aom_dsp/prob.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_PROB_H_
+#define AOM_AOM_DSP_PROB_H_
+
+#include <assert.h>
+#include <stdio.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/entcode.h"
+#include "aom_ports/bitops.h"
+#include "aom_ports/mem.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef uint16_t aom_cdf_prob;
+
+#define CDF_SIZE(x) ((x) + 1)
+#define CDF_PROB_BITS 15
+#define CDF_PROB_TOP (1 << CDF_PROB_BITS)
+/*The value stored in an iCDF is CDF_PROB_TOP minus the actual cumulative
+ probability (an "inverse" CDF).
+ This function converts from one representation to the other (and is its own
+ inverse).*/
+#define AOM_ICDF(x) (CDF_PROB_TOP - (x))
+
+#define AOM_CDF2(a0) AOM_ICDF(a0), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF3(a0, a1) AOM_ICDF(a0), AOM_ICDF(a1), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF4(a0, a1, a2) \
+ AOM_ICDF(a0), AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF5(a0, a1, a2, a3) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF6(a0, a1, a2, a3, a4) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), \
+ AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF7(a0, a1, a2, a3, a4, a5) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+ AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF8(a0, a1, a2, a3, a4, a5, a6) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+ AOM_ICDF(a6), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF9(a0, a1, a2, a3, a4, a5, a6, a7) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+ AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF10(a0, a1, a2, a3, a4, a5, a6, a7, a8) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+ AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF11(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+ AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), \
+ AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF12(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+ AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \
+ AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF13(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+ AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \
+ AOM_ICDF(a11), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF14(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+ AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \
+ AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF15(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+ AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \
+ AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(a13), AOM_ICDF(CDF_PROB_TOP), 0
+#define AOM_CDF16(a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, \
+ a14) \
+ AOM_ICDF(a0) \
+ , AOM_ICDF(a1), AOM_ICDF(a2), AOM_ICDF(a3), AOM_ICDF(a4), AOM_ICDF(a5), \
+ AOM_ICDF(a6), AOM_ICDF(a7), AOM_ICDF(a8), AOM_ICDF(a9), AOM_ICDF(a10), \
+ AOM_ICDF(a11), AOM_ICDF(a12), AOM_ICDF(a13), AOM_ICDF(a14), \
+ AOM_ICDF(CDF_PROB_TOP), 0
+
+static INLINE uint8_t get_prob(unsigned int num, unsigned int den) {
+ assert(den != 0);
+ {
+ const int p = (int)(((uint64_t)num * 256 + (den >> 1)) / den);
+ // (p > 255) ? 255 : (p < 1) ? 1 : p;
+ const int clipped_prob = p | ((255 - p) >> 23) | (p == 0);
+ return (uint8_t)clipped_prob;
+ }
+}
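+// Worked example (editorial illustration only): for num = 1, den = 1000 the
+// rounded value p is (256 + 500) / 1000 == 0, and the branchless clip
+// 0 | ((255 - 0) >> 23) | (0 == 0) yields 1. For num == den, p is 256 and
+// (255 - 256) >> 23 is -1 (all ones, assuming the usual arithmetic right
+// shift), so the OR saturates and the uint8_t cast returns 255. The result
+// is therefore always kept in the range [1, 255].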
+
+static INLINE void update_cdf(aom_cdf_prob *cdf, int8_t val, int nsymbs) {
+ assert(nsymbs < 17);
+ const int count = cdf[nsymbs];
+
+ // rate is computed in the spec as:
+ // 3 + ( cdf[N] > 15 ) + ( cdf[N] > 31 ) + Min(FloorLog2(N), 2)
+ // In this case cdf[N] is |count|.
+ // Min(FloorLog2(N), 2) is 1 for nsymbs == {2, 3} and 2 for all
+ // nsymbs > 3. So the equation becomes:
+ // 4 + (count > 15) + (count > 31) + (nsymbs > 3).
+ // Note that the largest value for count is 32 (it is not incremented beyond
+ // 32). So using that information:
+ // count >> 4 is 0 for count from 0 to 15.
+ // count >> 4 is 1 for count from 16 to 31.
+  // count >> 4 is 2 for count == 32.
+ // Now, the equation becomes:
+ // 4 + (count >> 4) + (nsymbs > 3).
+ const int rate = 4 + (count >> 4) + (nsymbs > 3);
+
+ int i = 0;
+ do {
+ if (i < val) {
+ cdf[i] += (CDF_PROB_TOP - cdf[i]) >> rate;
+ } else {
+ cdf[i] -= cdf[i] >> rate;
+ }
+ } while (++i < nsymbs - 1);
+ cdf[nsymbs] += (count < 32);
+}
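+// Worked example (editorial illustration only): for nsymbs == 2 with
+// cdf == { AOM_ICDF(16384), 0, 0 } == { 16384, 0, 0 } and count == 0, the
+// update rate is 4 + (0 >> 4) + (2 > 3) == 4. Coding val == 0 takes the
+// else branch (i >= val), so cdf[0] -= 16384 >> 4, giving 15360; since the
+// stored value is an inverse CDF, lowering it raises the probability of
+// symbol 0. Finally the counter cdf[2] is incremented to 1 because
+// count < 32.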
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_PROB_H_
diff --git a/third_party/aom/aom_dsp/psnr.c b/third_party/aom/aom_dsp/psnr.c
new file mode 100644
index 0000000000..cf0de29945
--- /dev/null
+++ b/third_party/aom/aom_dsp/psnr.c
@@ -0,0 +1,454 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/psnr.h"
+#include "aom_scale/yv12config.h"
+
+double aom_sse_to_psnr(double samples, double peak, double sse) {
+ if (sse > 0.0) {
+ const double psnr = 10.0 * log10(samples * peak * peak / sse);
+ return psnr > MAX_PSNR ? MAX_PSNR : psnr;
+ } else {
+ return MAX_PSNR;
+ }
+}
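+// Worked example (editorial illustration only): for an 8-bit 100x100 block
+// with peak == 255.0, samples == 10000 and sse == 10000 (i.e. an MSE of 1),
+// the result is 10 * log10(10000 * 255 * 255 / 10000) == 10 * log10(65025),
+// roughly 48.13 dB. An sse of 0 returns MAX_PSNR (100 dB) instead.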
+
+static int64_t encoder_sse(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int w, int h) {
+ int i, j;
+ int64_t sse = 0;
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ const int diff = a[j] - b[j];
+ sse += diff * diff;
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+ return sse;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static int64_t encoder_highbd_sse(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int w,
+ int h) {
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ int64_t sse = 0;
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; ++j) {
+ const int diff = a[j] - b[j];
+ sse += diff * diff;
+ }
+ a += a_stride;
+ b += b_stride;
+ }
+ return sse;
+}
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+static int64_t get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
+ const int dw = width % 16;
+ const int dh = height % 16;
+ int64_t total_sse = 0;
+ int x, y;
+
+ if (dw > 0) {
+ total_sse += encoder_sse(&a[width - dw], a_stride, &b[width - dw], b_stride,
+ dw, height);
+ }
+
+ if (dh > 0) {
+ total_sse +=
+ encoder_sse(&a[(height - dh) * a_stride], a_stride,
+ &b[(height - dh) * b_stride], b_stride, width - dw, dh);
+ }
+
+ for (y = 0; y < height / 16; ++y) {
+ const uint8_t *pa = a;
+ const uint8_t *pb = b;
+ for (x = 0; x < width / 16; ++x) {
+ total_sse += aom_sse(pa, a_stride, pb, b_stride, 16, 16);
+
+ pa += 16;
+ pb += 16;
+ }
+
+ a += 16 * a_stride;
+ b += 16 * b_stride;
+ }
+
+ return total_sse;
+}
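+// Worked example of the tiling above (editorial illustration only): for a
+// 36x22 plane, dw == 4 and dh == 6, so the SSE is accumulated over a 4x22
+// strip on the right, a 32x6 strip along the bottom, and a 2x1 grid of
+// 16x16 blocks handled by aom_sse(); 2 * 256 + 4 * 22 + 32 * 6 == 792
+// == 36 * 22, so every pixel is counted exactly once.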
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int width,
+ int height, unsigned int input_shift) {
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ int64_t total_sse = 0;
+ int x, y;
+ for (y = 0; y < height; ++y) {
+ for (x = 0; x < width; ++x) {
+ int64_t diff;
+ diff = (a[x] >> input_shift) - (b[x] >> input_shift);
+ total_sse += diff * diff;
+ }
+ a += a_stride;
+ b += b_stride;
+ }
+ return total_sse;
+}
+
+static int64_t highbd_get_sse(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
+ int64_t total_sse = 0;
+ int x, y;
+ const int dw = width % 16;
+ const int dh = height % 16;
+
+ if (dw > 0) {
+ total_sse += encoder_highbd_sse(&a[width - dw], a_stride, &b[width - dw],
+ b_stride, dw, height);
+ }
+ if (dh > 0) {
+ total_sse += encoder_highbd_sse(&a[(height - dh) * a_stride], a_stride,
+ &b[(height - dh) * b_stride], b_stride,
+ width - dw, dh);
+ }
+
+ for (y = 0; y < height / 16; ++y) {
+ const uint8_t *pa = a;
+ const uint8_t *pb = b;
+ for (x = 0; x < width / 16; ++x) {
+ total_sse += aom_highbd_sse(pa, a_stride, pb, b_stride, 16, 16);
+ pa += 16;
+ pb += 16;
+ }
+ a += 16 * a_stride;
+ b += 16 * b_stride;
+ }
+ return total_sse;
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+uint64_t aom_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
+ int vstart, int height) {
+ return aom_var_2d_u8(a->y_buffer + vstart * a->y_stride + hstart, a->y_stride,
+ width, height) /
+ (width * height);
+}
+
+uint64_t aom_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
+ int vstart, int height) {
+ return aom_var_2d_u8(a->u_buffer + vstart * a->uv_stride + hstart,
+ a->uv_stride, width, height) /
+ (width * height);
+}
+
+uint64_t aom_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
+ int vstart, int height) {
+ return aom_var_2d_u8(a->v_buffer + vstart * a->uv_stride + hstart,
+ a->uv_stride, width, height) /
+ (width * height);
+}
+
+int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart, int width,
+ int vstart, int height) {
+ return get_sse(a->y_buffer + vstart * a->y_stride + hstart, a->y_stride,
+ b->y_buffer + vstart * b->y_stride + hstart, b->y_stride,
+ width, height);
+}
+
+int64_t aom_get_y_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
+ assert(a->y_crop_width == b->y_crop_width);
+ assert(a->y_crop_height == b->y_crop_height);
+
+ return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+ a->y_crop_width, a->y_crop_height);
+}
+
+int64_t aom_get_u_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart, int width,
+ int vstart, int height) {
+ return get_sse(a->u_buffer + vstart * a->uv_stride + hstart, a->uv_stride,
+ b->u_buffer + vstart * b->uv_stride + hstart, b->uv_stride,
+ width, height);
+}
+
+int64_t aom_get_u_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
+ assert(a->uv_crop_width == b->uv_crop_width);
+ assert(a->uv_crop_height == b->uv_crop_height);
+
+ return get_sse(a->u_buffer, a->uv_stride, b->u_buffer, b->uv_stride,
+ a->uv_crop_width, a->uv_crop_height);
+}
+
+int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart, int width,
+ int vstart, int height) {
+ return get_sse(a->v_buffer + vstart * a->uv_stride + hstart, a->uv_stride,
+ b->v_buffer + vstart * b->uv_stride + hstart, b->uv_stride,
+ width, height);
+}
+
+int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
+ assert(a->uv_crop_width == b->uv_crop_width);
+ assert(a->uv_crop_height == b->uv_crop_height);
+
+ return get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride,
+ a->uv_crop_width, a->uv_crop_height);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+uint64_t aom_highbd_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart,
+ int width, int vstart, int height) {
+ return aom_var_2d_u16(a->y_buffer + vstart * a->y_stride + hstart,
+ a->y_stride, width, height) /
+ (width * height);
+}
+
+uint64_t aom_highbd_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart,
+ int width, int vstart, int height) {
+ return aom_var_2d_u16(a->u_buffer + vstart * a->uv_stride + hstart,
+ a->uv_stride, width, height) /
+ (width * height);
+}
+
+uint64_t aom_highbd_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart,
+ int width, int vstart, int height) {
+ return aom_var_2d_u16(a->v_buffer + vstart * a->uv_stride + hstart,
+ a->uv_stride, width, height) /
+ (width * height);
+}
+
+int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart,
+ int width, int vstart, int height) {
+ return highbd_get_sse(
+ a->y_buffer + vstart * a->y_stride + hstart, a->y_stride,
+ b->y_buffer + vstart * b->y_stride + hstart, b->y_stride, width, height);
+}
+
+int64_t aom_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
+ assert(a->y_crop_width == b->y_crop_width);
+ assert(a->y_crop_height == b->y_crop_height);
+ assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+ assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+
+ return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+ a->y_crop_width, a->y_crop_height);
+}
+
+int64_t aom_highbd_get_u_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart,
+ int width, int vstart, int height) {
+ return highbd_get_sse(a->u_buffer + vstart * a->uv_stride + hstart,
+ a->uv_stride,
+ b->u_buffer + vstart * b->uv_stride + hstart,
+ b->uv_stride, width, height);
+}
+
+int64_t aom_highbd_get_u_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
+ assert(a->uv_crop_width == b->uv_crop_width);
+ assert(a->uv_crop_height == b->uv_crop_height);
+ assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+ assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+
+ return highbd_get_sse(a->u_buffer, a->uv_stride, b->u_buffer, b->uv_stride,
+ a->uv_crop_width, a->uv_crop_height);
+}
+
+int64_t aom_highbd_get_v_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart,
+ int width, int vstart, int height) {
+ return highbd_get_sse(a->v_buffer + vstart * a->uv_stride + hstart,
+ a->uv_stride,
+ b->v_buffer + vstart * b->uv_stride + hstart,
+ b->uv_stride, width, height);
+}
+
+int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b) {
+ assert(a->uv_crop_width == b->uv_crop_width);
+ assert(a->uv_crop_height == b->uv_crop_height);
+ assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+ assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+
+ return highbd_get_sse(a->v_buffer, a->uv_stride, b->v_buffer, b->uv_stride,
+ a->uv_crop_width, a->uv_crop_height);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int plane, int highbd) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (highbd) {
+ switch (plane) {
+ case 0: return aom_highbd_get_y_sse(a, b);
+ case 1: return aom_highbd_get_u_sse(a, b);
+ case 2: return aom_highbd_get_v_sse(a, b);
+ default: assert(plane >= 0 && plane <= 2); return 0;
+ }
+ } else {
+ switch (plane) {
+ case 0: return aom_get_y_sse(a, b);
+ case 1: return aom_get_u_sse(a, b);
+ case 2: return aom_get_v_sse(a, b);
+ default: assert(plane >= 0 && plane <= 2); return 0;
+ }
+ }
+#else
+ (void)highbd;
+ switch (plane) {
+ case 0: return aom_get_y_sse(a, b);
+ case 1: return aom_get_u_sse(a, b);
+ case 2: return aom_get_v_sse(a, b);
+ default: assert(plane >= 0 && plane <= 2); return 0;
+ }
+#endif
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr,
+ uint32_t bit_depth, uint32_t in_bit_depth) {
+ assert(a->y_crop_width == b->y_crop_width);
+ assert(a->y_crop_height == b->y_crop_height);
+ assert(a->uv_crop_width == b->uv_crop_width);
+ assert(a->uv_crop_height == b->uv_crop_height);
+ const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
+ const int heights[3] = { a->y_crop_height, a->uv_crop_height,
+ a->uv_crop_height };
+ const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride };
+ const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride };
+ int i;
+ uint64_t total_sse = 0;
+ uint32_t total_samples = 0;
+#if CONFIG_LIBVMAF_PSNR_PEAK
+ double peak = (double)(255 << (in_bit_depth - 8));
+#else
+ double peak = (double)((1 << in_bit_depth) - 1);
+#endif // CONFIG_LIBVMAF_PSNR_PEAK
+ const unsigned int input_shift = bit_depth - in_bit_depth;
+
+ for (i = 0; i < 3; ++i) {
+ const int w = widths[i];
+ const int h = heights[i];
+ const uint32_t samples = w * h;
+ uint64_t sse;
+ if (a->flags & YV12_FLAG_HIGHBITDEPTH) {
+ if (input_shift) {
+ sse = highbd_get_sse_shift(a->buffers[i], a_strides[i], b->buffers[i],
+ b_strides[i], w, h, input_shift);
+ } else {
+ sse = highbd_get_sse(a->buffers[i], a_strides[i], b->buffers[i],
+ b_strides[i], w, h);
+ }
+ } else {
+ sse = get_sse(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w,
+ h);
+ }
+ psnr->sse[1 + i] = sse;
+ psnr->samples[1 + i] = samples;
+ psnr->psnr[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse);
+
+ total_sse += sse;
+ total_samples += samples;
+ }
+
+ psnr->sse[0] = total_sse;
+ psnr->samples[0] = total_samples;
+ psnr->psnr[0] =
+ aom_sse_to_psnr((double)total_samples, peak, (double)total_sse);
+
+ // Compute PSNR based on stream bit depth
+ if ((a->flags & YV12_FLAG_HIGHBITDEPTH) && (in_bit_depth < bit_depth)) {
+#if CONFIG_LIBVMAF_PSNR_PEAK
+ peak = (double)(255 << (bit_depth - 8));
+#else
+ peak = (double)((1 << bit_depth) - 1);
+#endif // CONFIG_LIBVMAF_PSNR_PEAK
+ total_sse = 0;
+ total_samples = 0;
+ for (i = 0; i < 3; ++i) {
+ const int w = widths[i];
+ const int h = heights[i];
+ const uint32_t samples = w * h;
+ uint64_t sse;
+ sse = highbd_get_sse(a->buffers[i], a_strides[i], b->buffers[i],
+ b_strides[i], w, h);
+ psnr->sse_hbd[1 + i] = sse;
+ psnr->samples_hbd[1 + i] = samples;
+ psnr->psnr_hbd[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse);
+ total_sse += sse;
+ total_samples += samples;
+ }
+
+ psnr->sse_hbd[0] = total_sse;
+ psnr->samples_hbd[0] = total_samples;
+ psnr->psnr_hbd[0] =
+ aom_sse_to_psnr((double)total_samples, peak, (double)total_sse);
+ }
+}
+#endif
+
+void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
+ PSNR_STATS *psnr) {
+ assert(a->y_crop_width == b->y_crop_width);
+ assert(a->y_crop_height == b->y_crop_height);
+ assert(a->uv_crop_width == b->uv_crop_width);
+ assert(a->uv_crop_height == b->uv_crop_height);
+ static const double peak = 255.0;
+ const int widths[3] = { a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
+ const int heights[3] = { a->y_crop_height, a->uv_crop_height,
+ a->uv_crop_height };
+ const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride };
+ const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride };
+ int i;
+ uint64_t total_sse = 0;
+ uint32_t total_samples = 0;
+
+ for (i = 0; i < 3; ++i) {
+ const int w = widths[i];
+ const int h = heights[i];
+ const uint32_t samples = w * h;
+ const uint64_t sse =
+ get_sse(a->buffers[i], a_strides[i], b->buffers[i], b_strides[i], w, h);
+ psnr->sse[1 + i] = sse;
+ psnr->samples[1 + i] = samples;
+ psnr->psnr[1 + i] = aom_sse_to_psnr(samples, peak, (double)sse);
+
+ total_sse += sse;
+ total_samples += samples;
+ }
+
+ psnr->sse[0] = total_sse;
+ psnr->samples[0] = total_samples;
+ psnr->psnr[0] =
+ aom_sse_to_psnr((double)total_samples, peak, (double)total_sse);
+}
diff --git a/third_party/aom/aom_dsp/psnr.h b/third_party/aom/aom_dsp/psnr.h
new file mode 100644
index 0000000000..afe6e08856
--- /dev/null
+++ b/third_party/aom/aom_dsp/psnr.h
@@ -0,0 +1,96 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_PSNR_H_
+#define AOM_AOM_DSP_PSNR_H_
+
+#include "aom_scale/yv12config.h"
+
+#define MAX_PSNR 100.0
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+ double psnr[4]; // total/y/u/v
+ uint64_t sse[4]; // total/y/u/v
+ uint32_t samples[4]; // total/y/u/v
+ double psnr_hbd[4]; // total/y/u/v when input-bit-depth < bit-depth
+ uint64_t sse_hbd[4]; // total/y/u/v when input-bit-depth < bit-depth
+ uint32_t samples_hbd[4]; // total/y/u/v when input-bit-depth < bit-depth
+} PSNR_STATS;
+
+/*!\brief Converts SSE to PSNR
+ *
+ * Converts sum of squared errors (SSE) to peak signal-to-noise ratio (PSNR).
+ *
+ * \param[in] samples Number of samples
+ * \param[in] peak Max sample value
+ * \param[in] sse Sum of squared errors
+ */
+double aom_sse_to_psnr(double samples, double peak, double sse);
+uint64_t aom_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
+ int vstart, int height);
+uint64_t aom_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
+ int vstart, int height);
+uint64_t aom_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart, int width,
+ int vstart, int height);
+int64_t aom_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart, int width,
+ int vstart, int height);
+int64_t aom_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
+int64_t aom_get_u_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart, int width,
+ int vstart, int height);
+int64_t aom_get_u_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
+int64_t aom_get_v_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart, int width,
+ int vstart, int height);
+int64_t aom_get_v_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
+int64_t aom_get_sse_plane(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int plane, int highbd);
+#if CONFIG_AV1_HIGHBITDEPTH
+uint64_t aom_highbd_get_y_var(const YV12_BUFFER_CONFIG *a, int hstart,
+ int width, int vstart, int height);
+uint64_t aom_highbd_get_u_var(const YV12_BUFFER_CONFIG *a, int hstart,
+ int width, int vstart, int height);
+uint64_t aom_highbd_get_v_var(const YV12_BUFFER_CONFIG *a, int hstart,
+ int width, int vstart, int height);
+int64_t aom_highbd_get_y_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart,
+ int width, int vstart, int height);
+int64_t aom_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b);
+int64_t aom_highbd_get_u_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart,
+ int width, int vstart, int height);
+int64_t aom_highbd_get_u_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b);
+int64_t aom_highbd_get_v_sse_part(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, int hstart,
+ int width, int vstart, int height);
+int64_t aom_highbd_get_v_sse(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b);
+void aom_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
+ const YV12_BUFFER_CONFIG *b, PSNR_STATS *psnr,
+ unsigned int bit_depth, unsigned int in_bit_depth);
+#endif
+void aom_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
+ PSNR_STATS *psnr);
+
+double aom_psnrhvs(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *phvs_y,
+ double *phvs_u, double *phvs_v, uint32_t bd, uint32_t in_bd);
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#endif // AOM_AOM_DSP_PSNR_H_
diff --git a/third_party/aom/aom_dsp/psnrhvs.c b/third_party/aom/aom_dsp/psnrhvs.c
new file mode 100644
index 0000000000..966ba007ed
--- /dev/null
+++ b/third_party/aom/aom_dsp/psnrhvs.c
@@ -0,0 +1,282 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ *
+ * This code was originally written by Gregory Maxwell for the Daala
+ * project.
+ */
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/psnr.h"
+#include "aom_dsp/ssim.h"
+
+static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
+ int xstride) {
+ int i, j;
+ (void)xstride;
+ aom_fdct8x8(x, y, ystride);
+ for (i = 0; i < 8; i++)
+ for (j = 0; j < 8; j++)
+ *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
+ int xstride) {
+ int i, j;
+ (void)xstride;
+ aom_highbd_fdct8x8(x, y, ystride);
+ for (i = 0; i < 8; i++)
+ for (j = 0; j < 8; j++)
+ *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3;
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+/* Normalized inverse quantization matrix for 8x8 DCT at the point of
+ * transparency. This is not the JPEG-based matrix from the paper;
+ * this one gives slightly higher MOS agreement. */
+static const double csf_y[8][8] = {
+ { 1.6193873005, 2.2901594831, 2.08509755623, 1.48366094411, 1.00227514334,
+ 0.678296995242, 0.466224900598, 0.3265091542 },
+ { 2.2901594831, 1.94321815382, 2.04793073064, 1.68731108984, 1.2305666963,
+ 0.868920337363, 0.61280991668, 0.436405793551 },
+ { 2.08509755623, 2.04793073064, 1.34329019223, 1.09205635862, 0.875748795257,
+ 0.670882927016, 0.501731932449, 0.372504254596 },
+ { 1.48366094411, 1.68731108984, 1.09205635862, 0.772819797575, 0.605636379554,
+ 0.48309405692, 0.380429446972, 0.295774038565 },
+ { 1.00227514334, 1.2305666963, 0.875748795257, 0.605636379554, 0.448996256676,
+ 0.352889268808, 0.283006984131, 0.226951348204 },
+ { 0.678296995242, 0.868920337363, 0.670882927016, 0.48309405692,
+ 0.352889268808, 0.27032073436, 0.215017739696, 0.17408067321 },
+ { 0.466224900598, 0.61280991668, 0.501731932449, 0.380429446972,
+ 0.283006984131, 0.215017739696, 0.168869545842, 0.136153931001 },
+ { 0.3265091542, 0.436405793551, 0.372504254596, 0.295774038565,
+ 0.226951348204, 0.17408067321, 0.136153931001, 0.109083846276 }
+};
+static const double csf_cb420[8][8] = {
+ { 1.91113096927, 2.46074210438, 1.18284184739, 1.14982565193, 1.05017074788,
+ 0.898018824055, 0.74725392039, 0.615105596242 },
+ { 2.46074210438, 1.58529308355, 1.21363250036, 1.38190029285, 1.33100189972,
+ 1.17428548929, 0.996404342439, 0.830890433625 },
+ { 1.18284184739, 1.21363250036, 0.978712413627, 1.02624506078, 1.03145147362,
+ 0.960060382087, 0.849823426169, 0.731221236837 },
+ { 1.14982565193, 1.38190029285, 1.02624506078, 0.861317501629, 0.801821139099,
+ 0.751437590932, 0.685398513368, 0.608694761374 },
+ { 1.05017074788, 1.33100189972, 1.03145147362, 0.801821139099, 0.676555426187,
+ 0.605503172737, 0.55002013668, 0.495804539034 },
+ { 0.898018824055, 1.17428548929, 0.960060382087, 0.751437590932,
+ 0.605503172737, 0.514674450957, 0.454353482512, 0.407050308965 },
+ { 0.74725392039, 0.996404342439, 0.849823426169, 0.685398513368,
+ 0.55002013668, 0.454353482512, 0.389234902883, 0.342353999733 },
+ { 0.615105596242, 0.830890433625, 0.731221236837, 0.608694761374,
+ 0.495804539034, 0.407050308965, 0.342353999733, 0.295530605237 }
+};
+static const double csf_cr420[8][8] = {
+ { 2.03871978502, 2.62502345193, 1.26180942886, 1.11019789803, 1.01397751469,
+ 0.867069376285, 0.721500455585, 0.593906509971 },
+ { 2.62502345193, 1.69112867013, 1.17180569821, 1.3342742857, 1.28513006198,
+ 1.13381474809, 0.962064122248, 0.802254508198 },
+ { 1.26180942886, 1.17180569821, 0.944981930573, 0.990876405848,
+ 0.995903384143, 0.926972725286, 0.820534991409, 0.706020324706 },
+ { 1.11019789803, 1.3342742857, 0.990876405848, 0.831632933426, 0.77418706195,
+ 0.725539939514, 0.661776842059, 0.587716619023 },
+ { 1.01397751469, 1.28513006198, 0.995903384143, 0.77418706195, 0.653238524286,
+ 0.584635025748, 0.531064164893, 0.478717061273 },
+ { 0.867069376285, 1.13381474809, 0.926972725286, 0.725539939514,
+ 0.584635025748, 0.496936637883, 0.438694579826, 0.393021669543 },
+ { 0.721500455585, 0.962064122248, 0.820534991409, 0.661776842059,
+ 0.531064164893, 0.438694579826, 0.375820256136, 0.330555063063 },
+ { 0.593906509971, 0.802254508198, 0.706020324706, 0.587716619023,
+ 0.478717061273, 0.393021669543, 0.330555063063, 0.285345396658 }
+};
+
+static double convert_score_db(double _score, double _weight, int16_t pix_max) {
+ assert(_score * _weight >= 0.0);
+
+ if (_weight * _score < pix_max * pix_max * 1e-10) return MAX_PSNR;
+ return 10 * (log10(pix_max * pix_max) - log10(_weight * _score));
+}
+
+static double calc_psnrhvs(const unsigned char *src, int _systride,
+ const unsigned char *dst, int _dystride, double _par,
+ int _w, int _h, int _step, const double _csf[8][8],
+ uint32_t _shift, int buf_is_hbd, int16_t pix_max,
+ int luma) {
+ double ret;
+ const uint8_t *_src8 = src;
+ const uint8_t *_dst8 = dst;
+ const uint16_t *_src16 = CONVERT_TO_SHORTPTR(src);
+ const uint16_t *_dst16 = CONVERT_TO_SHORTPTR(dst);
+ DECLARE_ALIGNED(16, int16_t, dct_s[8 * 8]);
+ DECLARE_ALIGNED(16, int16_t, dct_d[8 * 8]);
+ DECLARE_ALIGNED(16, tran_low_t, dct_s_coef[8 * 8]);
+ DECLARE_ALIGNED(16, tran_low_t, dct_d_coef[8 * 8]);
+ double mask[8][8];
+ int pixels;
+ int x;
+ int y;
+ float sum1;
+ float sum2;
+ float delt;
+ (void)_par;
+ ret = pixels = 0;
+ sum1 = sum2 = delt = 0.0f;
+ for (y = 0; y < _h; y++) {
+ for (x = 0; x < _w; x++) {
+ if (!buf_is_hbd) {
+ sum1 += _src8[y * _systride + x];
+ sum2 += _dst8[y * _dystride + x];
+ } else {
+ sum1 += _src16[y * _systride + x] >> _shift;
+ sum2 += _dst16[y * _dystride + x] >> _shift;
+ }
+ }
+ }
+ if (luma) delt = (sum1 - sum2) / (_w * _h);
+ /*In the PSNR-HVS-M paper[1] the authors describe the construction of
+ their masking table as "we have used the quantization table for the
+ color component Y of JPEG [6] that has been also obtained on the
+ basis of CSF. Note that the values in quantization table JPEG have
+ been normalized and then squared." Their CSF matrix (from PSNR-HVS)
+   was also constructed from the JPEG matrices. I cannot find any obvious
+   normalization scheme that produces their table, but if I multiply their
+ CSF by 0.3885746225901003 and square the result I get their masking table.
+ I have no idea where this constant comes from, but deviating from it
+ too greatly hurts MOS agreement.
+
+ [1] Nikolay Ponomarenko, Flavia Silvestri, Karen Egiazarian, Marco Carli,
+ Jaakko Astola, Vladimir Lukin, "On between-coefficient contrast masking
+ of DCT basis functions", CD-ROM Proceedings of the Third
+ International Workshop on Video Processing and Quality Metrics for Consumer
+ Electronics VPQM-07, Scottsdale, Arizona, USA, 25-26 January, 2007, 4 p.
+
+ Suggested in aomedia issue#2363:
+ 0.3885746225901003 is a reciprocal of the maximum coefficient (2.573509)
+ of the old JPEG based matrix from the paper. Since you are not using that,
+ divide by actual maximum coefficient. */
+ for (x = 0; x < 8; x++)
+ for (y = 0; y < 8; y++)
+ mask[x][y] = (_csf[x][y] / _csf[1][0]) * (_csf[x][y] / _csf[1][0]);
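+  /*Worked example (editorial illustration only): the largest coefficient of
+    csf_y is _csf[1][0] == 2.2901594831, so the DC entry becomes
+    mask[0][0] == (1.6193873005 / 2.2901594831)^2, roughly 0.707^2 == 0.5;
+    i.e. the code divides by the actual maximum CSF coefficient instead of
+    multiplying by the fixed 0.3885746225901003 constant discussed above.*/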
+ for (y = 0; y < _h - 7; y += _step) {
+ for (x = 0; x < _w - 7; x += _step) {
+ int i;
+ int j;
+ int n = 0;
+ double s_gx = 0;
+ double s_gy = 0;
+ double g = 0;
+ double s_gmean = 0;
+ double s_gvar = 0;
+ double s_mask = 0;
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++) {
+ if (!buf_is_hbd) {
+ dct_s[i * 8 + j] = _src8[(y + i) * _systride + (j + x)];
+ dct_d[i * 8 + j] = _dst8[(y + i) * _dystride + (j + x)];
+ } else {
+ dct_s[i * 8 + j] = _src16[(y + i) * _systride + (j + x)] >> _shift;
+ dct_d[i * 8 + j] = _dst16[(y + i) * _dystride + (j + x)] >> _shift;
+ }
+ dct_d[i * 8 + j] += (int)(delt + 0.5f);
+ }
+ }
+ for (i = 1; i < 7; i++) {
+ for (j = 1; j < 7; j++) {
+ s_gx = (dct_s[(i - 1) * 8 + j - 1] * 3 -
+ dct_s[(i - 1) * 8 + j + 1] * 3 + dct_s[i * 8 + j - 1] * 10 -
+ dct_s[i * 8 + j + 1] * 10 + dct_s[(i + 1) * 8 + j - 1] * 3 -
+ dct_s[(i + 1) * 8 + j + 1] * 3) /
+ (pix_max * 16.f);
+ s_gy = (dct_s[(i - 1) * 8 + j - 1] * 3 -
+ dct_s[(i + 1) * 8 + j - 1] * 3 + dct_s[(i - 1) * 8 + j] * 10 -
+ dct_s[(i + 1) * 8 + j] * 10 + dct_s[(i - 1) * 8 + j + 1] * 3 -
+ dct_s[(i + 1) * 8 + j + 1] * 3) /
+ (pix_max * 16.f);
+ g = sqrt(s_gx * s_gx + s_gy * s_gy);
+ if (g > 0.1f) n++;
+ s_gmean += g;
+ }
+ }
+ s_gvar = 1.f / (36 - n + 1) * s_gmean / 36.f;
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (!buf_is_hbd) {
+ od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
+ od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
+ } else {
+ hbd_od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
+ hbd_od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
+ }
+#else
+ od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
+ od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ for (i = 0; i < 8; i++)
+ for (j = (i == 0); j < 8; j++)
+ s_mask += dct_s_coef[i * 8 + j] * dct_s_coef[i * 8 + j] * mask[i][j];
+ s_mask = sqrt(s_mask * s_gvar) / 8.f;
+ for (i = 0; i < 8; i++) {
+ for (j = 0; j < 8; j++) {
+ double err;
+ err = fabs((double)(dct_s_coef[i * 8 + j] - dct_d_coef[i * 8 + j]));
+ if (i != 0 || j != 0)
+ err = err < s_mask / mask[i][j] ? 0 : err - s_mask / mask[i][j];
+ ret += (err * _csf[i][j]) * (err * _csf[i][j]);
+ pixels++;
+ }
+ }
+ }
+ }
+ if (pixels <= 0) return 0;
+ ret /= pixels;
+ ret += 0.04 * delt * delt;
+ return ret;
+}
+
+double aom_psnrhvs(const YV12_BUFFER_CONFIG *src, const YV12_BUFFER_CONFIG *dst,
+ double *y_psnrhvs, double *u_psnrhvs, double *v_psnrhvs,
+ uint32_t bd, uint32_t in_bd) {
+ double psnrhvs;
+ const double par = 1.0;
+ const int step = 7;
+ uint32_t bd_shift = 0;
+ assert(bd == 8 || bd == 10 || bd == 12);
+ assert(bd >= in_bd);
+ assert(src->flags == dst->flags);
+ const int buf_is_hbd = src->flags & YV12_FLAG_HIGHBITDEPTH;
+
+ int16_t pix_max = 255;
+ if (in_bd == 10)
+ pix_max = 1023;
+ else if (in_bd == 12)
+ pix_max = 4095;
+
+ bd_shift = bd - in_bd;
+
+ *y_psnrhvs =
+ calc_psnrhvs(src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride,
+ par, src->y_crop_width, src->y_crop_height, step, csf_y,
+ bd_shift, buf_is_hbd, pix_max, 1);
+ *u_psnrhvs =
+ calc_psnrhvs(src->u_buffer, src->uv_stride, dst->u_buffer, dst->uv_stride,
+ par, src->uv_crop_width, src->uv_crop_height, step,
+ csf_cb420, bd_shift, buf_is_hbd, pix_max, 0);
+ *v_psnrhvs =
+ calc_psnrhvs(src->v_buffer, src->uv_stride, dst->v_buffer, dst->uv_stride,
+ par, src->uv_crop_width, src->uv_crop_height, step,
+ csf_cr420, bd_shift, buf_is_hbd, pix_max, 0);
+ psnrhvs = (*y_psnrhvs) * .8 + .1 * ((*u_psnrhvs) + (*v_psnrhvs));
+ return convert_score_db(psnrhvs, 1.0, pix_max);
+}
diff --git a/third_party/aom/aom_dsp/pyramid.c b/third_party/aom/aom_dsp/pyramid.c
new file mode 100644
index 0000000000..324a18baea
--- /dev/null
+++ b/third_party/aom/aom_dsp/pyramid.c
@@ -0,0 +1,414 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/pyramid.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/bitops.h"
+#include "aom_util/aom_thread.h"
+
+// TODO(rachelbarker): Move needed code from av1/ to aom_dsp/
+#include "av1/common/resize.h"
+
+#include <assert.h>
+#include <string.h>
+
+// Lifecycle:
+// * Frame buffer alloc code calls aom_get_pyramid_alloc_size()
+// to work out how much space is needed for a given number of pyramid
+// levels. This is counted in the size checked against the max allocation
+// limit
+// * Then calls aom_alloc_pyramid() to actually create the pyramid
+// * Pyramid is initially marked as invalid (no data)
+// * Whenever pyramid is needed, we check the valid flag. If set, use existing
+// data. If not set, compute full pyramid
+// * Whenever frame buffer is reused, clear the valid flag
+// * Whenever frame buffer is resized, reallocate pyramid
+
+size_t aom_get_pyramid_alloc_size(int width, int height, int n_levels,
+ bool image_is_16bit) {
+ // Limit number of levels on small frames
+ const int msb = get_msb(AOMMIN(width, height));
+ const int max_levels = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1);
+ n_levels = AOMMIN(n_levels, max_levels);
+
+ size_t alloc_size = 0;
+ alloc_size += sizeof(ImagePyramid);
+ alloc_size += n_levels * sizeof(PyramidLayer);
+
+ // Calculate how much memory is needed for downscaled frame buffers
+ size_t buffer_size = 0;
+
+ // Work out if we need to allocate a few extra bytes for alignment.
+ // aom_memalign() will ensure that the start of the allocation is aligned
+ // to a multiple of PYRAMID_ALIGNMENT. But we want the first image pixel
+ // to be aligned, not the first byte of the allocation.
+ //
+ // In the loop below, we ensure that the stride of every image is a multiple
+ // of PYRAMID_ALIGNMENT. Thus the allocated size of each pyramid level will
+ // also be a multiple of PYRAMID_ALIGNMENT. Thus, as long as we can get the
+ // first pixel in the first pyramid layer aligned properly, that will
+ // automatically mean that the first pixel of every row of every layer is
+ // properly aligned too.
+ //
+ // Thus all we need to consider is the first pixel in the first layer.
+ // This is located at offset
+ // extra_bytes + level_stride * PYRAMID_PADDING + PYRAMID_PADDING
+ // bytes into the buffer. Since level_stride is a multiple of
+ // PYRAMID_ALIGNMENT, we can ignore that. So we need
+ // extra_bytes + PYRAMID_PADDING = multiple of PYRAMID_ALIGNMENT
+ //
+ // To solve this, we can round PYRAMID_PADDING up to the next multiple
+  // of PYRAMID_ALIGNMENT, then subtract the original value to calculate
+ // how many extra bytes are needed.
+ size_t first_px_offset =
+ (PYRAMID_PADDING + PYRAMID_ALIGNMENT - 1) & ~(PYRAMID_ALIGNMENT - 1);
+ size_t extra_bytes = first_px_offset - PYRAMID_PADDING;
+ buffer_size += extra_bytes;
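+  // Worked example (editorial illustration only): with PYRAMID_PADDING == 16
+  // and PYRAMID_ALIGNMENT == 32, first_px_offset == (16 + 31) & ~31 == 32 and
+  // extra_bytes == 16; the 16-byte left border plus these 16 extra bytes put
+  // the first image pixel of the first layer on a 32-byte boundary.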
+
+ // If the original image is stored in an 8-bit buffer, then we can point the
+ // lowest pyramid level at that buffer rather than allocating a new one.
+ int first_allocated_level = image_is_16bit ? 0 : 1;
+
+ for (int level = first_allocated_level; level < n_levels; level++) {
+ int level_width = width >> level;
+ int level_height = height >> level;
+
+ // Allocate padding for each layer
+ int padded_width = level_width + 2 * PYRAMID_PADDING;
+ int padded_height = level_height + 2 * PYRAMID_PADDING;
+
+ // Align the layer stride to be a multiple of PYRAMID_ALIGNMENT
+ // This ensures that, as long as the top-left pixel in this pyramid level is
+ // properly aligned, then so will the leftmost pixel in every row of the
+ // pyramid level.
+ int level_stride =
+ (padded_width + PYRAMID_ALIGNMENT - 1) & ~(PYRAMID_ALIGNMENT - 1);
+
+ buffer_size += level_stride * padded_height;
+ }
+
+ alloc_size += buffer_size;
+
+ return alloc_size;
+}
+
+ImagePyramid *aom_alloc_pyramid(int width, int height, int n_levels,
+ bool image_is_16bit) {
+ // Limit number of levels on small frames
+ const int msb = get_msb(AOMMIN(width, height));
+ const int max_levels = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1);
+ n_levels = AOMMIN(n_levels, max_levels);
+
+ ImagePyramid *pyr = aom_calloc(1, sizeof(*pyr));
+ if (!pyr) {
+ return NULL;
+ }
+
+ pyr->layers = aom_calloc(n_levels, sizeof(*pyr->layers));
+ if (!pyr->layers) {
+ aom_free(pyr);
+ return NULL;
+ }
+
+ pyr->valid = false;
+ pyr->n_levels = n_levels;
+
+ // Compute sizes and offsets for each pyramid level
+ // These are gathered up first, so that we can allocate all pyramid levels
+ // in a single buffer
+ size_t buffer_size = 0;
+ size_t *layer_offsets = aom_calloc(n_levels, sizeof(*layer_offsets));
+ if (!layer_offsets) {
+ aom_free(pyr->layers);
+ aom_free(pyr);
+ return NULL;
+ }
+
+ // Work out if we need to allocate a few extra bytes for alignment.
+ // aom_memalign() will ensure that the start of the allocation is aligned
+ // to a multiple of PYRAMID_ALIGNMENT. But we want the first image pixel
+ // to be aligned, not the first byte of the allocation.
+ //
+ // In the loop below, we ensure that the stride of every image is a multiple
+ // of PYRAMID_ALIGNMENT. Thus the allocated size of each pyramid level will
+ // also be a multiple of PYRAMID_ALIGNMENT. Thus, as long as we can get the
+ // first pixel in the first pyramid layer aligned properly, that will
+ // automatically mean that the first pixel of every row of every layer is
+ // properly aligned too.
+ //
+ // Thus all we need to consider is the first pixel in the first layer.
+ // This is located at offset
+ // extra_bytes + level_stride * PYRAMID_PADDING + PYRAMID_PADDING
+ // bytes into the buffer. Since level_stride is a multiple of
+ // PYRAMID_ALIGNMENT, we can ignore that. So we need
+ // extra_bytes + PYRAMID_PADDING = multiple of PYRAMID_ALIGNMENT
+ //
+ // To solve this, we can round PYRAMID_PADDING up to the next multiple
+  // of PYRAMID_ALIGNMENT, then subtract the original value to calculate
+ // how many extra bytes are needed.
+ size_t first_px_offset =
+ (PYRAMID_PADDING + PYRAMID_ALIGNMENT - 1) & ~(PYRAMID_ALIGNMENT - 1);
+ size_t extra_bytes = first_px_offset - PYRAMID_PADDING;
+ buffer_size += extra_bytes;
+
+ // If the original image is stored in an 8-bit buffer, then we can point the
+ // lowest pyramid level at that buffer rather than allocating a new one.
+ int first_allocated_level = image_is_16bit ? 0 : 1;
+
+ for (int level = first_allocated_level; level < n_levels; level++) {
+ PyramidLayer *layer = &pyr->layers[level];
+
+ int level_width = width >> level;
+ int level_height = height >> level;
+
+ // Allocate padding for each layer
+ int padded_width = level_width + 2 * PYRAMID_PADDING;
+ int padded_height = level_height + 2 * PYRAMID_PADDING;
+
+ // Align the layer stride to be a multiple of PYRAMID_ALIGNMENT
+ // This ensures that, as long as the top-left pixel in this pyramid level is
+ // properly aligned, then so will the leftmost pixel in every row of the
+ // pyramid level.
+ int level_stride =
+ (padded_width + PYRAMID_ALIGNMENT - 1) & ~(PYRAMID_ALIGNMENT - 1);
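+    // Worked example (editorial illustration only): a 45-pixel-wide level is
+    // padded to 45 + 2 * 16 == 77 bytes per row, and the stride rounds up to
+    // (77 + 31) & ~31 == 96, keeping every row start 32-byte aligned relative
+    // to the first one.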
+
+ size_t level_alloc_start = buffer_size;
+ size_t level_start =
+ level_alloc_start + PYRAMID_PADDING * level_stride + PYRAMID_PADDING;
+
+ buffer_size += level_stride * padded_height;
+
+ layer_offsets[level] = level_start;
+ layer->width = level_width;
+ layer->height = level_height;
+ layer->stride = level_stride;
+ }
+
+ pyr->buffer_alloc =
+ aom_memalign(PYRAMID_ALIGNMENT, buffer_size * sizeof(*pyr->buffer_alloc));
+ if (!pyr->buffer_alloc) {
+ aom_free(pyr->layers);
+ aom_free(pyr);
+ aom_free(layer_offsets);
+ return NULL;
+ }
+
+ // Fill in pointers for each level
+ // If image is 8-bit, then the lowest level is left unconfigured for now,
+ // and will be set up properly when the pyramid is filled in
+ for (int level = first_allocated_level; level < n_levels; level++) {
+ PyramidLayer *layer = &pyr->layers[level];
+ layer->buffer = pyr->buffer_alloc + layer_offsets[level];
+ }
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_init(&pyr->mutex, NULL);
+#endif // CONFIG_MULTITHREAD
+
+ aom_free(layer_offsets);
+ return pyr;
+}
+
+// Fill the border region of a pyramid frame.
+// This must be called after the main image area is filled out.
+// `img_buf` should point to the first pixel in the image area,
+// i.e. it should be pyr->layers[level].buffer.
+static INLINE void fill_border(uint8_t *img_buf, const int width,
+ const int height, const int stride) {
+ // Fill left and right areas
+ for (int row = 0; row < height; row++) {
+ uint8_t *row_start = &img_buf[row * stride];
+ uint8_t left_pixel = row_start[0];
+ memset(row_start - PYRAMID_PADDING, left_pixel, PYRAMID_PADDING);
+ uint8_t right_pixel = row_start[width - 1];
+ memset(row_start + width, right_pixel, PYRAMID_PADDING);
+ }
+
+ // Fill top area
+ for (int row = -PYRAMID_PADDING; row < 0; row++) {
+ uint8_t *row_start = &img_buf[row * stride];
+ memcpy(row_start - PYRAMID_PADDING, img_buf - PYRAMID_PADDING,
+ width + 2 * PYRAMID_PADDING);
+ }
+
+ // Fill bottom area
+ uint8_t *last_row_start = &img_buf[(height - 1) * stride];
+ for (int row = height; row < height + PYRAMID_PADDING; row++) {
+ uint8_t *row_start = &img_buf[row * stride];
+ memcpy(row_start - PYRAMID_PADDING, last_row_start - PYRAMID_PADDING,
+ width + 2 * PYRAMID_PADDING);
+ }
+}
+
+// Compute coarse to fine pyramids for a frame
+// This must only be called while holding frame_pyr->mutex
+static INLINE bool fill_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth,
+ ImagePyramid *frame_pyr) {
+ int n_levels = frame_pyr->n_levels;
+ const int frame_width = frame->y_crop_width;
+ const int frame_height = frame->y_crop_height;
+ const int frame_stride = frame->y_stride;
+ assert((frame_width >> n_levels) >= 0);
+ assert((frame_height >> n_levels) >= 0);
+
+ PyramidLayer *first_layer = &frame_pyr->layers[0];
+ if (frame->flags & YV12_FLAG_HIGHBITDEPTH) {
+ // For frames stored in a 16-bit buffer, we need to downconvert to 8 bits
+ assert(first_layer->width == frame_width);
+ assert(first_layer->height == frame_height);
+
+ uint16_t *frame_buffer = CONVERT_TO_SHORTPTR(frame->y_buffer);
+ uint8_t *pyr_buffer = first_layer->buffer;
+ int pyr_stride = first_layer->stride;
+ for (int y = 0; y < frame_height; y++) {
+ uint16_t *frame_row = frame_buffer + y * frame_stride;
+ uint8_t *pyr_row = pyr_buffer + y * pyr_stride;
+ for (int x = 0; x < frame_width; x++) {
+ pyr_row[x] = frame_row[x] >> (bit_depth - 8);
+ }
+ }
+
+ fill_border(pyr_buffer, frame_width, frame_height, pyr_stride);
+ } else {
+ // For frames stored in an 8-bit buffer, we need to configure the first
+ // pyramid layer to point at the original image buffer
+ first_layer->buffer = frame->y_buffer;
+ first_layer->width = frame_width;
+ first_layer->height = frame_height;
+ first_layer->stride = frame_stride;
+ }
+
+ // Fill in the remaining levels through progressive downsampling
+ for (int level = 1; level < n_levels; ++level) {
+ PyramidLayer *prev_layer = &frame_pyr->layers[level - 1];
+ uint8_t *prev_buffer = prev_layer->buffer;
+ int prev_stride = prev_layer->stride;
+
+ PyramidLayer *this_layer = &frame_pyr->layers[level];
+ uint8_t *this_buffer = this_layer->buffer;
+ int this_width = this_layer->width;
+ int this_height = this_layer->height;
+ int this_stride = this_layer->stride;
+
+    // Compute this pyramid level by downsampling the previous level.
+ //
+ // We downsample by a factor of exactly 2, clipping the rightmost and
+ // bottommost pixel off of the current level if needed. We do this for
+ // two main reasons:
+ //
+ // 1) In the disflow code, when stepping from a higher pyramid level to a
+ // lower pyramid level, we need to not just interpolate the flow field
+ // but also to scale each flow vector by the upsampling ratio.
+ // So it is much more convenient if this ratio is simply 2.
+ //
+ // 2) Up/downsampling by a factor of 2 can be implemented much more
+ // efficiently than up/downsampling by a generic ratio.
+ // TODO(rachelbarker): Use optimized downsample-by-2 function
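+    // Worked example (editorial illustration only): if the previous level is
+    // 101x75, this level is 50x37 and av1_resize_plane() is told the source
+    // is (50 << 1) x (37 << 1) == 100x74, so the rightmost column and bottom
+    // row of the previous level are clipped off, as described above.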
+ if (!av1_resize_plane(prev_buffer, this_height << 1, this_width << 1,
+ prev_stride, this_buffer, this_height, this_width,
+ this_stride))
+ return false;
+ fill_border(this_buffer, this_width, this_height, this_stride);
+ }
+ return true;
+}
+
+// Fill out a downsampling pyramid for a given frame.
+//
+// The top level (index 0) will always be an 8-bit copy of the input frame,
+// regardless of the input bit depth. Additional levels are then downscaled
+// by powers of 2.
+//
+// For small input frames, the number of levels actually constructed
+// will be limited so that the smallest image is at least MIN_PYRAMID_SIZE
+// pixels along each side.
+//
+// However, if the input frame has a side of length < MIN_PYRAMID_SIZE,
+// we will still construct the top level.
+bool aom_compute_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth,
+ ImagePyramid *pyr) {
+ assert(pyr);
+
+ // Per the comments in the ImagePyramid struct, we must take this mutex
+ // before reading or writing the "valid" flag, and hold it while computing
+ // the pyramid, to ensure proper behaviour if multiple threads call this
+ // function simultaneously
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(&pyr->mutex);
+#endif // CONFIG_MULTITHREAD
+
+ if (!pyr->valid) {
+ pyr->valid = fill_pyramid(frame, bit_depth, pyr);
+ }
+ bool valid = pyr->valid;
+
+  // If the pyramid was computed successfully, it can now be read safely
+  // without holding the mutex any more, since its contents will not change
+  // until the parent frame buffer is recycled.
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(&pyr->mutex);
+#endif // CONFIG_MULTITHREAD
+ return valid;
+}
+
+#ifndef NDEBUG
+// Check if a pyramid has already been computed.
+// This is mostly a debug helper - as it is necessary to hold pyr->mutex
+// while reading the valid flag, we cannot just write:
+// assert(pyr->valid);
+// This function allows the check to be correctly written as:
+// assert(aom_is_pyramid_valid(pyr));
+bool aom_is_pyramid_valid(ImagePyramid *pyr) {
+ assert(pyr);
+
+ // Per the comments in the ImagePyramid struct, we must take this mutex
+ // before reading or writing the "valid" flag, and hold it while computing
+ // the pyramid, to ensure proper behaviour if multiple threads call this
+ // function simultaneously
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(&pyr->mutex);
+#endif // CONFIG_MULTITHREAD
+
+ bool valid = pyr->valid;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(&pyr->mutex);
+#endif // CONFIG_MULTITHREAD
+
+ return valid;
+}
+#endif
+
+// Mark a pyramid as no longer containing valid data.
+// This must be done whenever the corresponding frame buffer is reused
+void aom_invalidate_pyramid(ImagePyramid *pyr) {
+ if (pyr) {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(&pyr->mutex);
+#endif // CONFIG_MULTITHREAD
+ pyr->valid = false;
+#if CONFIG_MULTITHREAD
+ pthread_mutex_unlock(&pyr->mutex);
+#endif // CONFIG_MULTITHREAD
+ }
+}
+
+// Release the memory associated with a pyramid
+void aom_free_pyramid(ImagePyramid *pyr) {
+ if (pyr) {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_destroy(&pyr->mutex);
+#endif // CONFIG_MULTITHREAD
+ aom_free(pyr->buffer_alloc);
+ aom_free(pyr->layers);
+ aom_free(pyr);
+ }
+}
diff --git a/third_party/aom/aom_dsp/pyramid.h b/third_party/aom/aom_dsp/pyramid.h
new file mode 100644
index 0000000000..9442a1ff08
--- /dev/null
+++ b/third_party/aom/aom_dsp/pyramid.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_PYRAMID_H_
+#define AOM_AOM_DSP_PYRAMID_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "config/aom_config.h"
+
+#include "aom_scale/yv12config.h"
+#include "aom_util/aom_thread.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Minimum dimensions of a downsampled image
+#define MIN_PYRAMID_SIZE_LOG2 3
+#define MIN_PYRAMID_SIZE (1 << MIN_PYRAMID_SIZE_LOG2)
+
+// Size of border around each pyramid image, in pixels
+// Similarly to the border around regular image buffers, this border is filled
+// with copies of the outermost pixels of the frame, to allow for more efficient
+// convolution code
+// TODO(rachelbarker): How many pixels do we actually need here?
+// I think we only need 9 for disflow, but how many for corner matching?
+#define PYRAMID_PADDING 16
+
+// Byte alignment of each line within the image pyramids.
+// That is, the first pixel inside the image (ie, not in the border region),
+// on each row of each pyramid level, is aligned to this byte alignment.
+// This value must be a power of 2.
+#define PYRAMID_ALIGNMENT 32
+
+typedef struct {
+ uint8_t *buffer;
+ int width;
+ int height;
+ int stride;
+} PyramidLayer;
+
+// Struct for an image pyramid
+typedef struct image_pyramid {
+#if CONFIG_MULTITHREAD
+ // Mutex which is used to prevent the pyramid being computed twice at the
+ // same time
+ //
+ // Semantics:
+ // * This mutex must be held whenever reading or writing the `valid` flag
+ //
+ // * This mutex must also be held while computing the image pyramid,
+ // to ensure that only one thread may do so at a time.
+ //
+ // * However, once you have read the valid flag and seen a true value,
+ // it is safe to drop the mutex and read from the remaining fields.
+ // This is because, once the image pyramid is computed, its contents
+ // will not be changed until the parent frame buffer is recycled,
+ // which will not happen until there are no more outstanding references
+ // to the frame buffer.
+ pthread_mutex_t mutex;
+#endif
+ // Flag indicating whether the pyramid contains valid data
+ bool valid;
+ // Number of allocated/filled levels in this pyramid
+ int n_levels;
+ // Pointer to allocated buffer
+ uint8_t *buffer_alloc;
+ // Data for each level
+ // The `buffer` pointers inside this array point into the region which
+ // is stored in the `buffer_alloc` field here
+ PyramidLayer *layers;
+} ImagePyramid;
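+// Typical usage of the locking protocol described above (editorial sketch
+// only; aom_compute_pyramid() and aom_is_pyramid_valid() below implement the
+// real thing):
+//   pthread_mutex_lock(&pyr->mutex);
+//   if (!pyr->valid) pyr->valid = /* fill in the pyramid levels */;
+//   bool valid = pyr->valid;
+//   pthread_mutex_unlock(&pyr->mutex);
+//   // If `valid` is true, pyr->layers may now be read without the mutex.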
+
+size_t aom_get_pyramid_alloc_size(int width, int height, int n_levels,
+ bool image_is_16bit);
+
+ImagePyramid *aom_alloc_pyramid(int width, int height, int n_levels,
+ bool image_is_16bit);
+
+// Fill out a downsampling pyramid for a given frame.
+//
+// The top level (index 0) will always be an 8-bit copy of the input frame,
+// regardless of the input bit depth. Additional levels are then downscaled
+// by powers of 2.
+//
+// For small input frames, the number of levels actually constructed
+// will be limited so that the smallest image is at least MIN_PYRAMID_SIZE
+// pixels along each side.
+//
+// However, if the input frame has a side of length < MIN_PYRAMID_SIZE,
+// we will still construct the top level.
+bool aom_compute_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth,
+ ImagePyramid *pyr);
+
+#ifndef NDEBUG
+// Check if a pyramid has already been computed.
+// This is mostly a debug helper - as it is necessary to hold pyr->mutex
+// while reading the valid flag, we cannot just write:
+// assert(pyr->valid);
+// This function allows the check to be correctly written as:
+// assert(aom_is_pyramid_valid(pyr));
+bool aom_is_pyramid_valid(ImagePyramid *pyr);
+#endif
+
+// Mark a pyramid as no longer containing valid data.
+// This must be done whenever the corresponding frame buffer is reused
+void aom_invalidate_pyramid(ImagePyramid *pyr);
+
+// Release the memory associated with a pyramid
+void aom_free_pyramid(ImagePyramid *pyr);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AOM_DSP_PYRAMID_H_
diff --git a/third_party/aom/aom_dsp/quantize.c b/third_party/aom/aom_dsp/quantize.c
new file mode 100644
index 0000000000..e5c960b826
--- /dev/null
+++ b/third_party/aom/aom_dsp/quantize.c
@@ -0,0 +1,472 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/quantize.h"
+#include "aom_mem/aom_mem.h"
+#include "config/aom_dsp_rtcd.h"
+
+#if !CONFIG_REALTIME_ONLY
+void aom_quantize_b_adaptive_helper_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, const int log_scale) {
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+ const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+ int i, non_zero_count = (int)n_coeffs, eob = -1;
+ (void)iscan;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ int prescan_add[2];
+ for (i = 0; i < 2; ++i)
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+
+ // Pre-scan pass
+ for (i = (int)n_coeffs - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int coeff = coeff_ptr[rc] * wt;
+ const int prescan_add_val = prescan_add[rc != 0];
+ if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) &&
+ coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val))
+ non_zero_count--;
+ else
+ break;
+ }
+
+  // Quantization pass: all coefficients with index >= non_zero_count are
+  // skippable. Note: non_zero_count can be zero.
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;
+#endif // SKIP_EOB_FACTOR_ADJUST
+ for (i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
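+    // The two lines above compute |coeff| without a branch (editorial note):
+    // e.g. for coeff == -7, coeff_sign == -1, so (-7 ^ -1) - (-1) == 6 + 1
+    // == 7, while for a non-negative coeff the mask is 0 and the value is
+    // left unchanged.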
+ int tmp32;
+
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) {
+ int64_t tmp =
+ clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale),
+ INT16_MIN, INT16_MAX);
+ tmp *= wt;
+ tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+ quant_shift_ptr[rc != 0]) >>
+ (16 - log_scale + AOM_QM_BITS)); // quantization
+ qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+ const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int dequant =
+ (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+ AOM_QM_BITS;
+ const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale;
+ dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+
+ if (tmp32) {
+ eob = i;
+#if SKIP_EOB_FACTOR_ADJUST
+ if (first == -1) first = i;
+#endif // SKIP_EOB_FACTOR_ADJUST
+ }
+ }
+ }
+#if SKIP_EOB_FACTOR_ADJUST
+ if (eob >= 0 && first == eob) {
+ const int rc = scan[eob];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int coeff = coeff_ptr[rc] * wt;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) &&
+ coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ eob = -1;
+ }
+ }
+ }
+#endif // SKIP_EOB_FACTOR_ADJUST
+ *eob_ptr = eob + 1;
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+void aom_quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr,
+ const int log_scale) {
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+ const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+ int i, non_zero_count = (int)n_coeffs, eob = -1;
+ (void)iscan;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ // Pre-scan pass
+ for (i = (int)n_coeffs - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int coeff = coeff_ptr[rc] * wt;
+
+ if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS)) &&
+ coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS)))
+ non_zero_count--;
+ else
+ break;
+ }
+
+  // Quantization pass: all coefficients with scan index >= non_zero_count
+  // are skippable. Note: non_zero_count can be zero.
+ for (i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ int tmp32;
+
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) {
+ int64_t tmp =
+ clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale),
+ INT16_MIN, INT16_MAX);
+ tmp *= wt;
+ tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+ quant_shift_ptr[rc != 0]) >>
+ (16 - log_scale + AOM_QM_BITS)); // quantization
+ qcoeff_ptr[rc] = (tmp32 ^ coeff_sign) - coeff_sign;
+ const int iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int dequant =
+ (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+ AOM_QM_BITS;
+ const tran_low_t abs_dqcoeff = (tmp32 * dequant) >> log_scale;
+ dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+
+ if (tmp32) eob = i;
+ }
+ }
+ *eob_ptr = eob + 1;
+}
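
[Editorial note] To make the fixed-point arithmetic in the two helpers above easier to follow, here is a minimal standalone sketch of the per-coefficient path when no quantization matrices are in use (qm_ptr and iqm_ptr NULL, so the weight terms are the identity). QM_BITS and RP2 are local stand-ins for AOM_QM_BITS and ROUND_POWER_OF_TWO, and the parameter values are whatever the caller supplies; this is illustrative, not the library's implementation.

#include <stdint.h>
#include <stdlib.h>

#define QM_BITS 5 /* assumption: local stand-in for AOM_QM_BITS */
#define RP2(v, n) (((v) + ((1 << (n)) >> 1)) >> (n)) /* rounded right shift */

static int32_t quantize_one_coeff(int32_t coeff, int16_t zbin, int16_t round,
                                  int16_t quant, int16_t quant_shift,
                                  int16_t dequant, int log_scale,
                                  int32_t *dqcoeff) {
  const int wt = 1 << QM_BITS; /* identity weight: no quant matrix in use */
  const int sign = coeff < 0 ? -1 : 1;
  const int abs_coeff = abs(coeff);

  if (abs_coeff * wt < (RP2(zbin, log_scale) << QM_BITS)) {
    *dqcoeff = 0; /* inside the dead zone: quantized away */
    return 0;
  }
  int64_t tmp = abs_coeff + RP2(round, log_scale);
  if (tmp > INT16_MAX) tmp = INT16_MAX; /* the helpers clamp to int16 range */
  tmp *= wt;
  const int abs_q = (int)(((((tmp * quant) >> 16) + tmp) * quant_shift) >>
                          (16 - log_scale + QM_BITS));
  *dqcoeff = sign * (int32_t)(((int64_t)abs_q * dequant) >> log_scale);
  return sign * abs_q;
}

With log_scale = 0 this reduces to the familiar pattern of (abs_coeff + round) scaled by quant/2^16, refined by quant_shift, with dequantization simply multiplying back by dequant.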
+
+#if CONFIG_AV1_HIGHBITDEPTH
+#if !CONFIG_REALTIME_ONLY
+void aom_highbd_quantize_b_adaptive_helper_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, const int log_scale) {
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+ const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+ (void)iscan;
+ int i, non_zero_count = (int)n_coeffs, eob = -1;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ int prescan_add[2];
+ for (i = 0; i < 2; ++i)
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+
+ // Pre-scan pass
+ for (i = (int)n_coeffs - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int coeff = coeff_ptr[rc] * wt;
+ const int prescan_add_val = prescan_add[rc != 0];
+ if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) &&
+ coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val))
+ non_zero_count--;
+ else
+ break;
+ }
+
+  // Quantization pass: all coefficients with scan index >= non_zero_count
+  // are skippable. Note: non_zero_count can be zero.
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;
+#endif // SKIP_EOB_FACTOR_ADJUST
+ for (i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = AOMSIGN(coeff);
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ if (abs_coeff * wt >= (zbins[rc != 0] << AOM_QM_BITS)) {
+ const int64_t tmp1 =
+ abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale);
+ const int64_t tmpw = tmp1 * wt;
+ const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
+ const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >>
+ (16 - log_scale + AOM_QM_BITS));
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int dequant =
+ (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >>
+ AOM_QM_BITS;
+ const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
+ dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ if (abs_qcoeff) {
+ eob = i;
+#if SKIP_EOB_FACTOR_ADJUST
+ if (first == -1) first = eob;
+#endif // SKIP_EOB_FACTOR_ADJUST
+ }
+ }
+ }
+#if SKIP_EOB_FACTOR_ADJUST
+ if (eob >= 0 && first == eob) {
+ const int rc = scan[eob];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int coeff = coeff_ptr[rc] * wt;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val) &&
+ coeff > (nzbins[rc != 0] * (1 << AOM_QM_BITS) - prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ eob = -1;
+ }
+ }
+ }
+#endif // SKIP_EOB_FACTOR_ADJUST
+ *eob_ptr = eob + 1;
+}
+#endif // !CONFIG_REALTIME_ONLY
+
+void aom_highbd_quantize_b_helper_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, const int log_scale) {
+ int i, eob = -1;
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+ const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
+ int dequant;
+ int idx_arr[4096];
+ (void)iscan;
+ int idx = 0;
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ // Pre-scan pass
+ for (i = 0; i < n_coeffs; i++) {
+ const int rc = scan[i];
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int coeff = coeff_ptr[rc] * wt;
+
+ // If the coefficient is out of the base ZBIN range, keep it for
+ // quantization.
+ if (coeff >= (zbins[rc != 0] * (1 << AOM_QM_BITS)) ||
+ coeff <= (nzbins[rc != 0] * (1 << AOM_QM_BITS)))
+ idx_arr[idx++] = i;
+ }
+
+ // Quantization pass: only process the coefficients selected in
+ // pre-scan pass. Note: idx can be zero.
+ for (i = 0; i < idx; i++) {
+ const int rc = scan[idx_arr[i]];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = AOMSIGN(coeff);
+ const qm_val_t wt = qm_ptr != NULL ? qm_ptr[rc] : (1 << AOM_QM_BITS);
+ const qm_val_t iwt = iqm_ptr != NULL ? iqm_ptr[rc] : (1 << AOM_QM_BITS);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp1 =
+ abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], log_scale);
+ const int64_t tmpw = tmp1 * wt;
+ const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
+ const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >>
+ (16 - log_scale + AOM_QM_BITS));
+ qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+ dequant =
+ (dequant_ptr[rc != 0] * iwt + (1 << (AOM_QM_BITS - 1))) >> AOM_QM_BITS;
+ const tran_low_t abs_dqcoeff = (abs_qcoeff * dequant) >> log_scale;
+ dqcoeff_ptr[rc] = (tran_low_t)((abs_dqcoeff ^ coeff_sign) - coeff_sign);
+ if (abs_qcoeff) eob = idx_arr[i];
+ }
+ *eob_ptr = eob + 1;
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#if !CONFIG_REALTIME_ONLY
+/* These functions should only be called when quantization matrices
+   are not used. */
+void aom_quantize_b_adaptive_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ aom_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+ iscan, NULL, NULL, 0);
+}
+
+void aom_quantize_b_32x32_adaptive_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ aom_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+ iscan, NULL, NULL, 1);
+}
+
+void aom_quantize_b_64x64_adaptive_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ aom_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
+ iscan, NULL, NULL, 2);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_quantize_b_adaptive_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ aom_highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr,
+ round_ptr, quant_ptr, quant_shift_ptr,
+ qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, NULL, NULL, 0);
+}
+
+void aom_highbd_quantize_b_32x32_adaptive_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ aom_highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr,
+ round_ptr, quant_ptr, quant_shift_ptr,
+ qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, NULL, NULL, 1);
+}
+
+void aom_highbd_quantize_b_64x64_adaptive_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ aom_highbd_quantize_b_adaptive_helper_c(coeff_ptr, n_coeffs, zbin_ptr,
+ round_ptr, quant_ptr, quant_shift_ptr,
+ qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, NULL, NULL, 2);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // !CONFIG_REALTIME_ONLY
+
+void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ aom_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, NULL, NULL, 0);
+}
+
+void aom_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ aom_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, NULL, NULL, 1);
+}
+
+void aom_quantize_b_64x64_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ aom_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr, quant_ptr,
+ quant_shift_ptr, qcoeff_ptr, dqcoeff_ptr, dequant_ptr,
+ eob_ptr, scan, iscan, NULL, NULL, 2);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ aom_highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+ NULL, NULL, 0);
+}
+
+void aom_highbd_quantize_b_32x32_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ aom_highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+ NULL, NULL, 1);
+}
+
+void aom_highbd_quantize_b_64x64_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ aom_highbd_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
+ NULL, NULL, 2);
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
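
[Editorial note] The thin wrappers above differ only in their final log_scale argument: 0 for the base sizes, 1 for the 32x32 variants, 2 for the 64x64 variants. Below is a hypothetical helper for choosing between them, keyed off the function naming only; the codec's real selection logic is derived from its transform-size enum and may treat rectangular sizes differently.

/* Hypothetical: picks log_scale by largest transform dimension, following
 * only the naming of the wrappers above. Not libaom code. */
static int quantize_log_scale_for_dim(int max_tx_dim) {
  if (max_tx_dim >= 64) return 2; /* aom_quantize_b_64x64_c family */
  if (max_tx_dim >= 32) return 1; /* aom_quantize_b_32x32_c family */
  return 0;                       /* aom_quantize_b_c family */
}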
diff --git a/third_party/aom/aom_dsp/quantize.h b/third_party/aom/aom_dsp/quantize.h
new file mode 100644
index 0000000000..efe253ddb9
--- /dev/null
+++ b/third_party/aom/aom_dsp/quantize.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_QUANTIZE_H_
+#define AOM_AOM_DSP_QUANTIZE_H_
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define EOB_FACTOR 325
+#define SKIP_EOB_FACTOR_ADJUST 200
+
+void aom_quantize_b_adaptive_helper_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, const int log_scale);
+
+void aom_quantize_b_adaptive_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan);
+
+void aom_quantize_b_32x32_adaptive_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan);
+
+void aom_quantize_b_64x64_adaptive_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_quantize_b_adaptive_helper_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, const int log_scale);
+
+void aom_highbd_quantize_b_adaptive_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan);
+
+void aom_highbd_quantize_b_32x32_adaptive_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan);
+
+void aom_highbd_quantize_b_64x64_adaptive_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan);
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+void aom_quantize_b_helper_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan,
+ const qm_val_t *qm_ptr, const qm_val_t *iqm_ptr,
+ const int log_scale);
+
+void aom_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_quantize_b_helper_c(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan, const qm_val_t *qm_ptr,
+ const qm_val_t *iqm_ptr, const int log_scale);
+
+void aom_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan);
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_QUANTIZE_H_
diff --git a/third_party/aom/aom_dsp/recenter.h b/third_party/aom/aom_dsp/recenter.h
new file mode 100644
index 0000000000..b3fd412907
--- /dev/null
+++ b/third_party/aom/aom_dsp/recenter.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_RECENTER_H_
+#define AOM_AOM_DSP_RECENTER_H_
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+// Inverse recenters a non-negative literal v around a reference r
+static INLINE uint16_t inv_recenter_nonneg(uint16_t r, uint16_t v) {
+ if (v > (r << 1))
+ return v;
+ else if ((v & 1) == 0)
+ return (v >> 1) + r;
+ else
+ return r - ((v + 1) >> 1);
+}
+
+// Inverse recenters a non-negative literal v in [0, n-1] around a
+// reference r also in [0, n-1]
+static INLINE uint16_t inv_recenter_finite_nonneg(uint16_t n, uint16_t r,
+ uint16_t v) {
+ if ((r << 1) <= n) {
+ return inv_recenter_nonneg(r, v);
+ } else {
+ return n - 1 - inv_recenter_nonneg(n - 1 - r, v);
+ }
+}
+
+// Recenters a non-negative literal v around a reference r
+static INLINE uint16_t recenter_nonneg(uint16_t r, uint16_t v) {
+ if (v > (r << 1))
+ return v;
+ else if (v >= r)
+ return ((v - r) << 1);
+ else
+ return ((r - v) << 1) - 1;
+}
+
+// Recenters a non-negative literal v in [0, n-1] around a
+// reference r also in [0, n-1]
+static INLINE uint16_t recenter_finite_nonneg(uint16_t n, uint16_t r,
+ uint16_t v) {
+ if ((r << 1) <= n) {
+ return recenter_nonneg(r, v);
+ } else {
+ return recenter_nonneg(n - 1 - r, n - 1 - v);
+ }
+}
+
+#endif // AOM_AOM_DSP_RECENTER_H_
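
[Editorial note] As a quick sanity check of the helpers above, recentering followed by inverse recentering returns the original value for any v given the same reference r. A small illustrative sketch (arbitrary values, assuming this header is on the include path):

#include <assert.h>
#include <stdint.h>
#include "aom_dsp/recenter.h"

/* Round-trip property of the recentering helpers above (illustrative). */
static void recenter_roundtrip_example(void) {
  const uint16_t r = 10; /* arbitrary reference value */
  for (uint16_t v = 0; v <= 30; ++v) {
    const uint16_t code = recenter_nonneg(r, v);
    assert(inv_recenter_nonneg(r, code) == v); /* inverse recovers v */
  }
}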
diff --git a/third_party/aom/aom_dsp/rect.h b/third_party/aom/aom_dsp/rect.h
new file mode 100644
index 0000000000..11bdaca979
--- /dev/null
+++ b/third_party/aom/aom_dsp/rect.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_RECT_H_
+#define AOM_AOM_DSP_RECT_H_
+
+#include "config/aom_config.h"
+
+#include <stdbool.h>
+
+// Struct representing a rectangle of pixels.
+// The axes are inclusive-exclusive, i.e. the point (top, left) is included
+// in the rectangle but (bottom, right) is not.
+typedef struct {
+ int left, right, top, bottom;
+} PixelRect;
+
+static INLINE int rect_width(const PixelRect *r) { return r->right - r->left; }
+
+static INLINE int rect_height(const PixelRect *r) { return r->bottom - r->top; }
+
+static INLINE bool is_inside_rect(const int x, const int y,
+ const PixelRect *r) {
+ return (r->left <= x && x < r->right) && (r->top <= y && y < r->bottom);
+}
+
+#endif // AOM_AOM_DSP_RECT_H_
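
[Editorial note] A quick illustrative use of the struct and accessors above; the coordinates are arbitrary, and the snippet only assumes this header is on the include path.

#include "aom_dsp/rect.h"

/* Illustrative use of PixelRect (fields are left, right, top, bottom). */
static void pixel_rect_example(void) {
  const PixelRect r = { /*left=*/16, /*right=*/48, /*top=*/0, /*bottom=*/32 };
  const int w = rect_width(&r);               /* 32 */
  const int h = rect_height(&r);              /* 32 */
  const bool in = is_inside_rect(47, 31, &r); /* true */
  const bool out = is_inside_rect(48, 31, &r); /* false: right is exclusive */
  (void)w;
  (void)h;
  (void)in;
  (void)out;
}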
diff --git a/third_party/aom/aom_dsp/sad.c b/third_party/aom/aom_dsp/sad.c
new file mode 100644
index 0000000000..8d69e3bf1c
--- /dev/null
+++ b/third_party/aom/aom_dsp/sad.c
@@ -0,0 +1,389 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/blend.h"
+
+/* Sum the absolute differences between corresponding elements of the
+ * buffers. */
+static INLINE unsigned int sad(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
+ int y, x;
+ unsigned int sad = 0;
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) {
+ sad += abs(a[x] - b[x]);
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+ return sad;
+}
+
+#define SADMXN(m, n) \
+ unsigned int aom_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride) { \
+ return sad(src, src_stride, ref, ref_stride, m, n); \
+ } \
+ unsigned int aom_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ uint8_t comp_pred[m * n]; \
+ aom_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
+ return sad(src, src_stride, comp_pred, m, m, n); \
+ } \
+ unsigned int aom_dist_wtd_sad##m##x##n##_avg_c( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint8_t comp_pred[m * n]; \
+ aom_dist_wtd_comp_avg_pred_c(comp_pred, second_pred, m, n, ref, \
+ ref_stride, jcp_param); \
+ return sad(src, src_stride, comp_pred, m, m, n); \
+ } \
+ unsigned int aom_sad_skip_##m##x##n##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * sad(src, 2 * src_stride, ref, 2 * ref_stride, (m), (n / 2)); \
+ }
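
[Editorial note] For a concrete sense of what the macro above generates: once instantiated further below it yields, among others, aom_sad4x4_c, which simply totals the 16 per-pixel absolute differences. A small hedged sketch with arbitrary buffer contents:

#include <stdint.h>
#include "config/aom_dsp_rtcd.h"

/* Illustrative: SAD of two 4x4 blocks via the C kernel generated above. */
static unsigned int sad4x4_example(void) {
  const uint8_t src[4 * 4] = { 10, 10, 10, 10, 20, 20, 20, 20,
                               30, 30, 30, 30, 40, 40, 40, 40 };
  const uint8_t ref[4 * 4] = { 12, 10, 10, 10, 20, 20, 20, 20,
                               30, 30, 30, 30, 40, 40, 40, 37 };
  /* Only two lanes differ: |10 - 12| + |40 - 37| = 5. */
  return aom_sad4x4_c(src, /*src_stride=*/4, ref, /*ref_stride=*/4); /* 5 */
}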
+
+// Calculate sad against 4 reference locations and store each in sad_array
+#define SAD_MXNX4D(m, n) \
+ void aom_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ int i; \
+ for (i = 0; i < 4; ++i) { \
+ sad_array[i] = \
+ aom_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \
+ } \
+ } \
+ void aom_sad_skip_##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ int i; \
+ for (i = 0; i < 4; ++i) { \
+ sad_array[i] = 2 * sad(src, 2 * src_stride, ref_array[i], \
+ 2 * ref_stride, (m), (n / 2)); \
+ } \
+ }
+// Call SIMD version of aom_sad_mxnx4d if the 3d version is unavailable.
+#define SAD_MXNX3D(m, n) \
+ void aom_sad##m##x##n##x3d_c(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ aom_sad##m##x##n##x4d(src, src_stride, ref_array, ref_stride, sad_array); \
+ }
+
+// 128x128
+SADMXN(128, 128)
+SAD_MXNX4D(128, 128)
+SAD_MXNX3D(128, 128)
+
+// 128x64
+SADMXN(128, 64)
+SAD_MXNX4D(128, 64)
+SAD_MXNX3D(128, 64)
+
+// 64x128
+SADMXN(64, 128)
+SAD_MXNX4D(64, 128)
+SAD_MXNX3D(64, 128)
+
+// 64x64
+SADMXN(64, 64)
+SAD_MXNX4D(64, 64)
+SAD_MXNX3D(64, 64)
+
+// 64x32
+SADMXN(64, 32)
+SAD_MXNX4D(64, 32)
+SAD_MXNX3D(64, 32)
+
+// 32x64
+SADMXN(32, 64)
+SAD_MXNX4D(32, 64)
+SAD_MXNX3D(32, 64)
+
+// 32x32
+SADMXN(32, 32)
+SAD_MXNX4D(32, 32)
+SAD_MXNX3D(32, 32)
+
+// 32x16
+SADMXN(32, 16)
+SAD_MXNX4D(32, 16)
+SAD_MXNX3D(32, 16)
+
+// 16x32
+SADMXN(16, 32)
+SAD_MXNX4D(16, 32)
+SAD_MXNX3D(16, 32)
+
+// 16x16
+SADMXN(16, 16)
+SAD_MXNX4D(16, 16)
+SAD_MXNX3D(16, 16)
+
+// 16x8
+SADMXN(16, 8)
+SAD_MXNX4D(16, 8)
+SAD_MXNX3D(16, 8)
+
+// 8x16
+SADMXN(8, 16)
+SAD_MXNX4D(8, 16)
+SAD_MXNX3D(8, 16)
+
+// 8x8
+SADMXN(8, 8)
+SAD_MXNX4D(8, 8)
+SAD_MXNX3D(8, 8)
+
+// 8x4
+SADMXN(8, 4)
+SAD_MXNX4D(8, 4)
+SAD_MXNX3D(8, 4)
+
+// 4x8
+SADMXN(4, 8)
+SAD_MXNX4D(4, 8)
+SAD_MXNX3D(4, 8)
+
+// 4x4
+SADMXN(4, 4)
+SAD_MXNX4D(4, 4)
+SAD_MXNX3D(4, 4)
+
+#if !CONFIG_REALTIME_ONLY
+SADMXN(4, 16)
+SAD_MXNX4D(4, 16)
+SADMXN(16, 4)
+SAD_MXNX4D(16, 4)
+SADMXN(8, 32)
+SAD_MXNX4D(8, 32)
+SADMXN(32, 8)
+SAD_MXNX4D(32, 8)
+SADMXN(16, 64)
+SAD_MXNX4D(16, 64)
+SADMXN(64, 16)
+SAD_MXNX4D(64, 16)
+SAD_MXNX3D(4, 16)
+SAD_MXNX3D(16, 4)
+SAD_MXNX3D(8, 32)
+SAD_MXNX3D(32, 8)
+SAD_MXNX3D(16, 64)
+SAD_MXNX3D(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE unsigned int highbd_sad(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ int width, int height) {
+ int y, x;
+ unsigned int sad = 0;
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) {
+ sad += abs(a[x] - b[x]);
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+ return sad;
+}
+
+static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ int width, int height) {
+ int y, x;
+ unsigned int sad = 0;
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) {
+ sad += abs(a[x] - b[x]);
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+ return sad;
+}
+
+#define HIGHBD_SADMXN(m, n) \
+ unsigned int aom_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, \
+ int ref_stride) { \
+ return highbd_sad(src, src_stride, ref, ref_stride, m, n); \
+ } \
+ unsigned int aom_highbd_sad##m##x##n##_avg_c( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred) { \
+ uint16_t comp_pred[m * n]; \
+ uint8_t *const comp_pred8 = CONVERT_TO_BYTEPTR(comp_pred); \
+ aom_highbd_comp_avg_pred(comp_pred8, second_pred, m, n, ref, ref_stride); \
+ return highbd_sadb(src, src_stride, comp_pred8, m, m, n); \
+ } \
+ unsigned int aom_highbd_dist_wtd_sad##m##x##n##_avg_c( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint16_t comp_pred[m * n]; \
+ uint8_t *const comp_pred8 = CONVERT_TO_BYTEPTR(comp_pred); \
+ aom_highbd_dist_wtd_comp_avg_pred(comp_pred8, second_pred, m, n, ref, \
+ ref_stride, jcp_param); \
+ return highbd_sadb(src, src_stride, comp_pred8, m, m, n); \
+ } \
+ unsigned int aom_highbd_sad_skip_##m##x##n##_c( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * \
+ highbd_sad(src, 2 * src_stride, ref, 2 * ref_stride, (m), (n / 2)); \
+ }
+
+#define HIGHBD_SAD_MXNX4D(m, n) \
+ void aom_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ int i; \
+ for (i = 0; i < 4; ++i) { \
+ sad_array[i] = aom_highbd_sad##m##x##n##_c(src, src_stride, \
+ ref_array[i], ref_stride); \
+ } \
+ } \
+ void aom_highbd_sad_skip_##m##x##n##x4d_c( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ int i; \
+ for (i = 0; i < 4; ++i) { \
+ sad_array[i] = 2 * highbd_sad(src, 2 * src_stride, ref_array[i], \
+ 2 * ref_stride, (m), (n / 2)); \
+ } \
+ }
+// Call SIMD version of aom_highbd_sad_mxnx4d if the 3d version is unavailable.
+#define HIGHBD_SAD_MXNX3D(m, n) \
+ void aom_highbd_sad##m##x##n##x3d_c(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ aom_highbd_sad##m##x##n##x4d(src, src_stride, ref_array, ref_stride, \
+ sad_array); \
+ }
+
+// 128x128
+HIGHBD_SADMXN(128, 128)
+HIGHBD_SAD_MXNX4D(128, 128)
+HIGHBD_SAD_MXNX3D(128, 128)
+
+// 128x64
+HIGHBD_SADMXN(128, 64)
+HIGHBD_SAD_MXNX4D(128, 64)
+HIGHBD_SAD_MXNX3D(128, 64)
+
+// 64x128
+HIGHBD_SADMXN(64, 128)
+HIGHBD_SAD_MXNX4D(64, 128)
+HIGHBD_SAD_MXNX3D(64, 128)
+
+// 64x64
+HIGHBD_SADMXN(64, 64)
+HIGHBD_SAD_MXNX4D(64, 64)
+HIGHBD_SAD_MXNX3D(64, 64)
+
+// 64x32
+HIGHBD_SADMXN(64, 32)
+HIGHBD_SAD_MXNX4D(64, 32)
+HIGHBD_SAD_MXNX3D(64, 32)
+
+// 32x64
+HIGHBD_SADMXN(32, 64)
+HIGHBD_SAD_MXNX4D(32, 64)
+HIGHBD_SAD_MXNX3D(32, 64)
+
+// 32x32
+HIGHBD_SADMXN(32, 32)
+HIGHBD_SAD_MXNX4D(32, 32)
+HIGHBD_SAD_MXNX3D(32, 32)
+
+// 32x16
+HIGHBD_SADMXN(32, 16)
+HIGHBD_SAD_MXNX4D(32, 16)
+HIGHBD_SAD_MXNX3D(32, 16)
+
+// 16x32
+HIGHBD_SADMXN(16, 32)
+HIGHBD_SAD_MXNX4D(16, 32)
+HIGHBD_SAD_MXNX3D(16, 32)
+
+// 16x16
+HIGHBD_SADMXN(16, 16)
+HIGHBD_SAD_MXNX4D(16, 16)
+HIGHBD_SAD_MXNX3D(16, 16)
+
+// 16x8
+HIGHBD_SADMXN(16, 8)
+HIGHBD_SAD_MXNX4D(16, 8)
+HIGHBD_SAD_MXNX3D(16, 8)
+
+// 8x16
+HIGHBD_SADMXN(8, 16)
+HIGHBD_SAD_MXNX4D(8, 16)
+HIGHBD_SAD_MXNX3D(8, 16)
+
+// 8x8
+HIGHBD_SADMXN(8, 8)
+HIGHBD_SAD_MXNX4D(8, 8)
+HIGHBD_SAD_MXNX3D(8, 8)
+
+// 8x4
+HIGHBD_SADMXN(8, 4)
+HIGHBD_SAD_MXNX4D(8, 4)
+HIGHBD_SAD_MXNX3D(8, 4)
+
+// 4x8
+HIGHBD_SADMXN(4, 8)
+HIGHBD_SAD_MXNX4D(4, 8)
+HIGHBD_SAD_MXNX3D(4, 8)
+
+// 4x4
+HIGHBD_SADMXN(4, 4)
+HIGHBD_SAD_MXNX4D(4, 4)
+HIGHBD_SAD_MXNX3D(4, 4)
+
+HIGHBD_SADMXN(4, 16)
+HIGHBD_SAD_MXNX4D(4, 16)
+HIGHBD_SADMXN(16, 4)
+HIGHBD_SAD_MXNX4D(16, 4)
+HIGHBD_SADMXN(8, 32)
+HIGHBD_SAD_MXNX4D(8, 32)
+HIGHBD_SADMXN(32, 8)
+HIGHBD_SAD_MXNX4D(32, 8)
+HIGHBD_SADMXN(16, 64)
+HIGHBD_SAD_MXNX4D(16, 64)
+HIGHBD_SADMXN(64, 16)
+HIGHBD_SAD_MXNX4D(64, 16)
+
+#if !CONFIG_REALTIME_ONLY
+HIGHBD_SAD_MXNX3D(4, 16)
+HIGHBD_SAD_MXNX3D(16, 4)
+HIGHBD_SAD_MXNX3D(8, 32)
+HIGHBD_SAD_MXNX3D(32, 8)
+HIGHBD_SAD_MXNX3D(16, 64)
+HIGHBD_SAD_MXNX3D(64, 16)
+#endif // !CONFIG_REALTIME_ONLY
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/sad_av1.c b/third_party/aom/aom_dsp/sad_av1.c
new file mode 100644
index 0000000000..f3d5847bd5
--- /dev/null
+++ b/third_party/aom/aom_dsp/sad_av1.c
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/blend.h"
+
+static INLINE unsigned int masked_sad(const uint8_t *src, int src_stride,
+ const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *m, int m_stride, int width,
+ int height) {
+ int y, x;
+ unsigned int sad = 0;
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) {
+ const int16_t pred = AOM_BLEND_A64(m[x], a[x], b[x]);
+ sad += abs(pred - src[x]);
+ }
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+ return sad;
+}
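
[Editorial note] The kernel above first blends the two predictors with a per-pixel mask and then accumulates absolute differences against the source. Below is a self-contained sketch of the same idea; the explicit 0..64 weighted blend is an assumed stand-in for AOM_BLEND_A64, not a copy of it.

#include <stdint.h>
#include <stdlib.h>

/* Sketch of masked SAD: blend two predictors with a 0..64 mask, then SAD
 * against the source. Illustrative only. */
static unsigned int masked_sad_sketch(const uint8_t *src, const uint8_t *a,
                                      const uint8_t *b, const uint8_t *mask,
                                      int n) {
  unsigned int sad = 0;
  for (int i = 0; i < n; ++i) {
    /* Weighted average: mask = 64 selects a[], mask = 0 selects b[]. */
    const int pred = (mask[i] * a[i] + (64 - mask[i]) * b[i] + 32) >> 6;
    sad += abs(pred - src[i]);
  }
  return sad;
}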
+
+#define MASKSADMxN(m, n) \
+ unsigned int aom_masked_sad##m##x##n##_c( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
+ int invert_mask) { \
+ if (!invert_mask) \
+ return masked_sad(src, src_stride, ref, ref_stride, second_pred, m, msk, \
+ msk_stride, m, n); \
+ else \
+ return masked_sad(src, src_stride, second_pred, m, ref, ref_stride, msk, \
+ msk_stride, m, n); \
+ } \
+ void aom_masked_sad##m##x##n##x4d_c( \
+ const uint8_t *src, int src_stride, const uint8_t *ref[4], \
+ int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \
+ int msk_stride, int invert_mask, unsigned sads[4]) { \
+ if (!invert_mask) \
+ for (int i = 0; i < 4; i++) { \
+ sads[i] = masked_sad(src, src_stride, ref[i], ref_stride, second_pred, \
+ m, msk, msk_stride, m, n); \
+ } \
+ else \
+ for (int i = 0; i < 4; i++) { \
+ sads[i] = masked_sad(src, src_stride, second_pred, m, ref[i], \
+ ref_stride, msk, msk_stride, m, n); \
+ } \
+ }
+
+/* clang-format off */
+MASKSADMxN(128, 128)
+MASKSADMxN(128, 64)
+MASKSADMxN(64, 128)
+MASKSADMxN(64, 64)
+MASKSADMxN(64, 32)
+MASKSADMxN(32, 64)
+MASKSADMxN(32, 32)
+MASKSADMxN(32, 16)
+MASKSADMxN(16, 32)
+MASKSADMxN(16, 16)
+MASKSADMxN(16, 8)
+MASKSADMxN(8, 16)
+MASKSADMxN(8, 8)
+MASKSADMxN(8, 4)
+MASKSADMxN(4, 8)
+MASKSADMxN(4, 4)
+MASKSADMxN(4, 16)
+MASKSADMxN(16, 4)
+MASKSADMxN(8, 32)
+MASKSADMxN(32, 8)
+MASKSADMxN(16, 64)
+MASKSADMxN(64, 16)
+/* clang-format on */
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE unsigned int highbd_masked_sad(
+    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+    const uint8_t *b8, int b_stride, const uint8_t *m, int m_stride,
+    int width, int height) {
+ int y, x;
+ unsigned int sad = 0;
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) {
+ const uint16_t pred = AOM_BLEND_A64(m[x], a[x], b[x]);
+ sad += abs(pred - src[x]);
+ }
+
+ src += src_stride;
+ a += a_stride;
+ b += b_stride;
+ m += m_stride;
+ }
+
+ return sad;
+}
+
+#define HIGHBD_MASKSADMXN(m, n) \
+ unsigned int aom_highbd_masked_sad##m##x##n##_c( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \
+ int msk_stride, int invert_mask) { \
+ if (!invert_mask) \
+ return highbd_masked_sad(src8, src_stride, ref8, ref_stride, \
+ second_pred8, m, msk, msk_stride, m, n); \
+ else \
+ return highbd_masked_sad(src8, src_stride, second_pred8, m, ref8, \
+ ref_stride, msk, msk_stride, m, n); \
+ }
+
+HIGHBD_MASKSADMXN(128, 128)
+HIGHBD_MASKSADMXN(128, 64)
+HIGHBD_MASKSADMXN(64, 128)
+HIGHBD_MASKSADMXN(64, 64)
+HIGHBD_MASKSADMXN(64, 32)
+HIGHBD_MASKSADMXN(32, 64)
+HIGHBD_MASKSADMXN(32, 32)
+HIGHBD_MASKSADMXN(32, 16)
+HIGHBD_MASKSADMXN(16, 32)
+HIGHBD_MASKSADMXN(16, 16)
+HIGHBD_MASKSADMXN(16, 8)
+HIGHBD_MASKSADMXN(8, 16)
+HIGHBD_MASKSADMXN(8, 8)
+HIGHBD_MASKSADMXN(8, 4)
+HIGHBD_MASKSADMXN(4, 8)
+HIGHBD_MASKSADMXN(4, 4)
+HIGHBD_MASKSADMXN(4, 16)
+HIGHBD_MASKSADMXN(16, 4)
+HIGHBD_MASKSADMXN(8, 32)
+HIGHBD_MASKSADMXN(32, 8)
+HIGHBD_MASKSADMXN(16, 64)
+HIGHBD_MASKSADMXN(64, 16)
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#if !CONFIG_REALTIME_ONLY
+// pre: predictor being evaluated
+// wsrc: target weighted prediction (scaled by 4096 to preserve precision)
+// mask: 2d weights (also scaled by 4096)
+static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ int width, int height) {
+ int y, x;
+ unsigned int sad = 0;
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
+
+ pre += pre_stride;
+ wsrc += width;
+ mask += width;
+ }
+
+ return sad;
+}
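
[Editorial note] Since both wsrc and mask carry a 4096 (2^12) scale, the kernel above computes a rounded |wsrc - pre*mask| / 4096 per pixel, so a full weight of 4096 degenerates to an ordinary absolute difference. A short illustrative sketch with arbitrary values:

#include <stdint.h>
#include <stdlib.h>

/* Per-pixel OBMC SAD arithmetic (illustrative). wsrc and mask are assumed
 * to be prescaled by 4096 as described in the comments above. */
static unsigned int obmc_sad_one_pixel(int32_t wsrc, int32_t mask,
                                       uint8_t pre) {
  /* Rounded division by 4096, i.e. ROUND_POWER_OF_TWO(x, 12). */
  return (abs(wsrc - pre * mask) + 2048) >> 12;
}

/* Example: src = 100 at full weight -> wsrc = 100 * 4096, mask = 4096.
 * obmc_sad_one_pixel(100 * 4096, 4096, 97) == 3, i.e. |100 - 97|. */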
+
+#define OBMCSADMxN(m, n) \
+ unsigned int aom_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride, \
+ const int32_t *wsrc, \
+ const int32_t *mask) { \
+ return obmc_sad(ref, ref_stride, wsrc, mask, m, n); \
+ }
+
+/* clang-format off */
+OBMCSADMxN(128, 128)
+OBMCSADMxN(128, 64)
+OBMCSADMxN(64, 128)
+OBMCSADMxN(64, 64)
+OBMCSADMxN(64, 32)
+OBMCSADMxN(32, 64)
+OBMCSADMxN(32, 32)
+OBMCSADMxN(32, 16)
+OBMCSADMxN(16, 32)
+OBMCSADMxN(16, 16)
+OBMCSADMxN(16, 8)
+OBMCSADMxN(8, 16)
+OBMCSADMxN(8, 8)
+OBMCSADMxN(8, 4)
+OBMCSADMxN(4, 8)
+OBMCSADMxN(4, 4)
+OBMCSADMxN(4, 16)
+OBMCSADMxN(16, 4)
+OBMCSADMxN(8, 32)
+OBMCSADMxN(32, 8)
+OBMCSADMxN(16, 64)
+OBMCSADMxN(64, 16)
+/* clang-format on */
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride,
+                                           const int32_t *wsrc,
+                                           const int32_t *mask, int width,
+                                           int height) {
+ int y, x;
+ unsigned int sad = 0;
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++)
+ sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
+
+ pre += pre_stride;
+ wsrc += width;
+ mask += width;
+ }
+
+ return sad;
+}
+
+#define HIGHBD_OBMCSADMXN(m, n) \
+ unsigned int aom_highbd_obmc_sad##m##x##n##_c( \
+ const uint8_t *ref, int ref_stride, const int32_t *wsrc, \
+ const int32_t *mask) { \
+ return highbd_obmc_sad(ref, ref_stride, wsrc, mask, m, n); \
+ }
+
+/* clang-format off */
+HIGHBD_OBMCSADMXN(128, 128)
+HIGHBD_OBMCSADMXN(128, 64)
+HIGHBD_OBMCSADMXN(64, 128)
+HIGHBD_OBMCSADMXN(64, 64)
+HIGHBD_OBMCSADMXN(64, 32)
+HIGHBD_OBMCSADMXN(32, 64)
+HIGHBD_OBMCSADMXN(32, 32)
+HIGHBD_OBMCSADMXN(32, 16)
+HIGHBD_OBMCSADMXN(16, 32)
+HIGHBD_OBMCSADMXN(16, 16)
+HIGHBD_OBMCSADMXN(16, 8)
+HIGHBD_OBMCSADMXN(8, 16)
+HIGHBD_OBMCSADMXN(8, 8)
+HIGHBD_OBMCSADMXN(8, 4)
+HIGHBD_OBMCSADMXN(4, 8)
+HIGHBD_OBMCSADMXN(4, 4)
+HIGHBD_OBMCSADMXN(4, 16)
+HIGHBD_OBMCSADMXN(16, 4)
+HIGHBD_OBMCSADMXN(8, 32)
+HIGHBD_OBMCSADMXN(32, 8)
+HIGHBD_OBMCSADMXN(16, 64)
+HIGHBD_OBMCSADMXN(64, 16)
+/* clang-format on */
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics.h b/third_party/aom/aom_dsp/simd/v128_intrinsics.h
new file mode 100644
index 0000000000..218a7a6186
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v128_intrinsics.h
@@ -0,0 +1,346 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_
+#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/simd/v128_intrinsics_c.h"
+#include "aom_dsp/simd/v64_intrinsics.h"
+
+/* Fallback to plain, unoptimised C. */
+
+typedef c_v128 v128;
+
+SIMD_INLINE uint32_t v128_low_u32(v128 a) { return c_v128_low_u32(a); }
+SIMD_INLINE v64 v128_low_v64(v128 a) { return c_v128_low_v64(a); }
+SIMD_INLINE v64 v128_high_v64(v128 a) { return c_v128_high_v64(a); }
+SIMD_INLINE v128 v128_from_64(uint64_t hi, uint64_t lo) {
+ return c_v128_from_64(hi, lo);
+}
+SIMD_INLINE v128 v128_from_v64(v64 hi, v64 lo) {
+ return c_v128_from_v64(hi, lo);
+}
+SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+ return c_v128_from_32(a, b, c, d);
+}
+
+SIMD_INLINE v128 v128_load_unaligned(const void *p) {
+ return c_v128_load_unaligned(p);
+}
+SIMD_INLINE v128 v128_load_aligned(const void *p) {
+ return c_v128_load_aligned(p);
+}
+
+SIMD_INLINE void v128_store_unaligned(void *p, v128 a) {
+ c_v128_store_unaligned(p, a);
+}
+SIMD_INLINE void v128_store_aligned(void *p, v128 a) {
+ c_v128_store_aligned(p, a);
+}
+
+SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) {
+ return c_v128_align(a, b, c);
+}
+
+SIMD_INLINE v128 v128_zero(void) { return c_v128_zero(); }
+SIMD_INLINE v128 v128_dup_8(uint8_t x) { return c_v128_dup_8(x); }
+SIMD_INLINE v128 v128_dup_16(uint16_t x) { return c_v128_dup_16(x); }
+SIMD_INLINE v128 v128_dup_32(uint32_t x) { return c_v128_dup_32(x); }
+SIMD_INLINE v128 v128_dup_64(uint64_t x) { return c_v128_dup_64(x); }
+
+SIMD_INLINE c_sad128_internal v128_sad_u8_init(void) {
+ return c_v128_sad_u8_init();
+}
+SIMD_INLINE c_sad128_internal v128_sad_u8(c_sad128_internal s, v128 a, v128 b) {
+ return c_v128_sad_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v128_sad_u8_sum(c_sad128_internal s) {
+ return c_v128_sad_u8_sum(s);
+}
+SIMD_INLINE c_ssd128_internal v128_ssd_u8_init(void) {
+ return c_v128_ssd_u8_init();
+}
+SIMD_INLINE c_ssd128_internal v128_ssd_u8(c_ssd128_internal s, v128 a, v128 b) {
+ return c_v128_ssd_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v128_ssd_u8_sum(c_ssd128_internal s) {
+ return c_v128_ssd_u8_sum(s);
+}
+SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
+ return c_v128_dotp_su8(a, b);
+}
+SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
+ return c_v128_dotp_s16(a, b);
+}
+SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
+ return c_v128_dotp_s32(a, b);
+}
+SIMD_INLINE uint64_t v128_hadd_u8(v128 a) { return c_v128_hadd_u8(a); }
+
+SIMD_INLINE v128 v128_or(v128 a, v128 b) { return c_v128_or(a, b); }
+SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return c_v128_xor(a, b); }
+SIMD_INLINE v128 v128_and(v128 a, v128 b) { return c_v128_and(a, b); }
+SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return c_v128_andn(a, b); }
+
+SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return c_v128_add_8(a, b); }
+SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return c_v128_add_16(a, b); }
+SIMD_INLINE v128 v128_sadd_u8(v128 a, v128 b) { return c_v128_sadd_u8(a, b); }
+SIMD_INLINE v128 v128_sadd_s8(v128 a, v128 b) { return c_v128_sadd_s8(a, b); }
+SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return c_v128_sadd_s16(a, b); }
+SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return c_v128_add_32(a, b); }
+SIMD_INLINE v128 v128_add_64(v128 a, v128 b) { return c_v128_add_64(a, b); }
+SIMD_INLINE v128 v128_padd_u8(v128 a) { return c_v128_padd_u8(a); }
+SIMD_INLINE v128 v128_padd_s16(v128 a) { return c_v128_padd_s16(a); }
+SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return c_v128_sub_8(a, b); }
+SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return c_v128_ssub_u8(a, b); }
+SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return c_v128_ssub_s8(a, b); }
+SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return c_v128_sub_16(a, b); }
+SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return c_v128_ssub_s16(a, b); }
+SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return c_v128_ssub_u16(a, b); }
+SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return c_v128_sub_32(a, b); }
+SIMD_INLINE v128 v128_sub_64(v128 a, v128 b) { return c_v128_sub_64(a, b); }
+SIMD_INLINE v128 v128_abs_s16(v128 a) { return c_v128_abs_s16(a); }
+SIMD_INLINE v128 v128_abs_s8(v128 a) { return c_v128_abs_s8(a); }
+
+SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) { return c_v128_mul_s16(a, b); }
+SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
+ return c_v128_mullo_s16(a, b);
+}
+SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
+ return c_v128_mulhi_s16(a, b);
+}
+SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
+ return c_v128_mullo_s32(a, b);
+}
+SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return c_v128_madd_s16(a, b); }
+SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { return c_v128_madd_us8(a, b); }
+
+SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return c_v128_movemask_8(a); }
+SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) {
+ return c_v128_blend_8(a, b, c);
+}
+
+SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return c_v128_avg_u8(a, b); }
+SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) { return c_v128_rdavg_u8(a, b); }
+SIMD_INLINE v128 v128_rdavg_u16(v128 a, v128 b) {
+ return c_v128_rdavg_u16(a, b);
+}
+SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return c_v128_avg_u16(a, b); }
+SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return c_v128_min_u8(a, b); }
+SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return c_v128_max_u8(a, b); }
+SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) { return c_v128_min_s8(a, b); }
+SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) { return c_v128_max_s8(a, b); }
+SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return c_v128_min_s16(a, b); }
+SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return c_v128_max_s16(a, b); }
+SIMD_INLINE v128 v128_min_s32(v128 a, v128 b) { return c_v128_min_s32(a, b); }
+SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) { return c_v128_max_s32(a, b); }
+
+SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) { return c_v128_ziplo_8(a, b); }
+SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) { return c_v128_ziphi_8(a, b); }
+SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) { return c_v128_ziplo_16(a, b); }
+SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) { return c_v128_ziphi_16(a, b); }
+SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) { return c_v128_ziplo_32(a, b); }
+SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) { return c_v128_ziphi_32(a, b); }
+SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) { return c_v128_ziplo_64(a, b); }
+SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) { return c_v128_ziphi_64(a, b); }
+SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return c_v128_zip_8(a, b); }
+SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return c_v128_zip_16(a, b); }
+SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return c_v128_zip_32(a, b); }
+SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) {
+ return c_v128_unziplo_8(a, b);
+}
+SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) {
+ return c_v128_unziphi_8(a, b);
+}
+SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) {
+ return c_v128_unziplo_16(a, b);
+}
+SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) {
+ return c_v128_unziphi_16(a, b);
+}
+SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) {
+ return c_v128_unziplo_32(a, b);
+}
+SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) {
+ return c_v128_unziphi_32(a, b);
+}
+SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) { return c_v128_unpack_u8_s16(a); }
+SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) {
+ return c_v128_unpacklo_u8_s16(a);
+}
+SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) {
+ return c_v128_unpackhi_u8_s16(a);
+}
+SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) { return c_v128_unpack_s8_s16(a); }
+SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) {
+ return c_v128_unpacklo_s8_s16(a);
+}
+SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) {
+ return c_v128_unpackhi_s8_s16(a);
+}
+SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
+ return c_v128_pack_s32_s16(a, b);
+}
+SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) {
+ return c_v128_pack_s32_u16(a, b);
+}
+SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
+ return c_v128_pack_s16_u8(a, b);
+}
+SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) {
+ return c_v128_pack_s16_s8(a, b);
+}
+SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) { return c_v128_unpack_u16_s32(a); }
+SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) { return c_v128_unpack_s16_s32(a); }
+SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) {
+ return c_v128_unpacklo_u16_s32(a);
+}
+SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) {
+ return c_v128_unpacklo_s16_s32(a);
+}
+SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) {
+ return c_v128_unpackhi_u16_s32(a);
+}
+SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
+ return c_v128_unpackhi_s16_s32(a);
+}
+SIMD_INLINE v128 v128_shuffle_8(v128 a, v128 pattern) {
+ return c_v128_shuffle_8(a, pattern);
+}
+
+SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return c_v128_cmpgt_s8(a, b); }
+SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return c_v128_cmplt_s8(a, b); }
+SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return c_v128_cmpeq_8(a, b); }
+SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) {
+ return c_v128_cmpgt_s16(a, b);
+}
+SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) {
+ return c_v128_cmplt_s16(a, b);
+}
+SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return c_v128_cmpeq_16(a, b); }
+
+SIMD_INLINE v128 v128_cmpgt_s32(v128 a, v128 b) {
+ return c_v128_cmpgt_s32(a, b);
+}
+SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) {
+ return c_v128_cmplt_s32(a, b);
+}
+SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return c_v128_cmpeq_32(a, b); }
+
+SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
+ return c_v128_shl_8(a, c);
+}
+SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
+ return c_v128_shr_u8(a, c);
+}
+SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
+ return c_v128_shr_s8(a, c);
+}
+SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
+ return c_v128_shl_16(a, c);
+}
+SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) {
+ return c_v128_shr_u16(a, c);
+}
+SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
+ return c_v128_shr_s16(a, c);
+}
+SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) {
+ return c_v128_shl_32(a, c);
+}
+SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
+ return c_v128_shr_u32(a, c);
+}
+SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
+ return c_v128_shr_s32(a, c);
+}
+SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) {
+ return c_v128_shl_64(a, c);
+}
+SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) {
+ return c_v128_shr_u64(a, c);
+}
+SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
+ return c_v128_shr_s64(a, c);
+}
+
+SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) {
+ return c_v128_shr_n_byte(a, n);
+}
+SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) {
+ return c_v128_shl_n_byte(a, n);
+}
+SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int n) {
+ return c_v128_shl_n_8(a, n);
+}
+SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int n) {
+ return c_v128_shl_n_16(a, n);
+}
+SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int n) {
+ return c_v128_shl_n_32(a, n);
+}
+SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int n) {
+ return c_v128_shl_n_64(a, n);
+}
+SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int n) {
+ return c_v128_shr_n_u8(a, n);
+}
+SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int n) {
+ return c_v128_shr_n_u16(a, n);
+}
+SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int n) {
+ return c_v128_shr_n_u32(a, n);
+}
+SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int n) {
+ return c_v128_shr_n_u64(a, n);
+}
+SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int n) {
+ return c_v128_shr_n_s8(a, n);
+}
+SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int n) {
+ return c_v128_shr_n_s16(a, n);
+}
+SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int n) {
+ return c_v128_shr_n_s32(a, n);
+}
+SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int n) {
+ return c_v128_shr_n_s64(a, n);
+}
+
+typedef uint32_t sad128_internal_u16;
+SIMD_INLINE sad128_internal_u16 v128_sad_u16_init(void) {
+ return c_v128_sad_u16_init();
+}
+SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a,
+ v128 b) {
+ return c_v128_sad_u16(s, a, b);
+}
+SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) {
+ return c_v128_sad_u16_sum(s);
+}
+
+typedef uint64_t ssd128_internal_s16;
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init(void) {
+ return c_v128_ssd_s16_init();
+}
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
+ v128 b) {
+ return c_v128_ssd_s16(s, a, b);
+}
+SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
+ return c_v128_ssd_s16_sum(s);
+}
+
+#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_H_
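
[Editorial note] A short usage sketch of the accumulator pattern declared above: initialise once, feed vector pairs (at most 32 per accumulator for the 8-bit SAD), then finalise. Illustrative only; it assumes this header is on the include path and uses the c_sad128_internal type the generic header returns.

#include <stdint.h>
#include "aom_dsp/simd/v128_intrinsics.h"

/* SAD over two 32-byte buffers using the v128 wrappers above. */
static uint32_t v128_sad_example(const uint8_t a[32], const uint8_t b[32]) {
  c_sad128_internal s = v128_sad_u8_init(); /* reset the accumulator */
  s = v128_sad_u8(s, v128_load_unaligned(a), v128_load_unaligned(b));
  s = v128_sad_u8(s, v128_load_unaligned(a + 16), v128_load_unaligned(b + 16));
  return v128_sad_u8_sum(s); /* finalise: total absolute difference */
}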
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h
new file mode 100644
index 0000000000..f5ca817fb6
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_c.h
@@ -0,0 +1,898 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_
+#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/simd/v64_intrinsics_c.h"
+
+typedef union {
+ uint8_t u8[16];
+ uint16_t u16[8];
+ uint32_t u32[4];
+ uint64_t u64[2];
+ int8_t s8[16];
+ int16_t s16[8];
+ int32_t s32[4];
+ int64_t s64[2];
+ c_v64 v64[2];
+} c_v128;
+
+SIMD_INLINE uint32_t c_v128_low_u32(c_v128 a) { return a.u32[0]; }
+
+SIMD_INLINE c_v64 c_v128_low_v64(c_v128 a) { return a.v64[0]; }
+
+SIMD_INLINE c_v64 c_v128_high_v64(c_v128 a) { return a.v64[1]; }
+
+SIMD_INLINE c_v128 c_v128_from_64(uint64_t hi, uint64_t lo) {
+ c_v128 t;
+ t.u64[1] = hi;
+ t.u64[0] = lo;
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_from_v64(c_v64 hi, c_v64 lo) {
+ c_v128 t;
+ t.v64[1] = hi;
+ t.v64[0] = lo;
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_from_32(uint32_t a, uint32_t b, uint32_t c,
+ uint32_t d) {
+ c_v128 t;
+ t.u32[3] = a;
+ t.u32[2] = b;
+ t.u32[1] = c;
+ t.u32[0] = d;
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_load_unaligned(const void *p) {
+ c_v128 t;
+ memcpy(&t, p, 16);
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_load_aligned(const void *p) {
+ if (SIMD_CHECK && (uintptr_t)p & 15) {
+ fprintf(stderr, "Error: unaligned v128 load at %p\n", p);
+ abort();
+ }
+ return c_v128_load_unaligned(p);
+}
+
+SIMD_INLINE void c_v128_store_unaligned(void *p, c_v128 a) {
+ memcpy(p, &a, 16);
+}
+
+SIMD_INLINE void c_v128_store_aligned(void *p, c_v128 a) {
+ if (SIMD_CHECK && (uintptr_t)p & 15) {
+ fprintf(stderr, "Error: unaligned v128 store at %p\n", p);
+ abort();
+ }
+ c_v128_store_unaligned(p, a);
+}
+
+SIMD_INLINE c_v128 c_v128_zero(void) {
+ c_v128 t;
+ t.u64[1] = t.u64[0] = 0;
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_dup_8(uint8_t x) {
+ c_v128 t;
+ t.v64[1] = t.v64[0] = c_v64_dup_8(x);
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_dup_16(uint16_t x) {
+ c_v128 t;
+ t.v64[1] = t.v64[0] = c_v64_dup_16(x);
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_dup_32(uint32_t x) {
+ c_v128 t;
+ t.v64[1] = t.v64[0] = c_v64_dup_32(x);
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_dup_64(uint64_t x) {
+ c_v128 t;
+ t.u64[1] = t.u64[0] = x;
+ return t;
+}
+
+SIMD_INLINE int64_t c_v128_dotp_su8(c_v128 a, c_v128 b) {
+ return c_v64_dotp_su8(a.v64[1], b.v64[1]) +
+ c_v64_dotp_su8(a.v64[0], b.v64[0]);
+}
+
+SIMD_INLINE int64_t c_v128_dotp_s16(c_v128 a, c_v128 b) {
+ return c_v64_dotp_s16(a.v64[1], b.v64[1]) +
+ c_v64_dotp_s16(a.v64[0], b.v64[0]);
+}
+
+SIMD_INLINE int64_t c_v128_dotp_s32(c_v128 a, c_v128 b) {
+ // 32 bit products, 64 bit sum
+ return (int64_t)(int32_t)((int64_t)a.s32[3] * b.s32[3]) +
+ (int64_t)(int32_t)((int64_t)a.s32[2] * b.s32[2]) +
+ (int64_t)(int32_t)((int64_t)a.s32[1] * b.s32[1]) +
+ (int64_t)(int32_t)((int64_t)a.s32[0] * b.s32[0]);
+}
+
+SIMD_INLINE uint64_t c_v128_hadd_u8(c_v128 a) {
+ return c_v64_hadd_u8(a.v64[1]) + c_v64_hadd_u8(a.v64[0]);
+}
+
+typedef struct {
+ uint32_t val;
+ int count;
+} c_sad128_internal;
+
+SIMD_INLINE c_sad128_internal c_v128_sad_u8_init(void) {
+ c_sad128_internal t;
+ t.val = t.count = 0;
+ return t;
+}
+
+/* Implementation dependent return value. Result must be finalised with
+ * v128_sad_u8_sum(). The result for more than 32 v128_sad_u8() calls is
+ * undefined. */
+SIMD_INLINE c_sad128_internal c_v128_sad_u8(c_sad128_internal s, c_v128 a,
+ c_v128 b) {
+ int c;
+ for (c = 0; c < 16; c++)
+ s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
+ s.count++;
+ if (SIMD_CHECK && s.count > 32) {
+    fprintf(stderr,
+            "Error: sad called more than 32 times, result undefined\n");
+ abort();
+ }
+ return s;
+}
+
+SIMD_INLINE uint32_t c_v128_sad_u8_sum(c_sad128_internal s) { return s.val; }
+
+typedef uint32_t c_ssd128_internal;
+
+SIMD_INLINE c_ssd128_internal c_v128_ssd_u8_init(void) { return 0; }
+
+/* Implementation dependent return value. Result must be finalised with
+ * v128_ssd_u8_sum(). */
+SIMD_INLINE c_ssd128_internal c_v128_ssd_u8(c_ssd128_internal s, c_v128 a,
+ c_v128 b) {
+ int c;
+ for (c = 0; c < 16; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
+ return s;
+}
+
+SIMD_INLINE uint32_t c_v128_ssd_u8_sum(c_ssd128_internal s) { return s; }
+
+SIMD_INLINE c_v128 c_v128_or(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_or(a.v64[1], b.v64[1]),
+ c_v64_or(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_xor(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_xor(a.v64[1], b.v64[1]),
+ c_v64_xor(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_and(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_and(a.v64[1], b.v64[1]),
+ c_v64_and(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_andn(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_andn(a.v64[1], b.v64[1]),
+ c_v64_andn(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_add_8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_add_8(a.v64[1], b.v64[1]),
+ c_v64_add_8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_add_16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_add_16(a.v64[1], b.v64[1]),
+ c_v64_add_16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_sadd_u8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_sadd_u8(a.v64[1], b.v64[1]),
+ c_v64_sadd_u8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_sadd_s8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_sadd_s8(a.v64[1], b.v64[1]),
+ c_v64_sadd_s8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_sadd_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_sadd_s16(a.v64[1], b.v64[1]),
+ c_v64_sadd_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_add_32(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_add_32(a.v64[1], b.v64[1]),
+ c_v64_add_32(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_add_64(c_v128 a, c_v128 b) {
+  // Two's complement overflow (silences sanitizers)
+ return c_v128_from_64(
+ a.v64[1].u64 > ~b.v64[1].u64 ? a.v64[1].u64 - ~b.v64[1].u64 - 1
+ : a.v64[1].u64 + b.v64[1].u64,
+ a.v64[0].u64 > ~b.v64[0].u64 ? a.v64[0].u64 - ~b.v64[0].u64 - 1
+ : a.v64[0].u64 + b.v64[0].u64);
+}
+
+SIMD_INLINE c_v128 c_v128_padd_s16(c_v128 a) {
+ c_v128 t;
+ t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1];
+ t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3];
+ t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5];
+ t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7];
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_padd_u8(c_v128 a) {
+ c_v128 t;
+ t.u16[0] = (uint16_t)a.u8[0] + (uint16_t)a.u8[1];
+ t.u16[1] = (uint16_t)a.u8[2] + (uint16_t)a.u8[3];
+ t.u16[2] = (uint16_t)a.u8[4] + (uint16_t)a.u8[5];
+ t.u16[3] = (uint16_t)a.u8[6] + (uint16_t)a.u8[7];
+ t.u16[4] = (uint16_t)a.u8[8] + (uint16_t)a.u8[9];
+ t.u16[5] = (uint16_t)a.u8[10] + (uint16_t)a.u8[11];
+ t.u16[6] = (uint16_t)a.u8[12] + (uint16_t)a.u8[13];
+ t.u16[7] = (uint16_t)a.u8[14] + (uint16_t)a.u8[15];
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_sub_8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_sub_8(a.v64[1], b.v64[1]),
+ c_v64_sub_8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ssub_u8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ssub_u8(a.v64[1], b.v64[1]),
+ c_v64_ssub_u8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ssub_s8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ssub_s8(a.v64[1], b.v64[1]),
+ c_v64_ssub_s8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_sub_16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_sub_16(a.v64[1], b.v64[1]),
+ c_v64_sub_16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ssub_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ssub_s16(a.v64[1], b.v64[1]),
+ c_v64_ssub_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ssub_u16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ssub_u16(a.v64[1], b.v64[1]),
+ c_v64_ssub_u16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_sub_32(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_sub_32(a.v64[1], b.v64[1]),
+ c_v64_sub_32(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_sub_64(c_v128 a, c_v128 b) {
+  // Two's complement underflow (silences sanitizers)
+ return c_v128_from_64(
+ a.v64[1].u64 < b.v64[1].u64 ? a.v64[1].u64 + ~b.v64[1].u64 + 1
+ : a.v64[1].u64 - b.v64[1].u64,
+ a.v64[0].u64 < b.v64[0].u64 ? a.v64[0].u64 + ~b.v64[0].u64 + 1
+ : a.v64[0].u64 - b.v64[0].u64);
+}
+
+SIMD_INLINE c_v128 c_v128_abs_s16(c_v128 a) {
+ return c_v128_from_v64(c_v64_abs_s16(a.v64[1]), c_v64_abs_s16(a.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_abs_s8(c_v128 a) {
+ return c_v128_from_v64(c_v64_abs_s8(a.v64[1]), c_v64_abs_s8(a.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_mul_s16(c_v64 a, c_v64 b) {
+ c_v64 lo_bits = c_v64_mullo_s16(a, b);
+ c_v64 hi_bits = c_v64_mulhi_s16(a, b);
+ return c_v128_from_v64(c_v64_ziphi_16(hi_bits, lo_bits),
+ c_v64_ziplo_16(hi_bits, lo_bits));
+}
+
+SIMD_INLINE c_v128 c_v128_mullo_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_mullo_s16(a.v64[1], b.v64[1]),
+ c_v64_mullo_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_mulhi_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_mulhi_s16(a.v64[1], b.v64[1]),
+ c_v64_mulhi_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_mullo_s32(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_mullo_s32(a.v64[1], b.v64[1]),
+ c_v64_mullo_s32(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_madd_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_madd_s16(a.v64[1], b.v64[1]),
+ c_v64_madd_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_madd_us8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_madd_us8(a.v64[1], b.v64[1]),
+ c_v64_madd_us8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_avg_u8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_avg_u8(a.v64[1], b.v64[1]),
+ c_v64_avg_u8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_rdavg_u8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_rdavg_u8(a.v64[1], b.v64[1]),
+ c_v64_rdavg_u8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_rdavg_u16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_rdavg_u16(a.v64[1], b.v64[1]),
+ c_v64_rdavg_u16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_avg_u16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_avg_u16(a.v64[1], b.v64[1]),
+ c_v64_avg_u16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_min_u8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_min_u8(a.v64[1], b.v64[1]),
+ c_v64_min_u8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_max_u8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_max_u8(a.v64[1], b.v64[1]),
+ c_v64_max_u8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_min_s8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_min_s8(a.v64[1], b.v64[1]),
+ c_v64_min_s8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE uint32_t c_v128_movemask_8(c_v128 a) {
+ return ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) |
+ ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) |
+ ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) |
+ ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) |
+ ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) |
+ ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) |
+ ((a.s8[0] < 0) << 0);
+}
+
+SIMD_INLINE c_v128 c_v128_blend_8(c_v128 a, c_v128 b, c_v128 c) {
+ c_v128 t;
+ for (int i = 0; i < 16; i++) t.u8[i] = c.s8[i] < 0 ? b.u8[i] : a.u8[i];
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_max_s8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_max_s8(a.v64[1], b.v64[1]),
+ c_v64_max_s8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_min_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_min_s16(a.v64[1], b.v64[1]),
+ c_v64_min_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_max_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_max_s16(a.v64[1], b.v64[1]),
+ c_v64_max_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_max_s32(c_v128 a, c_v128 b) {
+ c_v128 t;
+ int c;
+ for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? a.s32[c] : b.s32[c];
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_min_s32(c_v128 a, c_v128 b) {
+ c_v128 t;
+ int c;
+ for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? b.s32[c] : a.s32[c];
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_ziplo_8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ziphi_8(a.v64[0], b.v64[0]),
+ c_v64_ziplo_8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ziphi_8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ziphi_8(a.v64[1], b.v64[1]),
+ c_v64_ziplo_8(a.v64[1], b.v64[1]));
+}
+
+SIMD_INLINE c_v128 c_v128_ziplo_16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ziphi_16(a.v64[0], b.v64[0]),
+ c_v64_ziplo_16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ziphi_16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ziphi_16(a.v64[1], b.v64[1]),
+ c_v64_ziplo_16(a.v64[1], b.v64[1]));
+}
+
+SIMD_INLINE c_v128 c_v128_ziplo_32(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ziphi_32(a.v64[0], b.v64[0]),
+ c_v64_ziplo_32(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_ziphi_32(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_ziphi_32(a.v64[1], b.v64[1]),
+ c_v64_ziplo_32(a.v64[1], b.v64[1]));
+}
+
+SIMD_INLINE c_v128 c_v128_ziplo_64(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(a.v64[0], b.v64[0]);
+}
+
+SIMD_INLINE c_v128 c_v128_ziphi_64(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(a.v64[1], b.v64[1]);
+}
+
+SIMD_INLINE c_v128 c_v128_zip_8(c_v64 a, c_v64 b) {
+ return c_v128_from_v64(c_v64_ziphi_8(a, b), c_v64_ziplo_8(a, b));
+}
+
+SIMD_INLINE c_v128 c_v128_zip_16(c_v64 a, c_v64 b) {
+ return c_v128_from_v64(c_v64_ziphi_16(a, b), c_v64_ziplo_16(a, b));
+}
+
+SIMD_INLINE c_v128 c_v128_zip_32(c_v64 a, c_v64 b) {
+ return c_v128_from_v64(c_v64_ziphi_32(a, b), c_v64_ziplo_32(a, b));
+}
+
+SIMD_INLINE c_v128 _c_v128_unzip_8(c_v128 a, c_v128 b, int mode) {
+ c_v128 t;
+ if (mode) {
+ t.u8[15] = b.u8[15];
+ t.u8[14] = b.u8[13];
+ t.u8[13] = b.u8[11];
+ t.u8[12] = b.u8[9];
+ t.u8[11] = b.u8[7];
+ t.u8[10] = b.u8[5];
+ t.u8[9] = b.u8[3];
+ t.u8[8] = b.u8[1];
+ t.u8[7] = a.u8[15];
+ t.u8[6] = a.u8[13];
+ t.u8[5] = a.u8[11];
+ t.u8[4] = a.u8[9];
+ t.u8[3] = a.u8[7];
+ t.u8[2] = a.u8[5];
+ t.u8[1] = a.u8[3];
+ t.u8[0] = a.u8[1];
+ } else {
+ t.u8[15] = a.u8[14];
+ t.u8[14] = a.u8[12];
+ t.u8[13] = a.u8[10];
+ t.u8[12] = a.u8[8];
+ t.u8[11] = a.u8[6];
+ t.u8[10] = a.u8[4];
+ t.u8[9] = a.u8[2];
+ t.u8[8] = a.u8[0];
+ t.u8[7] = b.u8[14];
+ t.u8[6] = b.u8[12];
+ t.u8[5] = b.u8[10];
+ t.u8[4] = b.u8[8];
+ t.u8[3] = b.u8[6];
+ t.u8[2] = b.u8[4];
+ t.u8[1] = b.u8[2];
+ t.u8[0] = b.u8[0];
+ }
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_unziplo_8(c_v128 a, c_v128 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(a, b, 1)
+ : _c_v128_unzip_8(a, b, 0);
+}
+
+SIMD_INLINE c_v128 c_v128_unziphi_8(c_v128 a, c_v128 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v128_unzip_8(b, a, 0)
+ : _c_v128_unzip_8(b, a, 1);
+}
+
+SIMD_INLINE c_v128 _c_v128_unzip_16(c_v128 a, c_v128 b, int mode) {
+ c_v128 t;
+ if (mode) {
+ t.u16[7] = b.u16[7];
+ t.u16[6] = b.u16[5];
+ t.u16[5] = b.u16[3];
+ t.u16[4] = b.u16[1];
+ t.u16[3] = a.u16[7];
+ t.u16[2] = a.u16[5];
+ t.u16[1] = a.u16[3];
+ t.u16[0] = a.u16[1];
+ } else {
+ t.u16[7] = a.u16[6];
+ t.u16[6] = a.u16[4];
+ t.u16[5] = a.u16[2];
+ t.u16[4] = a.u16[0];
+ t.u16[3] = b.u16[6];
+ t.u16[2] = b.u16[4];
+ t.u16[1] = b.u16[2];
+ t.u16[0] = b.u16[0];
+ }
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_unziplo_16(c_v128 a, c_v128 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(a, b, 1)
+ : _c_v128_unzip_16(a, b, 0);
+}
+
+SIMD_INLINE c_v128 c_v128_unziphi_16(c_v128 a, c_v128 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v128_unzip_16(b, a, 0)
+ : _c_v128_unzip_16(b, a, 1);
+}
+
+SIMD_INLINE c_v128 _c_v128_unzip_32(c_v128 a, c_v128 b, int mode) {
+ c_v128 t;
+ if (mode) {
+ t.u32[3] = b.u32[3];
+ t.u32[2] = b.u32[1];
+ t.u32[1] = a.u32[3];
+ t.u32[0] = a.u32[1];
+ } else {
+ t.u32[3] = a.u32[2];
+ t.u32[2] = a.u32[0];
+ t.u32[1] = b.u32[2];
+ t.u32[0] = b.u32[0];
+ }
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_unziplo_32(c_v128 a, c_v128 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(a, b, 1)
+ : _c_v128_unzip_32(a, b, 0);
+}
+
+SIMD_INLINE c_v128 c_v128_unziphi_32(c_v128 a, c_v128 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v128_unzip_32(b, a, 0)
+ : _c_v128_unzip_32(b, a, 1);
+}
+
+SIMD_INLINE c_v128 c_v128_unpack_u8_s16(c_v64 a) {
+ return c_v128_from_v64(c_v64_unpackhi_u8_s16(a), c_v64_unpacklo_u8_s16(a));
+}
+
+SIMD_INLINE c_v128 c_v128_unpacklo_u8_s16(c_v128 a) {
+ return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[0]),
+ c_v64_unpacklo_u8_s16(a.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_unpackhi_u8_s16(c_v128 a) {
+ return c_v128_from_v64(c_v64_unpackhi_u8_s16(a.v64[1]),
+ c_v64_unpacklo_u8_s16(a.v64[1]));
+}
+
+SIMD_INLINE c_v128 c_v128_unpack_s8_s16(c_v64 a) {
+ return c_v128_from_v64(c_v64_unpackhi_s8_s16(a), c_v64_unpacklo_s8_s16(a));
+}
+
+SIMD_INLINE c_v128 c_v128_unpacklo_s8_s16(c_v128 a) {
+ return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[0]),
+ c_v64_unpacklo_s8_s16(a.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_unpackhi_s8_s16(c_v128 a) {
+ return c_v128_from_v64(c_v64_unpackhi_s8_s16(a.v64[1]),
+ c_v64_unpacklo_s8_s16(a.v64[1]));
+}
+
+SIMD_INLINE c_v128 c_v128_pack_s32_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_pack_s32_s16(a.v64[1], a.v64[0]),
+ c_v64_pack_s32_s16(b.v64[1], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_pack_s32_u16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_pack_s32_u16(a.v64[1], a.v64[0]),
+ c_v64_pack_s32_u16(b.v64[1], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_pack_s16_u8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_pack_s16_u8(a.v64[1], a.v64[0]),
+ c_v64_pack_s16_u8(b.v64[1], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_pack_s16_s8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_pack_s16_s8(a.v64[1], a.v64[0]),
+ c_v64_pack_s16_s8(b.v64[1], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_unpack_u16_s32(c_v64 a) {
+ return c_v128_from_v64(c_v64_unpackhi_u16_s32(a), c_v64_unpacklo_u16_s32(a));
+}
+
+SIMD_INLINE c_v128 c_v128_unpack_s16_s32(c_v64 a) {
+ return c_v128_from_v64(c_v64_unpackhi_s16_s32(a), c_v64_unpacklo_s16_s32(a));
+}
+
+SIMD_INLINE c_v128 c_v128_unpacklo_u16_s32(c_v128 a) {
+ return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[0]),
+ c_v64_unpacklo_u16_s32(a.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_unpacklo_s16_s32(c_v128 a) {
+ return c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[0]),
+ c_v64_unpacklo_s16_s32(a.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_unpackhi_u16_s32(c_v128 a) {
+ return c_v128_from_v64(c_v64_unpackhi_u16_s32(a.v64[1]),
+ c_v64_unpacklo_u16_s32(a.v64[1]));
+}
+
+SIMD_INLINE c_v128 c_v128_unpackhi_s16_s32(c_v128 a) {
+ return c_v128_from_v64(c_v64_unpackhi_s16_s32(a.v64[1]),
+ c_v64_unpacklo_s16_s32(a.v64[1]));
+}
+
+SIMD_INLINE c_v128 c_v128_shuffle_8(c_v128 a, c_v128 pattern) {
+ c_v128 t;
+ int c;
+ for (c = 0; c < 16; c++)
+ t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 15 - (pattern.u8[c] & 15)
+ : pattern.u8[c] & 15];
+
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_cmpgt_s8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_cmpgt_s8(a.v64[1], b.v64[1]),
+ c_v64_cmpgt_s8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_cmplt_s8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_cmplt_s8(a.v64[1], b.v64[1]),
+ c_v64_cmplt_s8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_cmpeq_8(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_cmpeq_8(a.v64[1], b.v64[1]),
+ c_v64_cmpeq_8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_cmpgt_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_cmpgt_s16(a.v64[1], b.v64[1]),
+ c_v64_cmpgt_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_cmplt_s16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_cmplt_s16(a.v64[1], b.v64[1]),
+ c_v64_cmplt_s16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_cmpeq_16(c_v128 a, c_v128 b) {
+ return c_v128_from_v64(c_v64_cmpeq_16(a.v64[1], b.v64[1]),
+ c_v64_cmpeq_16(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_cmpgt_s32(c_v128 a, c_v128 b) {
+ c_v128 t;
+ int c;
+ for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] > b.s32[c]);
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_cmplt_s32(c_v128 a, c_v128 b) {
+ c_v128 t;
+ int c;
+ for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] < b.s32[c]);
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_cmpeq_32(c_v128 a, c_v128 b) {
+ c_v128 t;
+ int c;
+ for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] == b.s32[c]);
+ return t;
+}
+
+SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, const unsigned int n) {
+ if (n == 0) return a;
+ if (n < 8)
+ return c_v128_from_v64(c_v64_or(c_v64_shl_n_byte(a.v64[1], n),
+ c_v64_shr_n_byte(a.v64[0], 8 - n)),
+ c_v64_shl_n_byte(a.v64[0], n));
+ else
+ return c_v128_from_v64(c_v64_shl_n_byte(a.v64[0], n - 8), c_v64_zero());
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, const unsigned int n) {
+ if (n == 0) return a;
+ if (n < 8)
+ return c_v128_from_v64(c_v64_shr_n_byte(a.v64[1], n),
+ c_v64_or(c_v64_shr_n_byte(a.v64[0], n),
+ c_v64_shl_n_byte(a.v64[1], 8 - n)));
+ else
+ return c_v128_from_v64(c_v64_zero(), c_v64_shr_n_byte(a.v64[1], n - 8));
+}
+
+SIMD_INLINE c_v128 c_v128_align(c_v128 a, c_v128 b, const unsigned int c) {
+ if (SIMD_CHECK && c > 15) {
+ fprintf(stderr, "Error: undefined alignment %d\n", c);
+ abort();
+ }
+ return c ? c_v128_or(c_v128_shr_n_byte(b, c), c_v128_shl_n_byte(a, 16 - c))
+ : b;
+}
+
+SIMD_INLINE c_v128 c_v128_shl_8(c_v128 a, const unsigned int c) {
+ return c_v128_from_v64(c_v64_shl_8(a.v64[1], c), c_v64_shl_8(a.v64[0], c));
+}
+
+SIMD_INLINE c_v128 c_v128_shr_u8(c_v128 a, const unsigned int c) {
+ return c_v128_from_v64(c_v64_shr_u8(a.v64[1], c), c_v64_shr_u8(a.v64[0], c));
+}
+
+SIMD_INLINE c_v128 c_v128_shr_s8(c_v128 a, const unsigned int c) {
+ return c_v128_from_v64(c_v64_shr_s8(a.v64[1], c), c_v64_shr_s8(a.v64[0], c));
+}
+
+SIMD_INLINE c_v128 c_v128_shl_16(c_v128 a, const unsigned int c) {
+ return c_v128_from_v64(c_v64_shl_16(a.v64[1], c), c_v64_shl_16(a.v64[0], c));
+}
+
+SIMD_INLINE c_v128 c_v128_shr_u16(c_v128 a, const unsigned int c) {
+ return c_v128_from_v64(c_v64_shr_u16(a.v64[1], c),
+ c_v64_shr_u16(a.v64[0], c));
+}
+
+SIMD_INLINE c_v128 c_v128_shr_s16(c_v128 a, const unsigned int c) {
+ return c_v128_from_v64(c_v64_shr_s16(a.v64[1], c),
+ c_v64_shr_s16(a.v64[0], c));
+}
+
+SIMD_INLINE c_v128 c_v128_shl_32(c_v128 a, const unsigned int c) {
+ return c_v128_from_v64(c_v64_shl_32(a.v64[1], c), c_v64_shl_32(a.v64[0], c));
+}
+
+SIMD_INLINE c_v128 c_v128_shr_u32(c_v128 a, const unsigned int c) {
+ return c_v128_from_v64(c_v64_shr_u32(a.v64[1], c),
+ c_v64_shr_u32(a.v64[0], c));
+}
+
+SIMD_INLINE c_v128 c_v128_shr_s32(c_v128 a, const unsigned int c) {
+ return c_v128_from_v64(c_v64_shr_s32(a.v64[1], c),
+ c_v64_shr_s32(a.v64[0], c));
+}
+
+SIMD_INLINE c_v128 c_v128_shl_64(c_v128 a, const unsigned int c) {
+ a.v64[1].u64 <<= c;
+ a.v64[0].u64 <<= c;
+ return c_v128_from_v64(a.v64[1], a.v64[0]);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_u64(c_v128 a, const unsigned int c) {
+ a.v64[1].u64 >>= c;
+ a.v64[0].u64 >>= c;
+ return c_v128_from_v64(a.v64[1], a.v64[0]);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_s64(c_v128 a, const unsigned int c) {
+ a.v64[1].s64 >>= c;
+ a.v64[0].s64 >>= c;
+ return c_v128_from_v64(a.v64[1], a.v64[0]);
+}
+
+SIMD_INLINE c_v128 c_v128_shl_n_8(c_v128 a, const unsigned int n) {
+ return c_v128_shl_8(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shl_n_16(c_v128 a, const unsigned int n) {
+ return c_v128_shl_16(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shl_n_32(c_v128 a, const unsigned int n) {
+ return c_v128_shl_32(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shl_n_64(c_v128 a, const unsigned int n) {
+ return c_v128_shl_64(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_u8(c_v128 a, const unsigned int n) {
+ return c_v128_shr_u8(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_u16(c_v128 a, const unsigned int n) {
+ return c_v128_shr_u16(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_u32(c_v128 a, const unsigned int n) {
+ return c_v128_shr_u32(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_u64(c_v128 a, const unsigned int n) {
+ return c_v128_shr_u64(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_s8(c_v128 a, const unsigned int n) {
+ return c_v128_shr_s8(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_s16(c_v128 a, const unsigned int n) {
+ return c_v128_shr_s16(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_s32(c_v128 a, const unsigned int n) {
+ return c_v128_shr_s32(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_s64(c_v128 a, const unsigned int n) {
+ return c_v128_shr_s64(a, n);
+}
+
+typedef uint32_t c_sad128_internal_u16;
+
+SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16_init(void) { return 0; }
+
+/* Implementation dependent return value. Result must be finalised with
+ * v128_sad_u16_sum(). */
+SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16(c_sad128_internal_u16 s,
+ c_v128 a, c_v128 b) {
+ int c;
+ for (c = 0; c < 8; c++)
+ s += a.u16[c] > b.u16[c] ? a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c];
+ return s;
+}
+
+SIMD_INLINE uint32_t c_v128_sad_u16_sum(c_sad128_internal_u16 s) { return s; }
+
+typedef uint64_t c_ssd128_internal_s16;
+
+SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16_init(void) { return 0; }
+
+/* Implementation dependent return value. Result must be finalised with
+ * v128_ssd_s16_sum(). */
+SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16(c_ssd128_internal_s16 s,
+ c_v128 a, c_v128 b) {
+ int c;
+ for (c = 0; c < 8; c++)
+ s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) *
+ (int32_t)(int16_t)(a.s16[c] - b.s16[c]);
+ return s;
+}
+
+SIMD_INLINE uint64_t c_v128_ssd_s16_sum(c_ssd128_internal_s16 s) { return s; }
+
+#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_C_H_
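Because this fallback stores a vector in a plain union, lanes can be built and inspected directly, and the aligned load/store variants verify alignment at run time when SIMD_CHECK is enabled. A minimal sketch (illustrative only, not part of the upstream diff; the function name c_v128_demo is hypothetical):

static void c_v128_demo(void) {
  uint8_t out[16];
  c_v128 x = c_v128_dup_8(3);
  c_v128 y = c_v128_from_32(0, 0, 0, 0x01020304);  // u32 lanes given high to low
  c_v128 z = c_v128_add_8(x, y);                   // lane-wise 8-bit addition
  c_v128_store_unaligned(out, z);                  // no alignment requirement
  // c_v128_store_aligned(out, z) would abort under SIMD_CHECK if out were not
  // 16-byte aligned.
}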
diff --git a/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h
new file mode 100644
index 0000000000..d20f979dd9
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v128_intrinsics_x86.h
@@ -0,0 +1,659 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_
+#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_
+
+#include <stdint.h>
+#include "aom_dsp/simd/v64_intrinsics_x86.h"
+
+typedef __m128i v128;
+
+SIMD_INLINE uint32_t v128_low_u32(v128 a) {
+ return (uint32_t)_mm_cvtsi128_si32(a);
+}
+
+SIMD_INLINE v64 v128_low_v64(v128 a) {
+ return _mm_unpacklo_epi64(a, v64_zero());
+}
+
+SIMD_INLINE v64 v128_high_v64(v128 a) { return _mm_srli_si128(a, 8); }
+
+SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) {
+ return _mm_unpacklo_epi64(b, a);
+}
+
+SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) {
+ return v128_from_v64(v64_from_64(a), v64_from_64(b));
+}
+
+SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
+ return _mm_set_epi32((int)a, (int)b, (int)c, (int)d);
+}
+
+SIMD_INLINE v128 v128_load_aligned(const void *p) {
+ return _mm_load_si128((__m128i *)p);
+}
+
+SIMD_INLINE v128 v128_load_unaligned(const void *p) {
+#if defined(__SSSE3__)
+ return _mm_lddqu_si128((__m128i *)p);
+#else
+ return _mm_loadu_si128((__m128i *)p);
+#endif
+}
+
+SIMD_INLINE void v128_store_aligned(void *p, v128 a) {
+ _mm_store_si128((__m128i *)p, a);
+}
+
+SIMD_INLINE void v128_store_unaligned(void *p, v128 a) {
+ _mm_storeu_si128((__m128i *)p, a);
+}
+
+// The following function requires an immediate.
+// Some compilers will check this during optimisation, others won't.
+#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
+#if defined(__SSSE3__)
+SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) {
+ return c ? _mm_alignr_epi8(a, b, c) : b;
+}
+#else
+#define v128_align(a, b, c) \
+ ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b))
+#endif
+#else
+#if defined(__SSSE3__)
+#define v128_align(a, b, c) ((c) ? _mm_alignr_epi8(a, b, (uint8_t)(c)) : (b))
+#else
+#define v128_align(a, b, c) \
+ ((c) ? _mm_or_si128(_mm_srli_si128(b, c), _mm_slli_si128(a, 16 - (c))) : (b))
+#endif
+#endif
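/* Editorial note (illustrative, not part of the upstream diff): on either path
 * above the offset ends up as an immediate operand (_mm_alignr_epi8 or the
 * si128 byte shifts), so v128_align should only be called with a compile-time
 * constant, e.g.
 *   v128 r = v128_align(hi, lo, 4);  // ok: literal byte offset
 * A run-time variable offset may fail to compile, as the comment above notes. */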
+
+SIMD_INLINE v128 v128_zero(void) { return _mm_setzero_si128(); }
+
+SIMD_INLINE v128 v128_dup_8(uint8_t x) { return _mm_set1_epi8((char)x); }
+
+SIMD_INLINE v128 v128_dup_16(uint16_t x) { return _mm_set1_epi16((short)x); }
+
+SIMD_INLINE v128 v128_dup_32(uint32_t x) { return _mm_set1_epi32((int)x); }
+
+SIMD_INLINE v128 v128_dup_64(uint64_t x) {
+ // _mm_set_pi64x and _mm_cvtsi64x_si64 missing in some compilers
+ return _mm_set_epi32((int32_t)(x >> 32), (int32_t)x, (int32_t)(x >> 32),
+ (int32_t)x);
+}
+
+SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return _mm_add_epi8(a, b); }
+
+SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return _mm_add_epi16(a, b); }
+
+SIMD_INLINE v128 v128_sadd_u8(v128 a, v128 b) { return _mm_adds_epu8(a, b); }
+
+SIMD_INLINE v128 v128_sadd_s8(v128 a, v128 b) { return _mm_adds_epi8(a, b); }
+
+SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return _mm_adds_epi16(a, b); }
+
+SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return _mm_add_epi32(a, b); }
+
+SIMD_INLINE v128 v128_add_64(v128 a, v128 b) { return _mm_add_epi64(a, b); }
+
+SIMD_INLINE v128 v128_padd_s16(v128 a) {
+ return _mm_madd_epi16(a, _mm_set1_epi16(1));
+}
+
+SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return _mm_sub_epi8(a, b); }
+
+SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return _mm_subs_epu8(a, b); }
+
+SIMD_INLINE v128 v128_ssub_s8(v128 a, v128 b) { return _mm_subs_epi8(a, b); }
+
+SIMD_INLINE v128 v128_sub_16(v128 a, v128 b) { return _mm_sub_epi16(a, b); }
+
+SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return _mm_subs_epi16(a, b); }
+
+SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return _mm_subs_epu16(a, b); }
+
+SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return _mm_sub_epi32(a, b); }
+
+SIMD_INLINE v128 v128_sub_64(v128 a, v128 b) { return _mm_sub_epi64(a, b); }
+
+SIMD_INLINE v128 v128_abs_s16(v128 a) {
+#if defined(__SSSE3__)
+ return _mm_abs_epi16(a);
+#else
+ return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a));
+#endif
+}
+
+SIMD_INLINE v128 v128_abs_s8(v128 a) {
+#if defined(__SSSE3__)
+ return _mm_abs_epi8(a);
+#else
+ v128 sign = _mm_cmplt_epi8(a, _mm_setzero_si128());
+ return _mm_xor_si128(sign, _mm_add_epi8(a, sign));
+#endif
+}
+
+SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) {
+ return _mm_unpacklo_epi8(b, a);
+}
+
+SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) {
+ return _mm_unpackhi_epi8(b, a);
+}
+
+SIMD_INLINE v128 v128_ziplo_16(v128 a, v128 b) {
+ return _mm_unpacklo_epi16(b, a);
+}
+
+SIMD_INLINE v128 v128_ziphi_16(v128 a, v128 b) {
+ return _mm_unpackhi_epi16(b, a);
+}
+
+SIMD_INLINE v128 v128_ziplo_32(v128 a, v128 b) {
+ return _mm_unpacklo_epi32(b, a);
+}
+
+SIMD_INLINE v128 v128_ziphi_32(v128 a, v128 b) {
+ return _mm_unpackhi_epi32(b, a);
+}
+
+SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) {
+ return _mm_unpacklo_epi64(b, a);
+}
+
+SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) {
+ return _mm_unpackhi_epi64(b, a);
+}
+
+SIMD_INLINE v128 v128_zip_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); }
+
+SIMD_INLINE v128 v128_zip_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); }
+
+SIMD_INLINE v128 v128_zip_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); }
+
+SIMD_INLINE v128 v128_unziphi_8(v128 a, v128 b) {
+ return _mm_packs_epi16(_mm_srai_epi16(b, 8), _mm_srai_epi16(a, 8));
+}
+
+SIMD_INLINE v128 v128_unziplo_8(v128 a, v128 b) {
+#if defined(__SSSE3__)
+#ifdef __x86_64__
+ v128 order = _mm_cvtsi64_si128(0x0e0c0a0806040200LL);
+#else
+ v128 order = _mm_set_epi32(0, 0, 0x0e0c0a08, 0x06040200);
+#endif
+ return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order),
+ _mm_shuffle_epi8(a, order));
+#else
+ return v128_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1));
+#endif
+}
+
+SIMD_INLINE v128 v128_unziphi_16(v128 a, v128 b) {
+ return _mm_packs_epi32(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16));
+}
+
+SIMD_INLINE v128 v128_unziplo_16(v128 a, v128 b) {
+#if defined(__SSSE3__)
+#ifdef __x86_64__
+ v128 order = _mm_cvtsi64_si128(0x0d0c090805040100LL);
+#else
+ v128 order = _mm_set_epi32(0, 0, 0x0d0c0908, 0x05040100);
+#endif
+ return _mm_unpacklo_epi64(_mm_shuffle_epi8(b, order),
+ _mm_shuffle_epi8(a, order));
+#else
+ return v128_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2));
+#endif
+}
+
+SIMD_INLINE v128 v128_unziphi_32(v128 a, v128 b) {
+ return _mm_castps_si128(_mm_shuffle_ps(
+ _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(3, 1, 3, 1)));
+}
+
+SIMD_INLINE v128 v128_unziplo_32(v128 a, v128 b) {
+ return _mm_castps_si128(_mm_shuffle_ps(
+ _mm_castsi128_ps(b), _mm_castsi128_ps(a), _MM_SHUFFLE(2, 0, 2, 0)));
+}
+
+SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) {
+ return _mm_unpacklo_epi8(a, _mm_setzero_si128());
+}
+
+SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) {
+ return _mm_unpacklo_epi8(a, _mm_setzero_si128());
+}
+
+SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) {
+ return _mm_unpackhi_epi8(a, _mm_setzero_si128());
+}
+
+SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) {
+ return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
+}
+
+SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) {
+ return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
+}
+
+SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) {
+ return _mm_srai_epi16(_mm_unpackhi_epi8(a, a), 8);
+}
+
+SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
+ return _mm_packs_epi32(b, a);
+}
+
+SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) {
+#if defined(__SSE4_1__)
+ return _mm_packus_epi32(b, a);
+#else
+ return v128_from_v64(v64_pack_s32_u16(v128_high_v64(a), v128_low_v64(a)),
+ v64_pack_s32_u16(v128_high_v64(b), v128_low_v64(b)));
+#endif
+}
+
+SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
+ return _mm_packus_epi16(b, a);
+}
+
+SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) {
+ return _mm_packs_epi16(b, a);
+}
+
+SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) {
+ return _mm_unpacklo_epi16(a, _mm_setzero_si128());
+}
+
+SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) {
+ return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
+}
+
+SIMD_INLINE v128 v128_unpacklo_u16_s32(v128 a) {
+ return _mm_unpacklo_epi16(a, _mm_setzero_si128());
+}
+
+SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) {
+ return _mm_srai_epi32(_mm_unpacklo_epi16(a, a), 16);
+}
+
+SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) {
+ return _mm_unpackhi_epi16(a, _mm_setzero_si128());
+}
+
+SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) {
+ return _mm_srai_epi32(_mm_unpackhi_epi16(a, a), 16);
+}
+
+SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
+#if defined(__SSSE3__)
+ return _mm_shuffle_epi8(x, pattern);
+#else
+ v128 output;
+ unsigned char *input = (unsigned char *)&x;
+ unsigned char *index = (unsigned char *)&pattern;
+ unsigned char *selected = (unsigned char *)&output;
+ int counter;
+
+ for (counter = 0; counter < 16; counter++) {
+ selected[counter] = input[index[counter] & 15];
+ }
+
+ return output;
+#endif
+}
+
+SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
+ v128 t1 = _mm_madd_epi16(v128_unpackhi_s8_s16(a), v128_unpackhi_u8_s16(b));
+ v128 t2 = _mm_madd_epi16(v128_unpacklo_s8_s16(a), v128_unpacklo_u8_s16(b));
+ v128 t = v128_add_32(t1, t2);
+ t = v128_add_32(t, _mm_srli_si128(t, 8));
+ t = v128_add_32(t, _mm_srli_si128(t, 4));
+ return (int32_t)v128_low_u32(t);
+}
+
+SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
+ v128 r = _mm_madd_epi16(a, b);
+#if defined(__SSE4_1__) && defined(__x86_64__)
+ v128 c = _mm_add_epi64(_mm_cvtepi32_epi64(r),
+ _mm_cvtepi32_epi64(_mm_srli_si128(r, 8)));
+ return _mm_cvtsi128_si64(_mm_add_epi64(c, _mm_srli_si128(c, 8)));
+#else
+ return (int64_t)_mm_cvtsi128_si32(r) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12));
+#endif
+}
+
+SIMD_INLINE uint64_t v128_hadd_u8(v128 a) {
+ v128 t = _mm_sad_epu8(a, _mm_setzero_si128());
+ return v64_low_u32(v128_low_v64(t)) + v64_low_u32(v128_high_v64(t));
+}
+
+typedef v128 sad128_internal;
+
+SIMD_INLINE sad128_internal v128_sad_u8_init(void) {
+ return _mm_setzero_si128();
+}
+
+/* Implementation dependent return value. Result must be finalised with
+   v128_sad_u8_sum().
+ The result for more than 32 v128_sad_u8() calls is undefined. */
+SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) {
+ return _mm_add_epi64(s, _mm_sad_epu8(a, b));
+}
+
+SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
+ return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s)));
+}
+
+typedef int32_t ssd128_internal;
+
+SIMD_INLINE ssd128_internal v128_ssd_u8_init(void) { return 0; }
+
+/* Implementation dependent return value. Result must be finalised with
+ * v128_ssd_u8_sum(). */
+SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
+ v128 z = _mm_setzero_si128();
+ v128 l = _mm_sub_epi16(_mm_unpacklo_epi8(a, z), _mm_unpacklo_epi8(b, z));
+ v128 h = _mm_sub_epi16(_mm_unpackhi_epi8(a, z), _mm_unpackhi_epi8(b, z));
+ v128 rl = _mm_madd_epi16(l, l);
+ v128 rh = _mm_madd_epi16(h, h);
+ v128 r = _mm_add_epi32(rl, rh);
+ r = _mm_add_epi32(r, _mm_srli_si128(r, 8));
+ r = _mm_add_epi32(r, _mm_srli_si128(r, 4));
+ return s + _mm_cvtsi128_si32(r);
+}
+
+SIMD_INLINE int32_t v128_ssd_u8_sum(ssd128_internal s) { return s; }
+
+SIMD_INLINE v128 v128_or(v128 a, v128 b) { return _mm_or_si128(a, b); }
+
+SIMD_INLINE v128 v128_xor(v128 a, v128 b) { return _mm_xor_si128(a, b); }
+
+SIMD_INLINE v128 v128_and(v128 a, v128 b) { return _mm_and_si128(a, b); }
+
+SIMD_INLINE v128 v128_andn(v128 a, v128 b) { return _mm_andnot_si128(b, a); }
+
+SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) {
+ v64 lo_bits = v64_mullo_s16(a, b);
+ v64 hi_bits = v64_mulhi_s16(a, b);
+ return v128_from_v64(v64_ziphi_16(hi_bits, lo_bits),
+ v64_ziplo_16(hi_bits, lo_bits));
+}
+
+SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) {
+ return _mm_mullo_epi16(a, b);
+}
+
+SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
+ return _mm_mulhi_epi16(a, b);
+}
+
+SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
+#if defined(__SSE4_1__)
+ return _mm_mullo_epi32(a, b);
+#else
+ return _mm_unpacklo_epi32(
+ _mm_shuffle_epi32(_mm_mul_epu32(a, b), 8),
+ _mm_shuffle_epi32(
+ _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)), 8));
+#endif
+}
+
+SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
+ v128 r = v128_mullo_s32(a, b);
+ return (int64_t)_mm_cvtsi128_si32(r) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12));
+}
+
+SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return _mm_madd_epi16(a, b); }
+
+SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) {
+#if defined(__SSSE3__)
+ return _mm_maddubs_epi16(a, b);
+#else
+ return _mm_packs_epi32(
+ _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()),
+ _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8)),
+ _mm_madd_epi16(_mm_unpackhi_epi8(a, _mm_setzero_si128()),
+ _mm_srai_epi16(_mm_unpackhi_epi8(b, b), 8)));
+#endif
+}
+
+SIMD_INLINE v128 v128_padd_u8(v128 a) {
+ return v128_madd_us8(a, _mm_set1_epi8(1));
+}
+
+SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return _mm_avg_epu8(a, b); }
+
+SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) {
+ return _mm_sub_epi8(_mm_avg_epu8(a, b),
+ _mm_and_si128(_mm_xor_si128(a, b), v128_dup_8(1)));
+}
+
+SIMD_INLINE v128 v128_rdavg_u16(v128 a, v128 b) {
+ return _mm_sub_epi16(_mm_avg_epu16(a, b),
+ _mm_and_si128(_mm_xor_si128(a, b), v128_dup_16(1)));
+}
+
+SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return _mm_avg_epu16(a, b); }
+
+SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return _mm_min_epu8(a, b); }
+
+SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return _mm_max_epu8(a, b); }
+
+SIMD_INLINE v128 v128_min_s8(v128 a, v128 b) {
+#if defined(__SSE4_1__)
+ return _mm_min_epi8(a, b);
+#else
+ v128 mask = _mm_cmplt_epi8(a, b);
+ return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
+#endif
+}
+
+SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return _mm_movemask_epi8(a); }
+
+SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) {
+#if defined(__SSE4_1__)
+ return _mm_blendv_epi8(a, b, c);
+#else
+ c = _mm_cmplt_epi8(c, v128_zero());
+ return v128_or(v128_and(b, c), v128_andn(a, c));
+#endif
+}
+
+SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) {
+#if defined(__SSE4_1__)
+ return _mm_max_epi8(a, b);
+#else
+ v128 mask = _mm_cmplt_epi8(b, a);
+ return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
+#endif
+}
+
+SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return _mm_min_epi16(a, b); }
+
+SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return _mm_max_epi16(a, b); }
+
+SIMD_INLINE v128 v128_min_s32(v128 a, v128 b) {
+#if defined(__SSE4_1__)
+ return _mm_min_epi32(a, b);
+#else
+ v128 mask = _mm_cmplt_epi32(a, b);
+ return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
+#endif
+}
+
+SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) {
+#if defined(__SSE4_1__)
+ return _mm_max_epi32(a, b);
+#else
+ v128 mask = _mm_cmplt_epi32(b, a);
+ return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
+#endif
+}
+
+SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return _mm_cmpgt_epi8(a, b); }
+
+SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return _mm_cmplt_epi8(a, b); }
+
+SIMD_INLINE v128 v128_cmpeq_8(v128 a, v128 b) { return _mm_cmpeq_epi8(a, b); }
+
+SIMD_INLINE v128 v128_cmpgt_s16(v128 a, v128 b) {
+ return _mm_cmpgt_epi16(a, b);
+}
+
+SIMD_INLINE v128 v128_cmplt_s16(v128 a, v128 b) {
+ return _mm_cmplt_epi16(a, b);
+}
+
+SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return _mm_cmpeq_epi32(a, b); }
+
+SIMD_INLINE v128 v128_cmpgt_s32(v128 a, v128 b) {
+ return _mm_cmpgt_epi32(a, b);
+}
+
+SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) {
+ return _mm_cmplt_epi32(a, b);
+}
+
+SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return _mm_cmpeq_epi16(a, b); }
+
+SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
+ return _mm_and_si128(_mm_set1_epi8((char)(0xff << c)),
+ _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c)));
+}
+
+SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) {
+ return _mm_and_si128(_mm_set1_epi8((char)(0xff >> c)),
+ _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c)));
+}
+
+SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) {
+ __m128i x = _mm_cvtsi32_si128((int)(c + 8));
+ return _mm_packs_epi16(_mm_sra_epi16(_mm_unpacklo_epi8(a, a), x),
+ _mm_sra_epi16(_mm_unpackhi_epi8(a, a), x));
+}
+
+SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) {
+ return _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c));
+}
+
+SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) {
+ return _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c));
+}
+
+SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) {
+ return _mm_sra_epi16(a, _mm_cvtsi32_si128((int)c));
+}
+
+SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) {
+ return _mm_sll_epi32(a, _mm_cvtsi32_si128((int)c));
+}
+
+SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) {
+ return _mm_srl_epi32(a, _mm_cvtsi32_si128((int)c));
+}
+
+SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
+ return _mm_sra_epi32(a, _mm_cvtsi32_si128((int)c));
+}
+
+SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) {
+ return _mm_sll_epi64(a, _mm_cvtsi32_si128((int)c));
+}
+
+SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) {
+ return _mm_srl_epi64(a, _mm_cvtsi32_si128((int)c));
+}
+
+SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
+  // _mm_sra_epi64 needs AVX-512, so emulate the 64-bit arithmetic right
+  // shift with scalar shifts on each half.
+ return v128_from_64((uint64_t)((int64_t)v64_u64(v128_high_v64(a)) >> c),
+ (uint64_t)((int64_t)v64_u64(v128_low_v64(a)) >> c));
+ // return _mm_sra_epi64(a, _mm_cvtsi32_si128((int)c));
+}
+
+/* These intrinsics require immediate values, so we must use #defines
+ to enforce that. */
+#define v128_shl_n_byte(a, c) _mm_slli_si128(a, (c)&127)
+#define v128_shr_n_byte(a, c) _mm_srli_si128(a, (c)&127)
+#define v128_shl_n_8(a, c) \
+ _mm_and_si128(_mm_set1_epi8((char)(0xff << (c))), _mm_slli_epi16(a, c))
+#define v128_shr_n_u8(a, c) \
+ _mm_and_si128(_mm_set1_epi8((char)(0xff >> (c))), _mm_srli_epi16(a, c))
+#define v128_shr_n_s8(a, c) \
+ _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), \
+ _mm_srai_epi16(_mm_unpackhi_epi8(a, a), (c) + 8))
+#define v128_shl_n_16(a, c) _mm_slli_epi16(a, c)
+#define v128_shr_n_u16(a, c) _mm_srli_epi16(a, c)
+#define v128_shr_n_s16(a, c) _mm_srai_epi16(a, c)
+#define v128_shl_n_32(a, c) _mm_slli_epi32(a, c)
+#define v128_shr_n_u32(a, c) _mm_srli_epi32(a, c)
+#define v128_shr_n_s32(a, c) _mm_srai_epi32(a, c)
+#define v128_shl_n_64(a, c) _mm_slli_epi64(a, c)
+#define v128_shr_n_u64(a, c) _mm_srli_epi64(a, c)
+#define v128_shr_n_s64(a, c) \
+  v128_shr_s64(a, c)  // _mm_srai_epi64 needs AVX-512; see v128_shr_s64
+
+typedef v128 sad128_internal_u16;
+
+SIMD_INLINE sad128_internal_u16 v128_sad_u16_init(void) { return v128_zero(); }
+
+/* Implementation dependent return value. Result must be finalised with
+ * v128_sad_u16_sum(). */
+SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a,
+ v128 b) {
+#if defined(__SSE4_1__)
+ v128 t = v128_sub_16(_mm_max_epu16(a, b), _mm_min_epu16(a, b));
+#else
+ v128 t = v128_cmplt_s16(v128_xor(a, v128_dup_16(32768)),
+ v128_xor(b, v128_dup_16(32768)));
+ t = v128_sub_16(v128_or(v128_and(b, t), v128_andn(a, t)),
+ v128_or(v128_and(a, t), v128_andn(b, t)));
+#endif
+ return v128_add_32(
+ s, v128_add_32(v128_unpackhi_u16_s32(t), v128_unpacklo_u16_s32(t)));
+}
+
+SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) {
+ return v128_low_u32(s) + v128_low_u32(v128_shr_n_byte(s, 4)) +
+ v128_low_u32(v128_shr_n_byte(s, 8)) +
+ v128_low_u32(v128_shr_n_byte(s, 12));
+}
+
+typedef v128 ssd128_internal_s16;
+
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init(void) { return v128_zero(); }
+
+/* Implementation dependent return value. Result must be finalised with
+ * v128_ssd_s16_sum(). */
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
+ v128 b) {
+ v128 d = v128_sub_16(a, b);
+ d = v128_madd_s16(d, d);
+ return v128_add_64(s, v128_add_64(_mm_unpackhi_epi32(d, v128_zero()),
+ _mm_unpacklo_epi32(d, v128_zero())));
+}
+
+SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
+ return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s));
+}
+
+#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_X86_H_
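The fixed-count shift forms above are macros because the underlying SSE2 intrinsics take immediate operands, so their count must be a compile-time constant; the plain v128_shl_*/v128_shr_* functions accept a run-time count instead. A short sketch (illustrative only, not part of the upstream diff; the function name shift_demo is hypothetical):

static v128 shift_demo(v128 x) {
  v128 a = v128_shr_n_u16(x, 4);  // ok: literal immediate, expands to _mm_srli_epi16
  unsigned int c = 4;
  v128 b = v128_shr_u16(x, c);    // ok: run-time count via _mm_srl_epi16
  return v128_xor(a, b);          // a and b are equal here, so this is all zero
}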
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics.h b/third_party/aom/aom_dsp/simd/v256_intrinsics.h
new file mode 100644
index 0000000000..17e36eed61
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics.h
@@ -0,0 +1,377 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_
+#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "aom_dsp/simd/v256_intrinsics_c.h"
+#include "aom_dsp/simd/v128_intrinsics.h"
+#include "aom_dsp/simd/v64_intrinsics.h"
+
+/* Fallback to plain, unoptimised C. */
+
+typedef c_v256 v256;
+
+SIMD_INLINE uint32_t v256_low_u32(v256 a) { return c_v256_low_u32(a); }
+SIMD_INLINE v64 v256_low_v64(v256 a) { return c_v256_low_v64(a); }
+SIMD_INLINE uint64_t v256_low_u64(v256 a) { return c_v256_low_u64(a); }
+SIMD_INLINE v128 v256_low_v128(v256 a) { return c_v256_low_v128(a); }
+SIMD_INLINE v128 v256_high_v128(v256 a) { return c_v256_high_v128(a); }
+SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
+ return c_v256_from_v128(hi, lo);
+}
+SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
+ return c_v256_from_64(a, b, c, d);
+}
+SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
+ return c_v256_from_v64(a, b, c, d);
+}
+
+SIMD_INLINE v256 v256_load_unaligned(const void *p) {
+ return c_v256_load_unaligned(p);
+}
+SIMD_INLINE v256 v256_load_aligned(const void *p) {
+ return c_v256_load_aligned(p);
+}
+
+SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
+ c_v256_store_unaligned(p, a);
+}
+SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
+ c_v256_store_aligned(p, a);
+}
+
+SIMD_INLINE v256 v256_align(v256 a, v256 b, unsigned int c) {
+ return c_v256_align(a, b, c);
+}
+
+SIMD_INLINE v256 v256_zero(void) { return c_v256_zero(); }
+SIMD_INLINE v256 v256_dup_8(uint8_t x) { return c_v256_dup_8(x); }
+SIMD_INLINE v256 v256_dup_16(uint16_t x) { return c_v256_dup_16(x); }
+SIMD_INLINE v256 v256_dup_32(uint32_t x) { return c_v256_dup_32(x); }
+SIMD_INLINE v256 v256_dup_64(uint64_t x) { return c_v256_dup_64(x); }
+
+SIMD_INLINE c_sad256_internal v256_sad_u8_init(void) {
+ return c_v256_sad_u8_init();
+}
+SIMD_INLINE c_sad256_internal v256_sad_u8(c_sad256_internal s, v256 a, v256 b) {
+ return c_v256_sad_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v256_sad_u8_sum(c_sad256_internal s) {
+ return c_v256_sad_u8_sum(s);
+}
+SIMD_INLINE c_ssd256_internal v256_ssd_u8_init(void) {
+ return c_v256_ssd_u8_init();
+}
+SIMD_INLINE c_ssd256_internal v256_ssd_u8(c_ssd256_internal s, v256 a, v256 b) {
+ return c_v256_ssd_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v256_ssd_u8_sum(c_ssd256_internal s) {
+ return c_v256_ssd_u8_sum(s);
+}
+
+SIMD_INLINE c_ssd256_internal_s16 v256_ssd_s16_init(void) {
+ return c_v256_ssd_s16_init();
+}
+SIMD_INLINE c_ssd256_internal_s16 v256_ssd_s16(c_ssd256_internal_s16 s, v256 a,
+ v256 b) {
+ return c_v256_ssd_s16(s, a, b);
+}
+SIMD_INLINE uint64_t v256_ssd_s16_sum(c_ssd256_internal_s16 s) {
+ return c_v256_ssd_s16_sum(s);
+}
+
+SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
+ return c_v256_dotp_su8(a, b);
+}
+SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
+ return c_v256_dotp_s16(a, b);
+}
+SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
+ return c_v256_dotp_s32(a, b);
+}
+SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { return c_v256_hadd_u8(a); }
+
+SIMD_INLINE v256 v256_or(v256 a, v256 b) { return c_v256_or(a, b); }
+SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return c_v256_xor(a, b); }
+SIMD_INLINE v256 v256_and(v256 a, v256 b) { return c_v256_and(a, b); }
+SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return c_v256_andn(a, b); }
+
+SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return c_v256_add_8(a, b); }
+SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return c_v256_add_16(a, b); }
+SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { return c_v256_sadd_s8(a, b); }
+SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { return c_v256_sadd_u8(a, b); }
+SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { return c_v256_sadd_s16(a, b); }
+SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return c_v256_add_32(a, b); }
+SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { return c_v256_add_64(a, b); }
+SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { return c_v256_sub_64(a, b); }
+SIMD_INLINE v256 v256_padd_u8(v256 a) { return c_v256_padd_u8(a); }
+SIMD_INLINE v256 v256_padd_s16(v256 a) { return c_v256_padd_s16(a); }
+SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return c_v256_sub_8(a, b); }
+SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return c_v256_ssub_u8(a, b); }
+SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return c_v256_ssub_s8(a, b); }
+SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return c_v256_sub_16(a, b); }
+SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) { return c_v256_ssub_s16(a, b); }
+SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) { return c_v256_ssub_u16(a, b); }
+SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return c_v256_sub_32(a, b); }
+SIMD_INLINE v256 v256_abs_s16(v256 a) { return c_v256_abs_s16(a); }
+SIMD_INLINE v256 v256_abs_s8(v256 a) { return c_v256_abs_s8(a); }
+
+SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) { return c_v256_mul_s16(a, b); }
+SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
+ return c_v256_mullo_s16(a, b);
+}
+SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
+ return c_v256_mulhi_s16(a, b);
+}
+SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
+ return c_v256_mullo_s32(a, b);
+}
+SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { return c_v256_madd_s16(a, b); }
+SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { return c_v256_madd_us8(a, b); }
+
+SIMD_INLINE uint32_t v256_movemask_8(v256 a) { return c_v256_movemask_8(a); }
+SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
+ return c_v256_blend_8(a, b, c);
+}
+
+SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return c_v256_avg_u8(a, b); }
+SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { return c_v256_rdavg_u8(a, b); }
+SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) {
+ return c_v256_rdavg_u16(a, b);
+}
+SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return c_v256_avg_u16(a, b); }
+SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return c_v256_min_u8(a, b); }
+SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return c_v256_max_u8(a, b); }
+SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return c_v256_min_s8(a, b); }
+SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return c_v256_max_s8(a, b); }
+SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return c_v256_min_s16(a, b); }
+SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return c_v256_max_s16(a, b); }
+SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { return c_v256_min_s32(a, b); }
+SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { return c_v256_max_s32(a, b); }
+
+SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { return c_v256_ziplo_8(a, b); }
+SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { return c_v256_ziphi_8(a, b); }
+SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) { return c_v256_ziplo_16(a, b); }
+SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) { return c_v256_ziphi_16(a, b); }
+SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) { return c_v256_ziplo_32(a, b); }
+SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) { return c_v256_ziphi_32(a, b); }
+SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) { return c_v256_ziplo_64(a, b); }
+SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) { return c_v256_ziphi_64(a, b); }
+SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
+ return c_v256_ziplo_128(a, b);
+}
+SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
+ return c_v256_ziphi_128(a, b);
+}
+SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) { return c_v256_zip_8(a, b); }
+SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) { return c_v256_zip_16(a, b); }
+SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) { return c_v256_zip_32(a, b); }
+SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
+ return c_v256_unziplo_8(a, b);
+}
+SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
+ return c_v256_unziphi_8(a, b);
+}
+SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
+ return c_v256_unziplo_16(a, b);
+}
+SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
+ return c_v256_unziphi_16(a, b);
+}
+SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
+ return c_v256_unziplo_32(a, b);
+}
+SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
+ return c_v256_unziphi_32(a, b);
+}
+SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) {
+ return c_v256_unziplo_64(a, b);
+}
+SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) {
+ return c_v256_unziphi_64(a, b);
+}
+SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return c_v256_unpack_u8_s16(a); }
+SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
+ return c_v256_unpacklo_u8_s16(a);
+}
+SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
+ return c_v256_unpackhi_u8_s16(a);
+}
+SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) { return c_v256_unpack_s8_s16(a); }
+SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
+ return c_v256_unpacklo_s8_s16(a);
+}
+SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
+ return c_v256_unpackhi_s8_s16(a);
+}
+SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
+ return c_v256_pack_s32_s16(a, b);
+}
+SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) {
+ return c_v256_pack_s32_u16(a, b);
+}
+SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
+ return c_v256_pack_s16_u8(a, b);
+}
+SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
+ return c_v256_pack_s16_s8(a, b);
+}
+SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
+ return c_v256_unpack_u16_s32(a);
+}
+SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
+ return c_v256_unpack_s16_s32(a);
+}
+SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
+ return c_v256_unpacklo_u16_s32(a);
+}
+SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
+ return c_v256_unpacklo_s16_s32(a);
+}
+SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
+ return c_v256_unpackhi_u16_s32(a);
+}
+SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
+ return c_v256_unpackhi_s16_s32(a);
+}
+SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
+ return c_v256_shuffle_8(a, pattern);
+}
+SIMD_INLINE v256 v256_wideshuffle_8(v256 a, v256 b, v256 pattern) {
+ return c_v256_wideshuffle_8(a, b, pattern);
+}
+SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
+ return c_v256_pshuffle_8(a, pattern);
+}
+
+SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) { return c_v256_cmpgt_s8(a, b); }
+SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) { return c_v256_cmplt_s8(a, b); }
+SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) { return c_v256_cmpeq_8(a, b); }
+SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
+ return c_v256_cmpgt_s16(a, b);
+}
+SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
+ return c_v256_cmplt_s16(a, b);
+}
+SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { return c_v256_cmpeq_16(a, b); }
+SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) { return c_v256_cmpeq_32(a, b); }
+
+SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) {
+ return c_v256_cmpgt_s32(a, b);
+}
+SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) {
+ return c_v256_cmplt_s32(a, b);
+}
+SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
+ return c_v256_shl_8(a, c);
+}
+SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) {
+ return c_v256_shr_u8(a, c);
+}
+SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) {
+ return c_v256_shr_s8(a, c);
+}
+SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) {
+ return c_v256_shl_16(a, c);
+}
+SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) {
+ return c_v256_shr_u16(a, c);
+}
+SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) {
+ return c_v256_shr_s16(a, c);
+}
+SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) {
+ return c_v256_shl_32(a, c);
+}
+SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) {
+ return c_v256_shr_u32(a, c);
+}
+SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
+ return c_v256_shr_s32(a, c);
+}
+SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) {
+ return c_v256_shl_64(a, c);
+}
+SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) {
+ return c_v256_shr_u64(a, c);
+}
+SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) {
+ return c_v256_shr_s64(a, c);
+}
+
+SIMD_INLINE v256 v256_shr_n_byte(v256 a, unsigned int n) {
+ return c_v256_shr_n_byte(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_byte(v256 a, unsigned int n) {
+ return c_v256_shl_n_byte(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_8(v256 a, unsigned int n) {
+ return c_v256_shl_n_8(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_16(v256 a, unsigned int n) {
+ return c_v256_shl_n_16(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_32(v256 a, unsigned int n) {
+ return c_v256_shl_n_32(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_64(v256 a, unsigned int n) {
+ return c_v256_shl_n_64(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_u8(v256 a, unsigned int n) {
+ return c_v256_shr_n_u8(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_u16(v256 a, unsigned int n) {
+ return c_v256_shr_n_u16(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_u32(v256 a, unsigned int n) {
+ return c_v256_shr_n_u32(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_u64(v256 a, unsigned int n) {
+ return c_v256_shr_n_u64(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_s8(v256 a, unsigned int n) {
+ return c_v256_shr_n_s8(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_s16(v256 a, unsigned int n) {
+ return c_v256_shr_n_s16(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_s32(v256 a, unsigned int n) {
+ return c_v256_shr_n_s32(a, n);
+}
+SIMD_INLINE v256 v256_shr_n_s64(v256 a, unsigned int n) {
+ return c_v256_shr_n_s64(a, n);
+}
+
+SIMD_INLINE v256 v256_shr_n_word(v256 a, unsigned int n) {
+ return c_v256_shr_n_word(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_word(v256 a, unsigned int n) {
+ return c_v256_shl_n_word(a, n);
+}
+
+typedef uint32_t sad256_internal_u16;
+SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) {
+ return c_v256_sad_u16_init();
+}
+SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
+ v256 b) {
+ return c_v256_sad_u16(s, a, b);
+}
+SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) {
+ return c_v256_sad_u16_sum(s);
+}
+
+#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_H_
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h
new file mode 100644
index 0000000000..60d0d53f6f
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_c.h
@@ -0,0 +1,963 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_
+#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+
+#include "aom_dsp/simd/v128_intrinsics_c.h"
+
+typedef union {
+ uint8_t u8[32];
+ uint16_t u16[16];
+ uint32_t u32[8];
+ uint64_t u64[4];
+ int8_t s8[32];
+ int16_t s16[16];
+ int32_t s32[8];
+ int64_t s64[4];
+ c_v64 v64[4];
+ c_v128 v128[2];
+} c_v256;
+
+SIMD_INLINE uint32_t c_v256_low_u32(c_v256 a) { return a.u32[0]; }
+
+SIMD_INLINE c_v64 c_v256_low_v64(c_v256 a) { return a.v64[0]; }
+
+SIMD_INLINE uint64_t c_v256_low_u64(c_v256 a) { return a.u64[0]; }
+
+SIMD_INLINE c_v128 c_v256_low_v128(c_v256 a) { return a.v128[0]; }
+
+SIMD_INLINE c_v128 c_v256_high_v128(c_v256 a) { return a.v128[1]; }
+
+SIMD_INLINE c_v256 c_v256_from_v128(c_v128 hi, c_v128 lo) {
+ c_v256 t;
+ t.v128[1] = hi;
+ t.v128[0] = lo;
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_from_64(uint64_t a, uint64_t b, uint64_t c,
+ uint64_t d) {
+ c_v256 t;
+ t.u64[3] = a;
+ t.u64[2] = b;
+ t.u64[1] = c;
+ t.u64[0] = d;
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_from_v64(c_v64 a, c_v64 b, c_v64 c, c_v64 d) {
+ c_v256 t;
+ t.u64[3] = a.u64;
+ t.u64[2] = b.u64;
+ t.u64[1] = c.u64;
+ t.u64[0] = d.u64;
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_load_unaligned(const void *p) {
+ c_v256 t;
+ memcpy(&t, p, 32);
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_load_aligned(const void *p) {
+ if (SIMD_CHECK && (uintptr_t)p & 31) {
+ fprintf(stderr, "Error: unaligned v256 load at %p\n", p);
+ abort();
+ }
+ return c_v256_load_unaligned(p);
+}
+
+SIMD_INLINE void c_v256_store_unaligned(void *p, c_v256 a) {
+ memcpy(p, &a, 32);
+}
+
+SIMD_INLINE void c_v256_store_aligned(void *p, c_v256 a) {
+ if (SIMD_CHECK && (uintptr_t)p & 31) {
+ fprintf(stderr, "Error: unaligned v256 store at %p\n", p);
+ abort();
+ }
+ c_v256_store_unaligned(p, a);
+}
+
+SIMD_INLINE c_v256 c_v256_zero(void) {
+ c_v256 t;
+ t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = 0;
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_dup_8(uint8_t x) {
+ c_v256 t;
+ t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_8(x);
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_dup_16(uint16_t x) {
+ c_v256 t;
+ t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_16(x);
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_dup_32(uint32_t x) {
+ c_v256 t;
+ t.v64[3] = t.v64[2] = t.v64[1] = t.v64[0] = c_v64_dup_32(x);
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_dup_64(uint64_t x) {
+ c_v256 t;
+ t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = x;
+ return t;
+}
+
+SIMD_INLINE int64_t c_v256_dotp_su8(c_v256 a, c_v256 b) {
+ return c_v128_dotp_su8(a.v128[1], b.v128[1]) +
+ c_v128_dotp_su8(a.v128[0], b.v128[0]);
+}
+
+SIMD_INLINE int64_t c_v256_dotp_s16(c_v256 a, c_v256 b) {
+ return c_v128_dotp_s16(a.v128[1], b.v128[1]) +
+ c_v128_dotp_s16(a.v128[0], b.v128[0]);
+}
+
+SIMD_INLINE int64_t c_v256_dotp_s32(c_v256 a, c_v256 b) {
+ return c_v128_dotp_s32(a.v128[1], b.v128[1]) +
+ c_v128_dotp_s32(a.v128[0], b.v128[0]);
+}
+
+SIMD_INLINE uint64_t c_v256_hadd_u8(c_v256 a) {
+ return c_v128_hadd_u8(a.v128[1]) + c_v128_hadd_u8(a.v128[0]);
+}
+
+typedef struct {
+ uint32_t val;
+ int count;
+} c_sad256_internal;
+
+SIMD_INLINE c_sad256_internal c_v256_sad_u8_init(void) {
+ c_sad256_internal t;
+ t.val = t.count = 0;
+ return t;
+}
+
+/* Implementation dependent return value. Result must be finalised with
+ v256_sad_u8_sum().
+ The result for more than 16 v256_sad_u8() calls is undefined. */
+SIMD_INLINE c_sad256_internal c_v256_sad_u8(c_sad256_internal s, c_v256 a,
+ c_v256 b) {
+ int c;
+ for (c = 0; c < 32; c++)
+ s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
+ s.count++;
+ if (SIMD_CHECK && s.count > 32) {
+ fprintf(stderr,
+ "Error: sad called 32 times returning an undefined result\n");
+ abort();
+ }
+ return s;
+}
+
+SIMD_INLINE uint32_t c_v256_sad_u8_sum(c_sad256_internal s) { return s.val; }
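/* Editorial usage sketch (not part of the upstream patch): the
   init/accumulate/finalise pattern described in the comment above,
   computing the SAD of two hypothetical 64-byte buffers in two
   32-byte steps using only functions defined in this header. */
static uint32_t example_sad_u8_64(const uint8_t *a, const uint8_t *b) {
  c_sad256_internal acc = c_v256_sad_u8_init();
  acc = c_v256_sad_u8(acc, c_v256_load_unaligned(a),
                      c_v256_load_unaligned(b));
  acc = c_v256_sad_u8(acc, c_v256_load_unaligned(a + 32),
                      c_v256_load_unaligned(b + 32));
  return c_v256_sad_u8_sum(acc); /* finalise to obtain the scalar SAD */
}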
+
+typedef uint32_t c_ssd256_internal;
+
+SIMD_INLINE c_ssd256_internal c_v256_ssd_u8_init(void) { return 0; }
+
+/* Implementation dependent return value. Result must be finalised with
+ * v256_ssd_u8_sum(). */
+SIMD_INLINE c_ssd256_internal c_v256_ssd_u8(c_ssd256_internal s, c_v256 a,
+ c_v256 b) {
+ int c;
+ for (c = 0; c < 32; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
+ return s;
+}
+
+SIMD_INLINE uint32_t c_v256_ssd_u8_sum(c_ssd256_internal s) { return s; }
+
+SIMD_INLINE c_v256 c_v256_or(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_or(a.v128[1], b.v128[1]),
+ c_v128_or(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_xor(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_xor(a.v128[1], b.v128[1]),
+ c_v128_xor(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_and(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_and(a.v128[1], b.v128[1]),
+ c_v128_and(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_andn(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_andn(a.v128[1], b.v128[1]),
+ c_v128_andn(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_add_8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_add_8(a.v128[1], b.v128[1]),
+ c_v128_add_8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_add_16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_add_16(a.v128[1], b.v128[1]),
+ c_v128_add_16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sadd_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_sadd_s8(a.v128[1], b.v128[1]),
+ c_v128_sadd_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sadd_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_sadd_u8(a.v128[1], b.v128[1]),
+ c_v128_sadd_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sadd_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_sadd_s16(a.v128[1], b.v128[1]),
+ c_v128_sadd_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_add_32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_add_32(a.v128[1], b.v128[1]),
+ c_v128_add_32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_add_64(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_add_64(a.v128[1], b.v128[1]),
+ c_v128_add_64(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sub_64(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_sub_64(a.v128[1], b.v128[1]),
+ c_v128_sub_64(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_padd_u8(c_v256 a) {
+ c_v256 t;
+ for (int i = 0; i < 16; i++)
+ t.u16[i] = (uint16_t)a.u8[i * 2] + (uint16_t)a.u8[i * 2 + 1];
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_padd_s16(c_v256 a) {
+ c_v256 t;
+ t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1];
+ t.s32[1] = (int32_t)a.s16[2] + (int32_t)a.s16[3];
+ t.s32[2] = (int32_t)a.s16[4] + (int32_t)a.s16[5];
+ t.s32[3] = (int32_t)a.s16[6] + (int32_t)a.s16[7];
+ t.s32[4] = (int32_t)a.s16[8] + (int32_t)a.s16[9];
+ t.s32[5] = (int32_t)a.s16[10] + (int32_t)a.s16[11];
+ t.s32[6] = (int32_t)a.s16[12] + (int32_t)a.s16[13];
+ t.s32[7] = (int32_t)a.s16[14] + (int32_t)a.s16[15];
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_sub_8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_sub_8(a.v128[1], b.v128[1]),
+ c_v128_sub_8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ssub_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ssub_u8(a.v128[1], b.v128[1]),
+ c_v128_ssub_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ssub_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ssub_s8(a.v128[1], b.v128[1]),
+ c_v128_ssub_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sub_16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_sub_16(a.v128[1], b.v128[1]),
+ c_v128_sub_16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ssub_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ssub_s16(a.v128[1], b.v128[1]),
+ c_v128_ssub_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ssub_u16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ssub_u16(a.v128[1], b.v128[1]),
+ c_v128_ssub_u16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sub_32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_sub_32(a.v128[1], b.v128[1]),
+ c_v128_sub_32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_abs_s16(c_v256 a) {
+ return c_v256_from_v128(c_v128_abs_s16(a.v128[1]), c_v128_abs_s16(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_abs_s8(c_v256 a) {
+ return c_v256_from_v128(c_v128_abs_s8(a.v128[1]), c_v128_abs_s8(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_mul_s16(c_v128 a, c_v128 b) {
+ c_v128 lo_bits = c_v128_mullo_s16(a, b);
+ c_v128 hi_bits = c_v128_mulhi_s16(a, b);
+ return c_v256_from_v128(c_v128_ziphi_16(hi_bits, lo_bits),
+ c_v128_ziplo_16(hi_bits, lo_bits));
+}
+
+SIMD_INLINE c_v256 c_v256_mullo_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_mullo_s16(a.v128[1], b.v128[1]),
+ c_v128_mullo_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_mulhi_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_mulhi_s16(a.v128[1], b.v128[1]),
+ c_v128_mulhi_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_mullo_s32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_mullo_s32(a.v128[1], b.v128[1]),
+ c_v128_mullo_s32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_madd_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_madd_s16(a.v128[1], b.v128[1]),
+ c_v128_madd_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_madd_us8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_madd_us8(a.v128[1], b.v128[1]),
+ c_v128_madd_us8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_avg_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_avg_u8(a.v128[1], b.v128[1]),
+ c_v128_avg_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_rdavg_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_rdavg_u8(a.v128[1], b.v128[1]),
+ c_v128_rdavg_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_rdavg_u16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_rdavg_u16(a.v128[1], b.v128[1]),
+ c_v128_rdavg_u16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_avg_u16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_avg_u16(a.v128[1], b.v128[1]),
+ c_v128_avg_u16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_min_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_min_u8(a.v128[1], b.v128[1]),
+ c_v128_min_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_max_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_max_u8(a.v128[1], b.v128[1]),
+ c_v128_max_u8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_min_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_min_s8(a.v128[1], b.v128[1]),
+ c_v128_min_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE uint32_t c_v256_movemask_8(c_v256 a) {
+ return ((uint32_t)(a.s8[31] < 0) << 31) | ((a.s8[30] < 0) << 30) |
+ ((a.s8[29] < 0) << 29) | ((a.s8[28] < 0) << 28) |
+ ((a.s8[27] < 0) << 27) | ((a.s8[26] < 0) << 26) |
+ ((a.s8[25] < 0) << 25) | ((a.s8[24] < 0) << 24) |
+ ((a.s8[23] < 0) << 23) | ((a.s8[22] < 0) << 22) |
+ ((a.s8[21] < 0) << 21) | ((a.s8[20] < 0) << 20) |
+ ((a.s8[19] < 0) << 19) | ((a.s8[18] < 0) << 18) |
+ ((a.s8[17] < 0) << 17) | ((a.s8[16] < 0) << 16) |
+ ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) |
+ ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) |
+ ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) |
+ ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) |
+ ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) |
+ ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) |
+ ((a.s8[0] < 0) << 0);
+}
+
+SIMD_INLINE c_v256 c_v256_blend_8(c_v256 a, c_v256 b, c_v256 c) {
+ c_v256 t;
+ for (int i = 0; i < 32; i++) t.u8[i] = c.s8[i] < 0 ? b.u8[i] : a.u8[i];
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_max_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_max_s8(a.v128[1], b.v128[1]),
+ c_v128_max_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_min_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_min_s16(a.v128[1], b.v128[1]),
+ c_v128_min_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_max_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_max_s16(a.v128[1], b.v128[1]),
+ c_v128_max_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_min_s32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_min_s32(a.v128[1], b.v128[1]),
+ c_v128_min_s32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_max_s32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_max_s32(a.v128[1], b.v128[1]),
+ c_v128_max_s32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziplo_8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_8(a.v128[0], b.v128[0]),
+ c_v128_ziplo_8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziphi_8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_8(a.v128[1], b.v128[1]),
+ c_v128_ziplo_8(a.v128[1], b.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziplo_16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_16(a.v128[0], b.v128[0]),
+ c_v128_ziplo_16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziphi_16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_16(a.v128[1], b.v128[1]),
+ c_v128_ziplo_16(a.v128[1], b.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziplo_32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_32(a.v128[0], b.v128[0]),
+ c_v128_ziplo_32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziphi_32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_32(a.v128[1], b.v128[1]),
+ c_v128_ziplo_32(a.v128[1], b.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziplo_64(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_64(a.v128[0], b.v128[0]),
+ c_v128_ziplo_64(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziphi_64(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_ziphi_64(a.v128[1], b.v128[1]),
+ c_v128_ziplo_64(a.v128[1], b.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_ziplo_128(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(a.v128[0], b.v128[0]);
+}
+
+SIMD_INLINE c_v256 c_v256_ziphi_128(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(a.v128[1], b.v128[1]);
+}
+
+SIMD_INLINE c_v256 c_v256_zip_8(c_v128 a, c_v128 b) {
+ return c_v256_from_v128(c_v128_ziphi_8(a, b), c_v128_ziplo_8(a, b));
+}
+
+SIMD_INLINE c_v256 c_v256_zip_16(c_v128 a, c_v128 b) {
+ return c_v256_from_v128(c_v128_ziphi_16(a, b), c_v128_ziplo_16(a, b));
+}
+
+SIMD_INLINE c_v256 c_v256_zip_32(c_v128 a, c_v128 b) {
+ return c_v256_from_v128(c_v128_ziphi_32(a, b), c_v128_ziplo_32(a, b));
+}
+
+SIMD_INLINE c_v256 _c_v256_unzip_8(c_v256 a, c_v256 b, int mode) {
+ c_v256 t;
+ int i;
+ if (mode) {
+ for (i = 0; i < 16; i++) {
+ t.u8[i] = a.u8[i * 2 + 1];
+ t.u8[i + 16] = b.u8[i * 2 + 1];
+ }
+ } else {
+ for (i = 0; i < 16; i++) {
+ t.u8[i] = b.u8[i * 2];
+ t.u8[i + 16] = a.u8[i * 2];
+ }
+ }
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_unziplo_8(c_v256 a, c_v256 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(a, b, 1)
+ : _c_v256_unzip_8(a, b, 0);
+}
+
+SIMD_INLINE c_v256 c_v256_unziphi_8(c_v256 a, c_v256 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v256_unzip_8(b, a, 0)
+ : _c_v256_unzip_8(b, a, 1);
+}
+
+SIMD_INLINE c_v256 _c_v256_unzip_16(c_v256 a, c_v256 b, int mode) {
+ c_v256 t;
+ int i;
+ if (mode) {
+ for (i = 0; i < 8; i++) {
+ t.u16[i] = a.u16[i * 2 + 1];
+ t.u16[i + 8] = b.u16[i * 2 + 1];
+ }
+ } else {
+ for (i = 0; i < 8; i++) {
+ t.u16[i] = b.u16[i * 2];
+ t.u16[i + 8] = a.u16[i * 2];
+ }
+ }
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_unziplo_16(c_v256 a, c_v256 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(a, b, 1)
+ : _c_v256_unzip_16(a, b, 0);
+}
+
+SIMD_INLINE c_v256 c_v256_unziphi_16(c_v256 a, c_v256 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v256_unzip_16(b, a, 0)
+ : _c_v256_unzip_16(b, a, 1);
+}
+
+SIMD_INLINE c_v256 _c_v256_unzip_32(c_v256 a, c_v256 b, int mode) {
+ c_v256 t;
+ if (mode) {
+ t.u32[7] = b.u32[7];
+ t.u32[6] = b.u32[5];
+ t.u32[5] = b.u32[3];
+ t.u32[4] = b.u32[1];
+ t.u32[3] = a.u32[7];
+ t.u32[2] = a.u32[5];
+ t.u32[1] = a.u32[3];
+ t.u32[0] = a.u32[1];
+ } else {
+ t.u32[7] = a.u32[6];
+ t.u32[6] = a.u32[4];
+ t.u32[5] = a.u32[2];
+ t.u32[4] = a.u32[0];
+ t.u32[3] = b.u32[6];
+ t.u32[2] = b.u32[4];
+ t.u32[1] = b.u32[2];
+ t.u32[0] = b.u32[0];
+ }
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_unziplo_32(c_v256 a, c_v256 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(a, b, 1)
+ : _c_v256_unzip_32(a, b, 0);
+}
+
+SIMD_INLINE c_v256 c_v256_unziphi_32(c_v256 a, c_v256 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v256_unzip_32(b, a, 0)
+ : _c_v256_unzip_32(b, a, 1);
+}
+
+SIMD_INLINE c_v256 _c_v256_unzip_64(c_v256 a, c_v256 b, int mode) {
+ c_v256 t;
+ if (mode) {
+ t.u64[3] = b.u64[3];
+ t.u64[2] = b.u64[1];
+ t.u64[1] = a.u64[3];
+ t.u64[0] = a.u64[1];
+ } else {
+ t.u64[3] = a.u64[2];
+ t.u64[2] = a.u64[0];
+ t.u64[1] = b.u64[2];
+ t.u64[0] = b.u64[0];
+ }
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_unziplo_64(c_v256 a, c_v256 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v256_unzip_64(a, b, 1)
+ : _c_v256_unzip_64(a, b, 0);
+}
+
+SIMD_INLINE c_v256 c_v256_unziphi_64(c_v256 a, c_v256 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v256_unzip_64(b, a, 0)
+ : _c_v256_unzip_64(b, a, 1);
+}
+
+SIMD_INLINE c_v256 c_v256_unpack_u8_s16(c_v128 a) {
+ return c_v256_from_v128(c_v128_unpackhi_u8_s16(a), c_v128_unpacklo_u8_s16(a));
+}
+
+SIMD_INLINE c_v256 c_v256_unpacklo_u8_s16(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[0]),
+ c_v128_unpacklo_u8_s16(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpackhi_u8_s16(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_u8_s16(a.v128[1]),
+ c_v128_unpacklo_u8_s16(a.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpack_s8_s16(c_v128 a) {
+ return c_v256_from_v128(c_v128_unpackhi_s8_s16(a), c_v128_unpacklo_s8_s16(a));
+}
+
+SIMD_INLINE c_v256 c_v256_unpacklo_s8_s16(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_s8_s16(a.v128[0]),
+ c_v128_unpacklo_s8_s16(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpackhi_s8_s16(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_s8_s16(a.v128[1]),
+ c_v128_unpacklo_s8_s16(a.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_pack_s32_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_pack_s32_s16(a.v128[1], a.v128[0]),
+ c_v128_pack_s32_s16(b.v128[1], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_pack_s32_u16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_pack_s32_u16(a.v128[1], a.v128[0]),
+ c_v128_pack_s32_u16(b.v128[1], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_pack_s16_u8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_pack_s16_u8(a.v128[1], a.v128[0]),
+ c_v128_pack_s16_u8(b.v128[1], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_pack_s16_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_pack_s16_s8(a.v128[1], a.v128[0]),
+ c_v128_pack_s16_s8(b.v128[1], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpack_u16_s32(c_v128 a) {
+ return c_v256_from_v128(c_v128_unpackhi_u16_s32(a),
+ c_v128_unpacklo_u16_s32(a));
+}
+
+SIMD_INLINE c_v256 c_v256_unpack_s16_s32(c_v128 a) {
+ return c_v256_from_v128(c_v128_unpackhi_s16_s32(a),
+ c_v128_unpacklo_s16_s32(a));
+}
+
+SIMD_INLINE c_v256 c_v256_unpacklo_u16_s32(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[0]),
+ c_v128_unpacklo_u16_s32(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpacklo_s16_s32(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[0]),
+ c_v128_unpacklo_s16_s32(a.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpackhi_u16_s32(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_u16_s32(a.v128[1]),
+ c_v128_unpacklo_u16_s32(a.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_unpackhi_s16_s32(c_v256 a) {
+ return c_v256_from_v128(c_v128_unpackhi_s16_s32(a.v128[1]),
+ c_v128_unpacklo_s16_s32(a.v128[1]));
+}
+
+SIMD_INLINE c_v256 c_v256_shuffle_8(c_v256 a, c_v256 pattern) {
+ c_v256 t;
+ int c;
+ for (c = 0; c < 32; c++)
+ t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31)
+ : pattern.u8[c] & 31];
+
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_wideshuffle_8(c_v256 a, c_v256 b, c_v256 pattern) {
+ c_v256 t;
+ int c;
+ for (c = 0; c < 32; c++)
+ t.u8[c] = (pattern.u8[c] < 32
+ ? b.u8
+ : a.u8)[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31)
+ : pattern.u8[c] & 31];
+ return t;
+}
+
+// Pairwise / dual-lane shuffle: shuffle two 128 bit lanes.
+SIMD_INLINE c_v256 c_v256_pshuffle_8(c_v256 a, c_v256 pattern) {
+ return c_v256_from_v128(
+ c_v128_shuffle_8(c_v256_high_v128(a), c_v256_high_v128(pattern)),
+ c_v128_shuffle_8(c_v256_low_v128(a), c_v256_low_v128(pattern)));
+}
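/* Editorial sketch (not part of the upstream patch), assuming the usual
   little-endian configuration: with an all-zero pattern the pairwise
   shuffle broadcasts the lowest byte of each 128-bit lane within that
   lane, whereas the full 256-bit shuffle above broadcasts the lowest
   byte of the whole vector to all 32 positions. */
static void example_pshuffle_vs_shuffle(c_v256 a, c_v256 *per_lane,
                                        c_v256 *whole_vector) {
  const c_v256 zero_pattern = c_v256_zero();
  *per_lane = c_v256_pshuffle_8(a, zero_pattern);    /* lane-local lookup */
  *whole_vector = c_v256_shuffle_8(a, zero_pattern); /* 32-byte lookup */
}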
+
+SIMD_INLINE c_v256 c_v256_cmpgt_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmpgt_s8(a.v128[1], b.v128[1]),
+ c_v128_cmpgt_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmplt_s8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmplt_s8(a.v128[1], b.v128[1]),
+ c_v128_cmplt_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmpeq_8(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmpeq_8(a.v128[1], b.v128[1]),
+ c_v128_cmpeq_8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmpgt_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmpgt_s16(a.v128[1], b.v128[1]),
+ c_v128_cmpgt_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmplt_s16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmplt_s16(a.v128[1], b.v128[1]),
+ c_v128_cmplt_s16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmpeq_16(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmpeq_16(a.v128[1], b.v128[1]),
+ c_v128_cmpeq_16(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmpgt_s32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmpgt_s32(a.v128[1], b.v128[1]),
+ c_v128_cmpgt_s32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmplt_s32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmplt_s32(a.v128[1], b.v128[1]),
+ c_v128_cmplt_s32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmpeq_32(c_v256 a, c_v256 b) {
+ return c_v256_from_v128(c_v128_cmpeq_32(a.v128[1], b.v128[1]),
+ c_v128_cmpeq_32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_byte(c_v256 a, unsigned int n) {
+ if (n == 0) return a;
+ if (n < 16)
+ return c_v256_from_v128(c_v128_or(c_v128_shl_n_byte(a.v128[1], n),
+ c_v128_shr_n_byte(a.v128[0], 16 - n)),
+ c_v128_shl_n_byte(a.v128[0], n));
+ else if (n > 16)
+ return c_v256_from_v128(c_v128_shl_n_byte(a.v128[0], n - 16),
+ c_v128_zero());
+ else
+ return c_v256_from_v128(c_v256_low_v128(a), c_v128_zero());
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_byte(c_v256 a, unsigned int n) {
+ if (n == 0) return a;
+ if (n < 16)
+ return c_v256_from_v128(c_v128_shr_n_byte(a.v128[1], n),
+ c_v128_or(c_v128_shr_n_byte(a.v128[0], n),
+ c_v128_shl_n_byte(a.v128[1], 16 - n)));
+ else if (n > 16)
+ return c_v256_from_v128(c_v128_zero(),
+ c_v128_shr_n_byte(a.v128[1], n - 16));
+ else
+ return c_v256_from_v128(c_v128_zero(), c_v256_high_v128(a));
+}
+
+SIMD_INLINE c_v256 c_v256_align(c_v256 a, c_v256 b, unsigned int c) {
+ if (SIMD_CHECK && c > 31) {
+ fprintf(stderr, "Error: undefined alignment %d\n", c);
+ abort();
+ }
+ return c ? c_v256_or(c_v256_shr_n_byte(b, c), c_v256_shl_n_byte(a, 32 - c))
+ : b;
+}
+
+SIMD_INLINE c_v256 c_v256_shl_8(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shl_8(a.v128[1], c),
+ c_v128_shl_8(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_u8(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shr_u8(a.v128[1], c),
+ c_v128_shr_u8(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_s8(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shr_s8(a.v128[1], c),
+ c_v128_shr_s8(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shl_16(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shl_16(a.v128[1], c),
+ c_v128_shl_16(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_u16(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shr_u16(a.v128[1], c),
+ c_v128_shr_u16(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_s16(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shr_s16(a.v128[1], c),
+ c_v128_shr_s16(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shl_32(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shl_32(a.v128[1], c),
+ c_v128_shl_32(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_u32(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shr_u32(a.v128[1], c),
+ c_v128_shr_u32(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_s32(c_v256 a, unsigned int c) {
+ return c_v256_from_v128(c_v128_shr_s32(a.v128[1], c),
+ c_v128_shr_s32(a.v128[0], c));
+}
+
+SIMD_INLINE c_v256 c_v256_shr_s64(c_v256 a, unsigned int n) {
+ c_v256 t;
+ if (SIMD_CHECK && n > 63) {
+ fprintf(stderr, "Error: undefined s64 shift right %d\n", n);
+ abort();
+ }
+ t.s64[3] = a.s64[3] >> n;
+ t.s64[2] = a.s64[2] >> n;
+ t.s64[1] = a.s64[1] >> n;
+ t.s64[0] = a.s64[0] >> n;
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_shr_u64(c_v256 a, unsigned int n) {
+ c_v256 t;
+ if (SIMD_CHECK && n > 63) {
+ fprintf(stderr, "Error: undefined u64 shift right %d\n", n);
+ abort();
+ }
+ t.u64[3] = a.u64[3] >> n;
+ t.u64[2] = a.u64[2] >> n;
+ t.u64[1] = a.u64[1] >> n;
+ t.u64[0] = a.u64[0] >> n;
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_shl_64(c_v256 a, unsigned int n) {
+ c_v256 t;
+ if (SIMD_CHECK && n > 63) {
+ fprintf(stderr, "Error: undefined u64 shift left %d\n", n);
+ abort();
+ }
+ t.u64[3] = a.u64[3] << n;
+ t.u64[2] = a.u64[2] << n;
+ t.u64[1] = a.u64[1] << n;
+ t.u64[0] = a.u64[0] << n;
+ return t;
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_8(c_v256 a, unsigned int n) {
+ return c_v256_shl_8(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_16(c_v256 a, unsigned int n) {
+ return c_v256_shl_16(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_32(c_v256 a, unsigned int n) {
+ return c_v256_shl_32(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shl_n_64(c_v256 a, unsigned int n) {
+ return c_v256_shl_64(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_u8(c_v256 a, unsigned int n) {
+ return c_v256_shr_u8(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_u16(c_v256 a, unsigned int n) {
+ return c_v256_shr_u16(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_u32(c_v256 a, unsigned int n) {
+ return c_v256_shr_u32(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_u64(c_v256 a, unsigned int n) {
+ return c_v256_shr_u64(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_s8(c_v256 a, unsigned int n) {
+ return c_v256_shr_s8(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_s16(c_v256 a, unsigned int n) {
+ return c_v256_shr_s16(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_s32(c_v256 a, unsigned int n) {
+ return c_v256_shr_s32(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_s64(c_v256 a, unsigned int n) {
+ return c_v256_shr_s64(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_word(c_v256 a, const unsigned int n) {
+ return c_v256_shr_n_byte(a, 2 * n);
+}
+SIMD_INLINE c_v256 c_v256_shl_n_word(c_v256 a, const unsigned int n) {
+ return c_v256_shl_n_byte(a, 2 * n);
+}
+
+typedef uint32_t c_sad256_internal_u16;
+
+SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16_init(void) { return 0; }
+
+/* Implementation dependent return value. Result must be finalised with
+ v256_sad_u16_sum(). */
+SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16(c_sad256_internal_u16 s,
+ c_v256 a, c_v256 b) {
+ int c;
+ for (c = 0; c < 16; c++)
+ s += a.u16[c] > b.u16[c] ? a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c];
+ return s;
+}
+
+SIMD_INLINE uint32_t c_v256_sad_u16_sum(c_sad256_internal_u16 s) { return s; }
+
+typedef uint64_t c_ssd256_internal_s16;
+
+SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16_init(void) { return 0; }
+
+/* Implementation dependent return value. Result must be finalised with
+ * v256_ssd_s16_sum(). */
+SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16(c_ssd256_internal_s16 s,
+ c_v256 a, c_v256 b) {
+ int c;
+ for (c = 0; c < 16; c++)
+ s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) *
+ (int32_t)(int16_t)(a.s16[c] - b.s16[c]);
+ return s;
+}
+
+SIMD_INLINE uint64_t c_v256_ssd_s16_sum(c_ssd256_internal_s16 s) { return s; }
+
+#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_C_H_
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h
new file mode 100644
index 0000000000..493130df83
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_v128.h
@@ -0,0 +1,806 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_
+#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_
+
+#include "config/aom_config.h"
+
+#if HAVE_NEON
+#error "Do not use this file for Neon"
+#endif
+
+#if HAVE_SSE2
+#include "aom_dsp/simd/v128_intrinsics_x86.h"
+#else
+#include "aom_dsp/simd/v128_intrinsics.h"
+#endif
+
+typedef struct {
+ v128 val[2];
+} v256;
+
+SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.val[0]); }
+
+SIMD_INLINE v64 v256_low_v64(v256 a) { return v128_low_v64(a.val[0]); }
+
+SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); }
+
+SIMD_INLINE v128 v256_low_v128(v256 a) { return a.val[0]; }
+
+SIMD_INLINE v128 v256_high_v128(v256 a) { return a.val[1]; }
+
+SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
+ v256 t;
+ t.val[1] = hi;
+ t.val[0] = lo;
+ return t;
+}
+
+SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
+ return v256_from_v128(v128_from_64(a, b), v128_from_64(c, d));
+}
+
+SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
+ return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d));
+}
+
+SIMD_INLINE v256 v256_load_unaligned(const void *p) {
+ return v256_from_v128(v128_load_unaligned((uint8_t *)p + 16),
+ v128_load_unaligned(p));
+}
+
+SIMD_INLINE v256 v256_load_aligned(const void *p) {
+ return v256_from_v128(v128_load_aligned((uint8_t *)p + 16),
+ v128_load_aligned(p));
+}
+
+SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
+ v128_store_unaligned(p, a.val[0]);
+ v128_store_unaligned((uint8_t *)p + 16, a.val[1]);
+}
+
+SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
+ v128_store_aligned(p, a.val[0]);
+ v128_store_aligned((uint8_t *)p + 16, a.val[1]);
+}
+
+SIMD_INLINE v256 v256_zero(void) {
+ return v256_from_v128(v128_zero(), v128_zero());
+}
+
+SIMD_INLINE v256 v256_dup_8(uint8_t x) {
+ v128 t = v128_dup_8(x);
+ return v256_from_v128(t, t);
+}
+
+SIMD_INLINE v256 v256_dup_16(uint16_t x) {
+ v128 t = v128_dup_16(x);
+ return v256_from_v128(t, t);
+}
+
+SIMD_INLINE v256 v256_dup_32(uint32_t x) {
+ v128 t = v128_dup_32(x);
+ return v256_from_v128(t, t);
+}
+
+SIMD_INLINE v256 v256_dup_64(uint64_t x) {
+ v128 t = v128_dup_64(x);
+ return v256_from_v128(t, t);
+}
+
+SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
+ return v128_dotp_su8(a.val[1], b.val[1]) + v128_dotp_su8(a.val[0], b.val[0]);
+}
+
+SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
+ return v128_dotp_s16(a.val[1], b.val[1]) + v128_dotp_s16(a.val[0], b.val[0]);
+}
+
+SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
+ return v128_dotp_s32(a.val[1], b.val[1]) + v128_dotp_s32(a.val[0], b.val[0]);
+}
+
+SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
+ return v128_hadd_u8(a.val[1]) + v128_hadd_u8(a.val[0]);
+}
+
+typedef struct {
+ sad128_internal val[2];
+} sad256_internal;
+
+SIMD_INLINE sad256_internal v256_sad_u8_init(void) {
+ sad256_internal t;
+ t.val[1] = v128_sad_u8_init();
+ t.val[0] = v128_sad_u8_init();
+ return t;
+}
+
+/* Implementation dependent return value. Result must be finalised with
+ v256_sad_u8_sum().
+ The result for more than 16 v256_sad_u8() calls is undefined. */
+SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
+ sad256_internal t;
+ t.val[1] = v128_sad_u8(s.val[1], a.val[1], b.val[1]);
+ t.val[0] = v128_sad_u8(s.val[0], a.val[0], b.val[0]);
+ return t;
+}
+
+SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
+ return v128_sad_u8_sum(s.val[1]) + v128_sad_u8_sum(s.val[0]);
+}
+
+typedef struct {
+ ssd128_internal val[2];
+} ssd256_internal;
+
+SIMD_INLINE ssd256_internal v256_ssd_u8_init(void) {
+ ssd256_internal t;
+ t.val[1] = v128_ssd_u8_init();
+ t.val[0] = v128_ssd_u8_init();
+ return t;
+}
+
+/* Implementation dependent return value. Result must be finalised with
+ * v256_ssd_u8_sum(). */
+SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
+ ssd256_internal t;
+ t.val[1] = v128_ssd_u8(s.val[1], a.val[1], b.val[1]);
+ t.val[0] = v128_ssd_u8(s.val[0], a.val[0], b.val[0]);
+ return t;
+}
+
+SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
+ return v128_ssd_u8_sum(s.val[1]) + v128_ssd_u8_sum(s.val[0]);
+}
+
+SIMD_INLINE v256 v256_or(v256 a, v256 b) {
+ return v256_from_v128(v128_or(a.val[1], b.val[1]),
+ v128_or(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_xor(v256 a, v256 b) {
+ return v256_from_v128(v128_xor(a.val[1], b.val[1]),
+ v128_xor(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_and(v256 a, v256 b) {
+ return v256_from_v128(v128_and(a.val[1], b.val[1]),
+ v128_and(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_andn(v256 a, v256 b) {
+ return v256_from_v128(v128_andn(a.val[1], b.val[1]),
+ v128_andn(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_add_8(v256 a, v256 b) {
+ return v256_from_v128(v128_add_8(a.val[1], b.val[1]),
+ v128_add_8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_add_16(v256 a, v256 b) {
+ return v256_from_v128(v128_add_16(a.val[1], b.val[1]),
+ v128_add_16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_sadd_s8(a.val[1], b.val[1]),
+ v128_sadd_s8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_sadd_u8(a.val[1], b.val[1]),
+ v128_sadd_u8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_sadd_s16(a.val[1], b.val[1]),
+ v128_sadd_s16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_add_32(v256 a, v256 b) {
+ return v256_from_v128(v128_add_32(a.val[1], b.val[1]),
+ v128_add_32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_add_64(v256 a, v256 b) {
+ return v256_from_v128(v128_add_64(a.val[1], b.val[1]),
+ v128_add_64(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_padd_u8(v256 a) {
+ return v256_from_v128(v128_padd_u8(a.val[1]), v128_padd_u8(a.val[0]));
+}
+
+SIMD_INLINE v256 v256_padd_s16(v256 a) {
+ return v256_from_v128(v128_padd_s16(a.val[1]), v128_padd_s16(a.val[0]));
+}
+
+SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) {
+ return v256_from_v128(v128_sub_8(a.val[1], b.val[1]),
+ v128_sub_8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_ssub_u8(a.val[1], b.val[1]),
+ v128_ssub_u8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_ssub_s8(a.val[1], b.val[1]),
+ v128_ssub_s8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) {
+ return v256_from_v128(v128_sub_16(a.val[1], b.val[1]),
+ v128_sub_16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_ssub_s16(a.val[1], b.val[1]),
+ v128_ssub_s16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) {
+ return v256_from_v128(v128_ssub_u16(a.val[1], b.val[1]),
+ v128_ssub_u16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) {
+ return v256_from_v128(v128_sub_32(a.val[1], b.val[1]),
+ v128_sub_32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) {
+ return v256_from_v128(v128_sub_64(a.val[1], b.val[1]),
+ v128_sub_64(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_abs_s16(v256 a) {
+ return v256_from_v128(v128_abs_s16(a.val[1]), v128_abs_s16(a.val[0]));
+}
+
+SIMD_INLINE v256 v256_abs_s8(v256 a) {
+ return v256_from_v128(v128_abs_s8(a.val[1]), v128_abs_s8(a.val[0]));
+}
+
+SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) {
+ v128 lo_bits = v128_mullo_s16(a, b);
+ v128 hi_bits = v128_mulhi_s16(a, b);
+ return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits),
+ v128_ziplo_16(hi_bits, lo_bits));
+}
+
+SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_mullo_s16(a.val[1], b.val[1]),
+ v128_mullo_s16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_mulhi_s16(a.val[1], b.val[1]),
+ v128_mulhi_s16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
+ return v256_from_v128(v128_mullo_s32(a.val[1], b.val[1]),
+ v128_mullo_s32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_madd_s16(a.val[1], b.val[1]),
+ v128_madd_s16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) {
+ return v256_from_v128(v128_madd_us8(a.val[1], b.val[1]),
+ v128_madd_us8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_avg_u8(a.val[1], b.val[1]),
+ v128_avg_u8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_rdavg_u8(a.val[1], b.val[1]),
+ v128_rdavg_u8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) {
+ return v256_from_v128(v128_rdavg_u16(a.val[1], b.val[1]),
+ v128_rdavg_u16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) {
+ return v256_from_v128(v128_avg_u16(a.val[1], b.val[1]),
+ v128_avg_u16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_min_u8(a.val[1], b.val[1]),
+ v128_min_u8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_max_u8(a.val[1], b.val[1]),
+ v128_max_u8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_min_s8(a.val[1], b.val[1]),
+ v128_min_s8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE uint32_t v256_movemask_8(v256 a) {
+ return (v128_movemask_8(v256_high_v128(a)) << 16) |
+ v128_movemask_8(v256_low_v128(a));
+}
+
+SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
+ return v256_from_v128(v128_blend_8(a.val[1], b.val[1], c.val[1]),
+ v128_blend_8(a.val[0], b.val[0], c.val[0]));
+}
+
+SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_max_s8(a.val[1], b.val[1]),
+ v128_max_s8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_min_s16(a.val[1], b.val[1]),
+ v128_min_s16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_max_s16(a.val[1], b.val[1]),
+ v128_max_s16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) {
+ return v256_from_v128(v128_min_s32(a.val[1], b.val[1]),
+ v128_min_s32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) {
+ return v256_from_v128(v128_max_s32(a.val[1], b.val[1]),
+ v128_max_s32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_8(a.val[0], b.val[0]),
+ v128_ziplo_8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_8(a.val[1], b.val[1]),
+ v128_ziplo_8(a.val[1], b.val[1]));
+}
+
+SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_16(a.val[0], b.val[0]),
+ v128_ziplo_16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_16(a.val[1], b.val[1]),
+ v128_ziplo_16(a.val[1], b.val[1]));
+}
+
+SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_32(a.val[0], b.val[0]),
+ v128_ziplo_32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_32(a.val[1], b.val[1]),
+ v128_ziplo_32(a.val[1], b.val[1]));
+}
+
+SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_64(a.val[0], b.val[0]),
+ v128_ziplo_64(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
+ return v256_from_v128(v128_ziphi_64(a.val[1], b.val[1]),
+ v128_ziplo_64(a.val[1], b.val[1]));
+}
+
+SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
+ return v256_from_v128(a.val[0], b.val[0]);
+}
+
+SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
+ return v256_from_v128(a.val[1], b.val[1]);
+}
+
+SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
+ return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b));
+}
+
+SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) {
+ return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b));
+}
+
+SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) {
+ return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b));
+}
+
+SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
+ return v256_from_v128(v128_unziplo_8(a.val[1], a.val[0]),
+ v128_unziplo_8(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
+ return v256_from_v128(v128_unziphi_8(a.val[1], a.val[0]),
+ v128_unziphi_8(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
+ return v256_from_v128(v128_unziplo_16(a.val[1], a.val[0]),
+ v128_unziplo_16(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
+ return v256_from_v128(v128_unziphi_16(a.val[1], a.val[0]),
+ v128_unziphi_16(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
+ return v256_from_v128(v128_unziplo_32(a.val[1], a.val[0]),
+ v128_unziplo_32(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
+ return v256_from_v128(v128_unziphi_32(a.val[1], a.val[0]),
+ v128_unziphi_32(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) {
+#if HAVE_SSE2
+ return v256_from_v128(
+ _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]),
+ _mm_castsi128_pd(a.val[1]), 0)),
+ _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]),
+ _mm_castsi128_pd(b.val[1]), 0)));
+#else
+ return v256_from_v64(v128_low_v64(a.val[1]), v128_low_v64(a.val[0]),
+ v128_low_v64(b.val[1]), v128_low_v64(b.val[0]));
+#endif
+}
+
+SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) {
+#if HAVE_SSE2
+ return v256_from_v128(
+ _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]),
+ _mm_castsi128_pd(a.val[1]), 3)),
+ _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]),
+ _mm_castsi128_pd(b.val[1]), 3)));
+#else
+ return v256_from_v64(v128_high_v64(a.val[1]), v128_high_v64(a.val[0]),
+ v128_high_v64(b.val[1]), v128_high_v64(b.val[0]));
+#endif
+}
+
+SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
+ return v256_from_v128(v128_unpackhi_u8_s16(a), v128_unpacklo_u8_s16(a));
+}
+
+SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
+ return v256_from_v128(v128_unpackhi_u8_s16(a.val[0]),
+ v128_unpacklo_u8_s16(a.val[0]));
+}
+
+SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
+ return v256_from_v128(v128_unpackhi_u8_s16(a.val[1]),
+ v128_unpacklo_u8_s16(a.val[1]));
+}
+
+SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
+ return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a));
+}
+
+SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
+ return v256_from_v128(v128_unpackhi_s8_s16(a.val[0]),
+ v128_unpacklo_s8_s16(a.val[0]));
+}
+
+SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
+ return v256_from_v128(v128_unpackhi_s8_s16(a.val[1]),
+ v128_unpacklo_s8_s16(a.val[1]));
+}
+
+SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_pack_s32_s16(a.val[1], a.val[0]),
+ v128_pack_s32_s16(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) {
+ return v256_from_v128(v128_pack_s32_u16(a.val[1], a.val[0]),
+ v128_pack_s32_u16(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
+ return v256_from_v128(v128_pack_s16_u8(a.val[1], a.val[0]),
+ v128_pack_s16_u8(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_pack_s16_s8(a.val[1], a.val[0]),
+ v128_pack_s16_s8(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
+ return v256_from_v128(v128_unpackhi_u16_s32(a), v128_unpacklo_u16_s32(a));
+}
+
+SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
+ return v256_from_v128(v128_unpackhi_s16_s32(a), v128_unpacklo_s16_s32(a));
+}
+
+SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
+ return v256_from_v128(v128_unpackhi_u16_s32(a.val[0]),
+ v128_unpacklo_u16_s32(a.val[0]));
+}
+
+SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
+ return v256_from_v128(v128_unpackhi_s16_s32(a.val[0]),
+ v128_unpacklo_s16_s32(a.val[0]));
+}
+
+SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
+ return v256_from_v128(v128_unpackhi_u16_s32(a.val[1]),
+ v128_unpacklo_u16_s32(a.val[1]));
+}
+
+SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
+ return v256_from_v128(v128_unpackhi_s16_s32(a.val[1]),
+ v128_unpacklo_s16_s32(a.val[1]));
+}
+
+SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_cmpgt_s8(a.val[1], b.val[1]),
+ v128_cmpgt_s8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
+ return v256_from_v128(v128_cmplt_s8(a.val[1], b.val[1]),
+ v128_cmplt_s8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
+ return v256_from_v128(v128_cmpeq_8(a.val[1], b.val[1]),
+ v128_cmpeq_8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_cmpgt_s16(a.val[1], b.val[1]),
+ v128_cmpgt_s16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
+ return v256_from_v128(v128_cmplt_s16(a.val[1], b.val[1]),
+ v128_cmplt_s16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
+ return v256_from_v128(v128_cmpeq_16(a.val[1], b.val[1]),
+ v128_cmpeq_16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) {
+ return v256_from_v128(v128_cmpgt_s32(a.val[1], b.val[1]),
+ v128_cmpgt_s32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) {
+ return v256_from_v128(v128_cmplt_s32(a.val[1], b.val[1]),
+ v128_cmplt_s32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) {
+ return v256_from_v128(v128_cmpeq_32(a.val[1], b.val[1]),
+ v128_cmpeq_32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_shuffle_8(v256 x, v256 pattern) {
+ v128 c16 = v128_dup_8(16);
+ v128 maskhi = v128_cmplt_s8(pattern.val[1], c16);
+ v128 masklo = v128_cmplt_s8(pattern.val[0], c16);
+ return v256_from_v128(
+ v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c16)),
+ v128_shuffle_8(x.val[0], pattern.val[1]), maskhi),
+ v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c16)),
+ v128_shuffle_8(x.val[0], pattern.val[0]), masklo));
+}
+
+SIMD_INLINE v256 v256_wideshuffle_8(v256 x, v256 y, v256 pattern) {
+ v128 c16 = v128_dup_8(16);
+ v128 c32 = v128_dup_8(32);
+ v128 c48 = v128_dup_8(48);
+ v128 maskhi16 = v128_cmpgt_s8(c16, pattern.val[1]);
+ v128 masklo16 = v128_cmpgt_s8(c16, pattern.val[0]);
+ v128 maskhi48 = v128_cmpgt_s8(c48, pattern.val[1]);
+ v128 masklo48 = v128_cmpgt_s8(c48, pattern.val[0]);
+ v256 r1 = v256_from_v128(
+ v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c48)),
+ v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[1], c32)),
+ maskhi48),
+ v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c48)),
+ v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[0], c32)),
+ masklo48));
+ v256 r2 = v256_from_v128(
+ v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[1], c16)),
+ v128_shuffle_8(y.val[0], pattern.val[1]), maskhi16),
+ v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[0], c16)),
+ v128_shuffle_8(y.val[0], pattern.val[0]), masklo16));
+ return v256_blend_8(r1, r2, v256_cmpgt_s8(v256_from_v128(c32, c32), pattern));
+}
+
+SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
+ return v256_from_v128(
+ v128_shuffle_8(v256_high_v128(a), v256_high_v128(pattern)),
+ v128_shuffle_8(v256_low_v128(a), v256_low_v128(pattern)));
+}
+
+SIMD_INLINE v256 v256_shl_8(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shl_8(a.val[1], c), v128_shl_8(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shr_u8(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shr_u8(a.val[1], c), v128_shr_u8(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shr_s8(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shr_s8(a.val[1], c), v128_shr_s8(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shl_16(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shl_16(a.val[1], c), v128_shl_16(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shr_u16(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shr_u16(a.val[1], c), v128_shr_u16(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shr_s16(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shr_s16(a.val[1], c), v128_shr_s16(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shl_32(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shl_32(a.val[1], c), v128_shl_32(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shr_u32(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shr_u32(a.val[1], c), v128_shr_u32(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shr_s32(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shr_s32(a.val[1], c), v128_shr_s32(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shl_64(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shl_64(a.val[1], c), v128_shl_64(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shr_u64(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shr_u64(a.val[1], c), v128_shr_u64(a.val[0], c));
+}
+
+SIMD_INLINE v256 v256_shr_s64(v256 a, const unsigned int c) {
+ return v256_from_v128(v128_shr_s64(a.val[1], c), v128_shr_s64(a.val[0], c));
+}
+
+/* These intrinsics require immediate values, so we must use #defines
+ to enforce that. */
+#define v256_shl_n_byte(a, n) \
+ ((n) < 16 ? v256_from_v128(v128_or(v128_shl_n_byte(a.val[1], n), \
+ v128_shr_n_byte(a.val[0], 16 - (n))), \
+ v128_shl_n_byte(a.val[0], (n))) \
+ : v256_from_v128( \
+ (n) > 16 ? v128_shl_n_byte(a.val[0], (n)-16) : a.val[0], \
+ v128_zero()))
+
+#define v256_shr_n_byte(a, n) \
+ (n == 0 \
+ ? a \
+ : ((n) < 16 \
+ ? v256_from_v128(v128_shr_n_byte(a.val[1], n), \
+ v128_or(v128_shr_n_byte(a.val[0], n), \
+ v128_shl_n_byte(a.val[1], 16 - (n)))) \
+ : v256_from_v128( \
+ v128_zero(), \
+ (n) > 16 ? v128_shr_n_byte(a.val[1], (n)-16) : a.val[1])))
+
+#define v256_align(a, b, c) \
+ ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b)
+
+#define v256_shl_n_8(a, n) \
+ v256_from_v128(v128_shl_n_8(a.val[1], n), v128_shl_n_8(a.val[0], n))
+#define v256_shl_n_16(a, n) \
+ v256_from_v128(v128_shl_n_16(a.val[1], n), v128_shl_n_16(a.val[0], n))
+#define v256_shl_n_32(a, n) \
+ v256_from_v128(v128_shl_n_32(a.val[1], n), v128_shl_n_32(a.val[0], n))
+#define v256_shl_n_64(a, n) \
+ v256_from_v128(v128_shl_n_64(a.val[1], n), v128_shl_n_64(a.val[0], n))
+#define v256_shr_n_u8(a, n) \
+ v256_from_v128(v128_shr_n_u8(a.val[1], n), v128_shr_n_u8(a.val[0], n))
+#define v256_shr_n_u16(a, n) \
+ v256_from_v128(v128_shr_n_u16(a.val[1], n), v128_shr_n_u16(a.val[0], n))
+#define v256_shr_n_u32(a, n) \
+ v256_from_v128(v128_shr_n_u32(a.val[1], n), v128_shr_n_u32(a.val[0], n))
+#define v256_shr_n_u64(a, n) \
+ v256_from_v128(v128_shr_n_u64(a.val[1], n), v128_shr_n_u64(a.val[0], n))
+#define v256_shr_n_s8(a, n) \
+ v256_from_v128(v128_shr_n_s8(a.val[1], n), v128_shr_n_s8(a.val[0], n))
+#define v256_shr_n_s16(a, n) \
+ v256_from_v128(v128_shr_n_s16(a.val[1], n), v128_shr_n_s16(a.val[0], n))
+#define v256_shr_n_s32(a, n) \
+ v256_from_v128(v128_shr_n_s32(a.val[1], n), v128_shr_n_s32(a.val[0], n))
+#define v256_shr_n_s64(a, n) \
+ v256_from_v128(v128_shr_n_s64(a.val[1], n), v128_shr_n_s64(a.val[0], n))
+
+#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n))
+#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n))
+
+typedef struct {
+ sad128_internal_u16 val[2];
+} sad256_internal_u16;
+
+SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) {
+ sad256_internal_u16 t;
+ t.val[1] = v128_sad_u16_init();
+ t.val[0] = v128_sad_u16_init();
+ return t;
+}
+
+/* Implementation dependent return value. Result must be finalised with
+ v256_sad_u16_sum().
+ The result for more than 16 v256_sad_u16() calls is undefined. */
+SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
+ v256 b) {
+ sad256_internal_u16 t;
+ t.val[1] = v128_sad_u16(s.val[1], a.val[1], b.val[1]);
+ t.val[0] = v128_sad_u16(s.val[0], a.val[0], b.val[0]);
+ return t;
+}
+
+SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) {
+ return v128_sad_u16_sum(s.val[1]) + v128_sad_u16_sum(s.val[0]);
+}
+
+typedef struct {
+ ssd128_internal_s16 val[2];
+} ssd256_internal_s16;
+
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init(void) {
+ ssd256_internal_s16 t;
+ t.val[1] = v128_ssd_s16_init();
+ t.val[0] = v128_ssd_s16_init();
+ return t;
+}
+
+/* Implementation dependent return value. Result must be finalised with
+ * v256_ssd_s16_sum(). */
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
+ v256 b) {
+ ssd256_internal_s16 t;
+ t.val[1] = v128_ssd_s16(s.val[1], a.val[1], b.val[1]);
+ t.val[0] = v128_ssd_s16(s.val[0], a.val[0], b.val[0]);
+ return t;
+}
+
+SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
+ return v128_ssd_s16_sum(s.val[1]) + v128_ssd_s16_sum(s.val[0]);
+}
+
+#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_V128_H_
diff --git a/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h
new file mode 100644
index 0000000000..894ddee167
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v256_intrinsics_x86.h
@@ -0,0 +1,754 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_
+#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_
+
+#if !defined(__AVX2__)
+
+#include "aom_dsp/simd/v256_intrinsics_v128.h"
+
+#else
+
+// The __m256i type seems to cause problems for g++'s name mangling prior to
+// version 5, but adding -fabi-version=0 fixes this.
+#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 5 && \
+ defined(__AVX2__) && defined(__cplusplus)
+#pragma GCC optimize "-fabi-version=0"
+#endif
+
+#include <immintrin.h>
+
+#include "aom_dsp/simd/v128_intrinsics_x86.h"
+
+typedef __m256i v256;
+
+SIMD_INLINE uint32_t v256_low_u32(v256 a) {
+ return (uint32_t)_mm_cvtsi128_si32(_mm256_extracti128_si256(a, 0));
+}
+
+SIMD_INLINE v64 v256_low_v64(v256 a) {
+ return _mm_unpacklo_epi64(_mm256_extracti128_si256(a, 0), v64_zero());
+}
+
+SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); }
+
+SIMD_INLINE v128 v256_low_v128(v256 a) { return _mm256_castsi256_si128(a); }
+
+SIMD_INLINE v128 v256_high_v128(v256 a) {
+ return _mm256_extracti128_si256(a, 1);
+}
+
+SIMD_INLINE v256 v256_from_v128(v128 a, v128 b) {
+ // gcc seems to be missing _mm256_set_m128i()
+ return _mm256_inserti128_si256(_mm256_castsi128_si256(b), a, 1);
+}
+
+SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
+ return v256_from_v128(v128_from_v64(a, b), v128_from_v64(c, d));
+}
+
+SIMD_INLINE v256 v256_from_64(uint64_t a, uint64_t b, uint64_t c, uint64_t d) {
+ return _mm256_set_epi64x((int64_t)a, (int64_t)b, (int64_t)c, (int64_t)d);
+}
+
+SIMD_INLINE v256 v256_load_aligned(const void *p) {
+ return _mm256_load_si256((const __m256i *)p);
+}
+
+SIMD_INLINE v256 v256_load_unaligned(const void *p) {
+ return _mm256_loadu_si256((const __m256i *)p);
+}
+
+SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
+ _mm256_store_si256((__m256i *)p, a);
+}
+
+SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
+ _mm256_storeu_si256((__m256i *)p, a);
+}
+
+SIMD_INLINE v256 v256_zero(void) { return _mm256_setzero_si256(); }
+
+SIMD_INLINE v256 v256_dup_8(uint8_t x) { return _mm256_set1_epi8((char)x); }
+
+SIMD_INLINE v256 v256_dup_16(uint16_t x) { return _mm256_set1_epi16((short)x); }
+
+SIMD_INLINE v256 v256_dup_32(uint32_t x) { return _mm256_set1_epi32((int)x); }
+
+SIMD_INLINE v256 v256_dup_64(uint64_t x) {
+ return _mm256_set1_epi64x((int64_t)x);
+}
+
+SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return _mm256_add_epi8(a, b); }
+
+SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return _mm256_add_epi16(a, b); }
+
+SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { return _mm256_adds_epu8(a, b); }
+
+SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { return _mm256_adds_epi8(a, b); }
+
+SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
+ return _mm256_adds_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return _mm256_add_epi32(a, b); }
+
+SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { return _mm256_add_epi64(a, b); }
+
+SIMD_INLINE v256 v256_padd_u8(v256 a) {
+ return _mm256_maddubs_epi16(a, _mm256_set1_epi8(1));
+}
+
+SIMD_INLINE v256 v256_padd_s16(v256 a) {
+ return _mm256_madd_epi16(a, _mm256_set1_epi16(1));
+}
+
+SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return _mm256_sub_epi8(a, b); }
+
+SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return _mm256_subs_epu8(a, b); }
+
+SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) { return _mm256_subs_epi8(a, b); }
+
+SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) { return _mm256_sub_epi16(a, b); }
+
+SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
+ return _mm256_subs_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) {
+ return _mm256_subs_epu16(a, b);
+}
+
+SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return _mm256_sub_epi32(a, b); }
+
+SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { return _mm256_sub_epi64(a, b); }
+
+SIMD_INLINE v256 v256_abs_s16(v256 a) { return _mm256_abs_epi16(a); }
+
+SIMD_INLINE v256 v256_abs_s8(v256 a) { return _mm256_abs_epi8(a); }
+
+// AVX2 doesn't have direct intrinsics to zip/unzip the 8-, 16- and 32-bit
+// lanes of the lower or upper half of a 256-bit vector, because the
+// unpack/pack intrinsics treat the 256-bit input as two independent
+// 128-bit vectors.
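+// Permuting each operand with _MM_SHUFFLE(3, 1, 2, 0) first moves 64-bit
+// lanes 0 and 2 into the low 128-bit half and lanes 1 and 3 into the high
+// half, so the per-lane unpack then interleaves the true low (or high)
+// 16 bytes of the two 256-bit inputs.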
+SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
+ return _mm256_unpacklo_epi8(
+ _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+ _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
+}
+
+SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
+ return _mm256_unpackhi_epi8(
+ _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+ _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
+}
+
+SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
+ return _mm256_unpacklo_epi16(
+ _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+ _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
+}
+
+SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
+ return _mm256_unpackhi_epi16(
+ _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+ _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
+}
+
+SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
+ return _mm256_unpacklo_epi32(
+ _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+ _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
+}
+
+SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
+ return _mm256_unpackhi_epi32(
+ _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+ _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
+}
+
+SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
+ return _mm256_unpacklo_epi64(
+ _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+ _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
+}
+
+SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
+ return _mm256_unpackhi_epi64(
+ _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+ _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
+}
+
+SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
+ return _mm256_permute2x128_si256(a, b, 0x02);
+}
+
+SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
+ return _mm256_permute2x128_si256(a, b, 0x13);
+}
+
+SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
+ return v256_from_v128(v128_ziphi_8(a, b), v128_ziplo_8(a, b));
+}
+
+SIMD_INLINE v256 v256_zip_16(v128 a, v128 b) {
+ return v256_from_v128(v128_ziphi_16(a, b), v128_ziplo_16(a, b));
+}
+
+SIMD_INLINE v256 v256_zip_32(v128 a, v128 b) {
+ return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b));
+}
+
+SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
+ return _mm256_permute4x64_epi64(
+ _mm256_packs_epi16(_mm256_srai_epi16(b, 8), _mm256_srai_epi16(a, 8)),
+ _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
+ return v256_unziphi_8(_mm256_slli_si256(a, 1), _mm256_slli_si256(b, 1));
+}
+
+SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
+ return _mm256_permute4x64_epi64(
+ _mm256_packs_epi32(_mm256_srai_epi32(b, 16), _mm256_srai_epi32(a, 16)),
+ _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
+ return v256_unziphi_16(_mm256_slli_si256(a, 2), _mm256_slli_si256(b, 2));
+}
+
+SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
+ return _mm256_permute4x64_epi64(
+ _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b),
+ _mm256_castsi256_ps(a),
+ _MM_SHUFFLE(3, 1, 3, 1))),
+ _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
+ return _mm256_permute4x64_epi64(
+ _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b),
+ _mm256_castsi256_ps(a),
+ _MM_SHUFFLE(2, 0, 2, 0))),
+ _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) {
+ return _mm256_permute4x64_epi64(
+ _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(b),
+ _mm256_castsi256_pd(a), 15)),
+ _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) {
+ return _mm256_permute4x64_epi64(
+ _mm256_castpd_si256(
+ _mm256_shuffle_pd(_mm256_castsi256_pd(b), _mm256_castsi256_pd(a), 0)),
+ _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return _mm256_cvtepu8_epi16(a); }
+
+SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
+ return _mm256_unpacklo_epi8(
+ _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
+ _mm256_setzero_si256());
+}
+
+SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
+ return _mm256_unpackhi_epi8(
+ _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
+ _mm256_setzero_si256());
+}
+
+SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
+ return v256_from_v128(v128_unpackhi_s8_s16(a), v128_unpacklo_s8_s16(a));
+}
+
+SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
+ return _mm256_srai_epi16(
+ _mm256_unpacklo_epi8(
+ a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
+ 8);
+}
+
+SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
+ return _mm256_srai_epi16(
+ _mm256_unpackhi_epi8(
+ a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
+ 8);
+}
+
+SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
+ return _mm256_permute4x64_epi64(_mm256_packs_epi32(b, a),
+ _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) {
+ return _mm256_permute4x64_epi64(_mm256_packus_epi32(b, a),
+ _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
+ return _mm256_permute4x64_epi64(_mm256_packus_epi16(b, a),
+ _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
+ return _mm256_permute4x64_epi64(_mm256_packs_epi16(b, a),
+ _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
+ return _mm256_cvtepu16_epi32(a);
+}
+
+SIMD_INLINE v256 v256_unpack_s16_s32(v128 a) {
+ return _mm256_cvtepi16_epi32(a);
+}
+
+SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
+ return _mm256_unpacklo_epi16(
+ _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
+ _mm256_setzero_si256());
+}
+
+SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
+ return _mm256_srai_epi32(
+ _mm256_unpacklo_epi16(
+ a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
+ 16);
+}
+
+SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
+ return _mm256_unpackhi_epi16(
+ _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
+ _mm256_setzero_si256());
+}
+
+SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
+ return _mm256_srai_epi32(
+ _mm256_unpackhi_epi16(
+ a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
+ 16);
+}
+
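+// _mm256_shuffle_epi8 only indexes within each 128-bit lane, so a full
+// 32-byte table lookup is emulated by broadcasting each half of 'a' across
+// both lanes and blending the two shuffles on (pattern < 16).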
+SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
+ return _mm256_blendv_epi8(
+ _mm256_shuffle_epi8(
+ _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 1, 0, 1)), pattern),
+ _mm256_shuffle_epi8(
+ _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 0, 0, 0)), pattern),
+ _mm256_cmpgt_epi8(v256_dup_8(16), pattern));
+}
+
+SIMD_INLINE v256 v256_wideshuffle_8(v256 a, v256 b, v256 pattern) {
+ v256 c32 = v256_dup_8(32);
+ v256 p32 = v256_sub_8(pattern, c32);
+ v256 r1 = _mm256_blendv_epi8(
+ _mm256_shuffle_epi8(
+ _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 1, 0, 1)), p32),
+ _mm256_shuffle_epi8(
+ _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 0, 0, 0)), p32),
+ _mm256_cmpgt_epi8(v256_dup_8(48), pattern));
+ v256 r2 = _mm256_blendv_epi8(
+ _mm256_shuffle_epi8(
+ _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 3, 0, 3)), pattern),
+ _mm256_shuffle_epi8(
+ _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 2, 0, 2)), pattern),
+ _mm256_cmpgt_epi8(v256_dup_8(16), pattern));
+ return _mm256_blendv_epi8(r1, r2, _mm256_cmpgt_epi8(c32, pattern));
+}
+
+SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
+ return _mm256_shuffle_epi8(a, pattern);
+}
+
+SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
+ v256 t1 = _mm256_madd_epi16(v256_unpackhi_s8_s16(a), v256_unpackhi_u8_s16(b));
+ v256 t2 = _mm256_madd_epi16(v256_unpacklo_s8_s16(a), v256_unpacklo_u8_s16(b));
+ t1 = _mm256_add_epi32(t1, t2);
+ v128 t = _mm_add_epi32(_mm256_extracti128_si256(t1, 0),
+ _mm256_extracti128_si256(t1, 1));
+ t = _mm_add_epi32(t, _mm_srli_si128(t, 8));
+ t = _mm_add_epi32(t, _mm_srli_si128(t, 4));
+ return (int32_t)v128_low_u32(t);
+}
+
+SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
+ v256 r = _mm256_madd_epi16(a, b);
+#if defined(__x86_64__)
+ v128 t;
+ r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)),
+ _mm256_cvtepi32_epi64(v256_low_v128(r)));
+ t = v256_low_v128(_mm256_add_epi64(
+ r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1))));
+ return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8)));
+#else
+ v128 l = v256_low_v128(r);
+ v128 h = v256_high_v128(r);
+ return (int64_t)_mm_cvtsi128_si32(l) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) +
+ (int64_t)_mm_cvtsi128_si32(h) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12));
+#endif
+}
+
+SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
+ v256 r = _mm256_mullo_epi32(a, b);
+#if defined(__x86_64__)
+ v128 t;
+ r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)),
+ _mm256_cvtepi32_epi64(v256_low_v128(r)));
+ t = v256_low_v128(_mm256_add_epi64(
+ r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1))));
+ return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8)));
+#else
+ v128 l = v256_low_v128(r);
+ v128 h = v256_high_v128(r);
+ return (int64_t)_mm_cvtsi128_si32(l) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) +
+ (int64_t)_mm_cvtsi128_si32(h) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) +
+ (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12));
+#endif
+}
+
+SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
+ v256 t = _mm256_sad_epu8(a, _mm256_setzero_si256());
+ v128 lo = v256_low_v128(t);
+ v128 hi = v256_high_v128(t);
+ lo = v128_add_32(lo, hi);
+ return v64_low_u32(v128_low_v64(lo)) + v128_low_u32(v128_high_v64(lo));
+}
+
+typedef v256 sad256_internal;
+
+SIMD_INLINE sad256_internal v256_sad_u8_init(void) {
+ return _mm256_setzero_si256();
+}
+
+/* Implementation dependent return value. Result must be finalised with
+ v256_sad_u8_sum().
+ The result for more than 32 v256_sad_u8() calls is undefined. */
+SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
+ return _mm256_add_epi64(s, _mm256_sad_epu8(a, b));
+}
+
+SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
+ v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s));
+ return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t)));
+}
+
+typedef v256 ssd256_internal;
+
+SIMD_INLINE ssd256_internal v256_ssd_u8_init(void) {
+ return _mm256_setzero_si256();
+}
+
+/* Implementation dependent return value. Result must be finalised with
+ * v256_ssd_u8_sum(). */
+SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
+ v256 l = _mm256_sub_epi16(_mm256_unpacklo_epi8(a, _mm256_setzero_si256()),
+ _mm256_unpacklo_epi8(b, _mm256_setzero_si256()));
+ v256 h = _mm256_sub_epi16(_mm256_unpackhi_epi8(a, _mm256_setzero_si256()),
+ _mm256_unpackhi_epi8(b, _mm256_setzero_si256()));
+ v256 rl = _mm256_madd_epi16(l, l);
+ v256 rh = _mm256_madd_epi16(h, h);
+ v128 c = _mm_cvtsi32_si128(32);
+ rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 8));
+ rl = _mm256_add_epi32(rl, _mm256_srli_si256(rl, 4));
+ rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 8));
+ rh = _mm256_add_epi32(rh, _mm256_srli_si256(rh, 4));
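+  // After the horizontal adds, each 128-bit lane's sum of squares sits in the
+  // low 32 bits of rl and rh; shifting the unpacked 64-bit elements left and
+  // then right by 32 clears the stale upper halves before they are added to
+  // the 64-bit accumulators in s.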
+ return _mm256_add_epi64(
+ s,
+ _mm256_srl_epi64(_mm256_sll_epi64(_mm256_unpacklo_epi64(rl, rh), c), c));
+}
+
+SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
+ v256 t = _mm256_add_epi32(s, _mm256_unpackhi_epi64(s, s));
+ return v128_low_u32(_mm_add_epi32(v256_high_v128(t), v256_low_v128(t)));
+}
+
+SIMD_INLINE v256 v256_or(v256 a, v256 b) { return _mm256_or_si256(a, b); }
+
+SIMD_INLINE v256 v256_xor(v256 a, v256 b) { return _mm256_xor_si256(a, b); }
+
+SIMD_INLINE v256 v256_and(v256 a, v256 b) { return _mm256_and_si256(a, b); }
+
+SIMD_INLINE v256 v256_andn(v256 a, v256 b) { return _mm256_andnot_si256(b, a); }
+
+SIMD_INLINE v256 v256_mul_s16(v64 a, v64 b) {
+ v128 lo_bits = v128_mullo_s16(a, b);
+ v128 hi_bits = v128_mulhi_s16(a, b);
+ return v256_from_v128(v128_ziphi_16(hi_bits, lo_bits),
+ v128_ziplo_16(hi_bits, lo_bits));
+}
+
+SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
+ return _mm256_mullo_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
+ return _mm256_mulhi_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
+ return _mm256_mullo_epi32(a, b);
+}
+
+SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) {
+ return _mm256_madd_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) {
+ return _mm256_maddubs_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return _mm256_avg_epu8(a, b); }
+
+SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
+ return _mm256_sub_epi8(
+ _mm256_avg_epu8(a, b),
+ _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_8(1)));
+}
+
+SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) {
+ return _mm256_sub_epi16(
+ _mm256_avg_epu16(a, b),
+ _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_16(1)));
+}
+
+SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return _mm256_avg_epu16(a, b); }
+
+SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return _mm256_min_epu8(a, b); }
+
+SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return _mm256_max_epu8(a, b); }
+
+SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return _mm256_min_epi8(a, b); }
+
+SIMD_INLINE uint32_t v256_movemask_8(v256 a) {
+ return (uint32_t)_mm256_movemask_epi8(a);
+}
+
+SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
+ return _mm256_blendv_epi8(a, b, c);
+}
+
+SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return _mm256_max_epi8(a, b); }
+
+SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return _mm256_min_epi16(a, b); }
+
+SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return _mm256_max_epi16(a, b); }
+
+SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { return _mm256_min_epi32(a, b); }
+
+SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { return _mm256_max_epi32(a, b); }
+
+SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
+ return _mm256_cmpgt_epi8(a, b);
+}
+
+SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
+ return _mm256_cmpgt_epi8(b, a);
+}
+
+SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
+ return _mm256_cmpeq_epi8(a, b);
+}
+
+SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
+ return _mm256_cmpgt_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
+ return _mm256_cmpgt_epi16(b, a);
+}
+
+SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
+ return _mm256_cmpeq_epi16(a, b);
+}
+
+SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) {
+ return _mm256_cmpgt_epi32(a, b);
+}
+
+SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) {
+ return _mm256_cmpgt_epi32(b, a);
+}
+
+SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) {
+ return _mm256_cmpeq_epi32(a, b);
+}
+
+SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
+ return _mm256_and_si256(_mm256_set1_epi8((char)(0xff << c)),
+ _mm256_sll_epi16(a, _mm_cvtsi32_si128((int)c)));
+}
+
+SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) {
+ return _mm256_and_si256(_mm256_set1_epi8((char)(0xff >> c)),
+ _mm256_srl_epi16(a, _mm_cvtsi32_si128((int)c)));
+}
+
+SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) {
+ __m128i x = _mm_cvtsi32_si128((int)(c + 8));
+ return _mm256_packs_epi16(_mm256_sra_epi16(_mm256_unpacklo_epi8(a, a), x),
+ _mm256_sra_epi16(_mm256_unpackhi_epi8(a, a), x));
+}
+
+SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) {
+ return _mm256_sll_epi16(a, _mm_cvtsi32_si128((int)c));
+}
+
+SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) {
+ return _mm256_srl_epi16(a, _mm_cvtsi32_si128((int)c));
+}
+
+SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) {
+ return _mm256_sra_epi16(a, _mm_cvtsi32_si128((int)c));
+}
+
+SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) {
+ return _mm256_sll_epi32(a, _mm_cvtsi32_si128((int)c));
+}
+
+SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) {
+ return _mm256_srl_epi32(a, _mm_cvtsi32_si128((int)c));
+}
+
+SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
+ return _mm256_sra_epi32(a, _mm_cvtsi32_si128((int)c));
+}
+
+SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) {
+ return _mm256_sll_epi64(a, _mm_cvtsi32_si128((int)c));
+}
+
+SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) {
+ return _mm256_srl_epi64(a, _mm_cvtsi32_si128((int)c));
+}
+
+SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) {
+#if defined(__AVX512VL__)
+ return _mm256_sra_epi64(a, _mm_cvtsi32_si128((int)c));
+#else
+ return v256_from_v128(v128_shr_s64(v256_high_v128(a), c),
+ v128_shr_s64(v256_low_v128(a), c));
+#endif
+}
+
+/* These intrinsics require immediate values, so we must use #defines
+ to enforce that. */
+// _mm256_slli_si256 only shifts within 128-bit lanes, so it can't be used here
+#define v256_shl_n_byte(a, n) \
+ ((n) < 16 ? v256_from_v128( \
+ v128_align(v256_high_v128(a), v256_low_v128(a), 16 - (n)), \
+ v128_shl_n_byte(v256_low_v128(a), n)) \
+ : _mm256_inserti128_si256( \
+ _mm256_setzero_si256(), \
+ v128_shl_n_byte(v256_low_v128(a), (n)-16), 1))
+
+// _mm256_srli_si256 only shifts within 128-bit lanes, so it can't be used here
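+// For n < 16 the permute builds [a.hi, 0] so that _mm256_alignr_epi8 pulls
+// the bytes shifted in "from above" each 128-bit lane from the right source.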
+#define v256_shr_n_byte(a, n) \
+ ((n) < 16 \
+ ? _mm256_alignr_epi8( \
+ _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(2, 0, 0, 1)), a, n) \
+ : ((n) == 16 ? _mm256_permute2x128_si256(_mm256_setzero_si256(), a, 3) \
+ : _mm256_inserti128_si256( \
+ _mm256_setzero_si256(), \
+ v128_shr_n_byte(v256_high_v128(a), (n)-16), 0)))
+
+// _mm256_alignr_epi8 operates on two independent 128-bit lanes, so it can't be used here
+#define v256_align(a, b, c) \
+ ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b)
+
+#define v256_shl_n_8(a, c) \
+ _mm256_and_si256(_mm256_set1_epi8((char)(0xff << (c))), \
+ _mm256_slli_epi16(a, c))
+#define v256_shr_n_u8(a, c) \
+ _mm256_and_si256(_mm256_set1_epi8((char)(0xff >> (c))), \
+ _mm256_srli_epi16(a, c))
+#define v256_shr_n_s8(a, c) \
+ _mm256_packs_epi16(_mm256_srai_epi16(_mm256_unpacklo_epi8(a, a), (c) + 8), \
+ _mm256_srai_epi16(_mm256_unpackhi_epi8(a, a), (c) + 8))
+#define v256_shl_n_16(a, c) _mm256_slli_epi16(a, c)
+#define v256_shr_n_u16(a, c) _mm256_srli_epi16(a, c)
+#define v256_shr_n_s16(a, c) _mm256_srai_epi16(a, c)
+#define v256_shl_n_32(a, c) _mm256_slli_epi32(a, c)
+#define v256_shr_n_u32(a, c) _mm256_srli_epi32(a, c)
+#define v256_shr_n_s32(a, c) _mm256_srai_epi32(a, c)
+#define v256_shl_n_64(a, c) _mm256_slli_epi64(a, c)
+#define v256_shr_n_u64(a, c) _mm256_srli_epi64(a, c)
+#define v256_shr_n_s64(a, c) \
+  v256_shr_s64((a), (c))  // _mm256_srai_epi64 needs AVX-512VL; reuse v256_shr_s64()
+#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n))
+#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n))
+
+typedef v256 sad256_internal_u16;
+
+SIMD_INLINE sad256_internal_u16 v256_sad_u16_init(void) { return v256_zero(); }
+
+/* Implementation dependent return value. Result must be finalised with
+ * v256_sad_u16_sum(). */
+SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
+ v256 b) {
+#if defined(__SSE4_1__)
+ v256 t = v256_sub_16(_mm256_max_epu16(a, b), _mm256_min_epu16(a, b));
+#else
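+  // Fallback when __SSE4_1__ is not defined: bias both inputs by 0x8000 so a
+  // signed compare yields the unsigned a < b mask, then select per lane to
+  // compute max(a, b) - min(a, b), i.e. the unsigned absolute difference.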
+ v256 t = v256_cmplt_s16(v256_xor(a, v256_dup_16(32768)),
+ v256_xor(b, v256_dup_16(32768)));
+ t = v256_sub_16(v256_or(v256_and(b, t), v256_andn(a, t)),
+ v256_or(v256_and(a, t), v256_andn(b, t)));
+#endif
+ return v256_add_32(
+ s, v256_add_32(v256_unpackhi_u16_s32(t), v256_unpacklo_u16_s32(t)));
+}
+
+SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) {
+ v128 t = v128_add_32(v256_high_v128(s), v256_low_v128(s));
+ return v128_low_u32(t) + v128_low_u32(v128_shr_n_byte(t, 4)) +
+ v128_low_u32(v128_shr_n_byte(t, 8)) +
+ v128_low_u32(v128_shr_n_byte(t, 12));
+}
+
+typedef v256 ssd256_internal_s16;
+
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init(void) { return v256_zero(); }
+
+/* Implementation dependent return value. Result must be finalised with
+ * v256_ssd_s16_sum(). */
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
+ v256 b) {
+ v256 d = v256_sub_16(a, b);
+ d = v256_madd_s16(d, d);
+ return v256_add_64(s, v256_add_64(_mm256_unpackhi_epi32(d, v256_zero()),
+ _mm256_unpacklo_epi32(d, v256_zero())));
+}
+
+SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
+ v128 t = v128_add_64(v256_high_v128(s), v256_low_v128(s));
+ return v64_u64(v128_low_v64(t)) + v64_u64(v128_high_v64(t));
+}
+
+#endif
+
+#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_X86_H_
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics.h b/third_party/aom/aom_dsp/simd/v64_intrinsics.h
new file mode 100644
index 0000000000..7079949cd8
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v64_intrinsics.h
@@ -0,0 +1,234 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_
+#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "aom_dsp/simd/v64_intrinsics_c.h"
+
+/* Fallback to plain, unoptimised C. */
+
+typedef c_v64 v64;
+
+SIMD_INLINE uint32_t v64_low_u32(v64 a) { return c_v64_low_u32(a); }
+SIMD_INLINE uint32_t v64_high_u32(v64 a) { return c_v64_high_u32(a); }
+SIMD_INLINE int32_t v64_low_s32(v64 a) { return c_v64_low_s32(a); }
+SIMD_INLINE int32_t v64_high_s32(v64 a) { return c_v64_high_s32(a); }
+SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
+ return c_v64_from_32(x, y);
+}
+SIMD_INLINE v64 v64_from_64(uint64_t x) { return c_v64_from_64(x); }
+SIMD_INLINE uint64_t v64_u64(v64 x) { return c_v64_u64(x); }
+SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
+ return c_v64_from_16(a, b, c, d);
+}
+
+SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
+ return c_u32_load_unaligned(p);
+}
+SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
+ return c_u32_load_aligned(p);
+}
+SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
+ c_u32_store_unaligned(p, a);
+}
+SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
+ c_u32_store_aligned(p, a);
+}
+
+SIMD_INLINE v64 v64_load_unaligned(const void *p) {
+ return c_v64_load_unaligned(p);
+}
+SIMD_INLINE v64 v64_load_aligned(const void *p) {
+ return c_v64_load_aligned(p);
+}
+
+SIMD_INLINE void v64_store_unaligned(void *p, v64 a) {
+ c_v64_store_unaligned(p, a);
+}
+SIMD_INLINE void v64_store_aligned(void *p, v64 a) {
+ c_v64_store_aligned(p, a);
+}
+
+SIMD_INLINE v64 v64_align(v64 a, v64 b, unsigned int c) {
+ return c_v64_align(a, b, c);
+}
+
+SIMD_INLINE v64 v64_zero(void) { return c_v64_zero(); }
+SIMD_INLINE v64 v64_dup_8(uint8_t x) { return c_v64_dup_8(x); }
+SIMD_INLINE v64 v64_dup_16(uint16_t x) { return c_v64_dup_16(x); }
+SIMD_INLINE v64 v64_dup_32(uint32_t x) { return c_v64_dup_32(x); }
+
+SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return c_v64_add_8(a, b); }
+SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return c_v64_add_16(a, b); }
+SIMD_INLINE v64 v64_sadd_u8(v64 a, v64 b) { return c_v64_sadd_u8(a, b); }
+SIMD_INLINE v64 v64_sadd_s8(v64 a, v64 b) { return c_v64_sadd_s8(a, b); }
+SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return c_v64_sadd_s16(a, b); }
+SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return c_v64_add_32(a, b); }
+SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return c_v64_sub_8(a, b); }
+SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return c_v64_ssub_u8(a, b); }
+SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return c_v64_ssub_s8(a, b); }
+SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return c_v64_sub_16(a, b); }
+SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return c_v64_ssub_s16(a, b); }
+SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return c_v64_ssub_u16(a, b); }
+SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return c_v64_sub_32(a, b); }
+SIMD_INLINE v64 v64_abs_s16(v64 a) { return c_v64_abs_s16(a); }
+SIMD_INLINE v64 v64_abs_s8(v64 a) { return c_v64_abs_s8(a); }
+
+SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return c_v64_ziplo_8(a, b); }
+SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) { return c_v64_ziphi_8(a, b); }
+SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return c_v64_ziplo_16(a, b); }
+SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) { return c_v64_ziphi_16(a, b); }
+SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) { return c_v64_ziplo_32(a, b); }
+SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) { return c_v64_ziphi_32(a, b); }
+SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) { return c_v64_unziplo_8(a, b); }
+SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) { return c_v64_unziphi_8(a, b); }
+SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) { return c_v64_unziplo_16(a, b); }
+SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) { return c_v64_unziphi_16(a, b); }
+SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) { return c_v64_unpacklo_u8_s16(a); }
+SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) { return c_v64_unpackhi_u8_s16(a); }
+SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) { return c_v64_unpacklo_s8_s16(a); }
+SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) { return c_v64_unpackhi_s8_s16(a); }
+SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) {
+ return c_v64_pack_s32_s16(a, b);
+}
+SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) {
+ return c_v64_pack_s32_u16(a, b);
+}
+SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) {
+ return c_v64_pack_s16_u8(a, b);
+}
+SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) {
+ return c_v64_pack_s16_s8(a, b);
+}
+SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) {
+ return c_v64_unpacklo_u16_s32(a);
+}
+SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) {
+ return c_v64_unpacklo_s16_s32(a);
+}
+SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) {
+ return c_v64_unpackhi_u16_s32(a);
+}
+SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) {
+ return c_v64_unpackhi_s16_s32(a);
+}
+SIMD_INLINE v64 v64_shuffle_8(v64 a, v64 pattern) {
+ return c_v64_shuffle_8(a, pattern);
+}
+
+SIMD_INLINE c_sad64_internal v64_sad_u8_init(void) {
+ return c_v64_sad_u8_init();
+}
+SIMD_INLINE c_sad64_internal v64_sad_u8(c_sad64_internal s, v64 a, v64 b) {
+ return c_v64_sad_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v64_sad_u8_sum(c_sad64_internal s) {
+ return c_v64_sad_u8_sum(s);
+}
+SIMD_INLINE c_ssd64_internal v64_ssd_u8_init(void) {
+ return c_v64_ssd_u8_init();
+}
+SIMD_INLINE c_ssd64_internal v64_ssd_u8(c_ssd64_internal s, v64 a, v64 b) {
+ return c_v64_ssd_u8(s, a, b);
+}
+SIMD_INLINE uint32_t v64_ssd_u8_sum(c_ssd64_internal s) {
+ return c_v64_ssd_u8_sum(s);
+}
+SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) { return c_v64_dotp_su8(a, b); }
+SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) { return c_v64_dotp_s16(a, b); }
+SIMD_INLINE uint64_t v64_hadd_u8(v64 a) { return c_v64_hadd_u8(a); }
+SIMD_INLINE int64_t v64_hadd_s16(v64 a) { return c_v64_hadd_s16(a); }
+
+SIMD_INLINE v64 v64_or(v64 a, v64 b) { return c_v64_or(a, b); }
+SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return c_v64_xor(a, b); }
+SIMD_INLINE v64 v64_and(v64 a, v64 b) { return c_v64_and(a, b); }
+SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return c_v64_andn(a, b); }
+
+SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return c_v64_mullo_s16(a, b); }
+SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return c_v64_mulhi_s16(a, b); }
+SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) { return c_v64_mullo_s32(a, b); }
+SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return c_v64_madd_s16(a, b); }
+SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) { return c_v64_madd_us8(a, b); }
+
+SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return c_v64_avg_u8(a, b); }
+SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) { return c_v64_rdavg_u8(a, b); }
+SIMD_INLINE v64 v64_rdavg_u16(v64 a, v64 b) { return c_v64_rdavg_u16(a, b); }
+SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return c_v64_avg_u16(a, b); }
+SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return c_v64_min_u8(a, b); }
+SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return c_v64_max_u8(a, b); }
+SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) { return c_v64_min_s8(a, b); }
+SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) { return c_v64_max_s8(a, b); }
+SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return c_v64_min_s16(a, b); }
+SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return c_v64_max_s16(a, b); }
+
+SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return c_v64_cmpgt_s8(a, b); }
+SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return c_v64_cmplt_s8(a, b); }
+SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return c_v64_cmpeq_8(a, b); }
+SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return c_v64_cmpgt_s16(a, b); }
+SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return c_v64_cmplt_s16(a, b); }
+SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return c_v64_cmpeq_16(a, b); }
+
+SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int n) { return c_v64_shl_8(a, n); }
+SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int n) { return c_v64_shr_u8(a, n); }
+SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int n) { return c_v64_shr_s8(a, n); }
+SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int n) { return c_v64_shl_16(a, n); }
+SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int n) {
+ return c_v64_shr_u16(a, n);
+}
+SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int n) {
+ return c_v64_shr_s16(a, n);
+}
+SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int n) { return c_v64_shl_32(a, n); }
+SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int n) {
+ return c_v64_shr_u32(a, n);
+}
+SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int n) {
+ return c_v64_shr_s32(a, n);
+}
+SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int n) {
+ return c_v64_shr_n_byte(a, n);
+}
+SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int n) {
+ return c_v64_shl_n_byte(a, n);
+}
+SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) {
+ return c_v64_shl_n_8(a, c);
+}
+SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) {
+ return c_v64_shr_n_u8(a, c);
+}
+SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) {
+ return c_v64_shr_n_s8(a, c);
+}
+SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) {
+ return c_v64_shl_n_16(a, c);
+}
+SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) {
+ return c_v64_shr_n_u16(a, c);
+}
+SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) {
+ return c_v64_shr_n_s16(a, c);
+}
+SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) {
+ return c_v64_shl_n_32(a, c);
+}
+SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) {
+ return c_v64_shr_n_u32(a, c);
+}
+SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) {
+ return c_v64_shr_n_s32(a, c);
+}
+
+#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_H_
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h
new file mode 100644
index 0000000000..bfd6fe0710
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_c.h
@@ -0,0 +1,966 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_
+#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_
+
+/* Note: This implements the intrinsics in plain, unoptimised C.
+ Intended for reference, porting or debugging. */
+
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+
+typedef union {
+ uint8_t u8[8];
+ uint16_t u16[4];
+ uint32_t u32[2];
+ uint64_t u64;
+ int8_t s8[8];
+ int16_t s16[4];
+ int32_t s32[2];
+ int64_t s64;
+} c_v64;
+
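+// The 32-bit accessors index the union with !!CONFIG_BIG_ENDIAN so that "low"
+// always means the least significant half of u64: element 0 on little-endian
+// hosts and element 1 on big-endian hosts.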
+SIMD_INLINE uint32_t c_v64_low_u32(c_v64 a) {
+ return a.u32[!!CONFIG_BIG_ENDIAN];
+}
+
+SIMD_INLINE uint32_t c_v64_high_u32(c_v64 a) {
+ return a.u32[!CONFIG_BIG_ENDIAN];
+}
+
+SIMD_INLINE int32_t c_v64_low_s32(c_v64 a) {
+ return a.s32[!!CONFIG_BIG_ENDIAN];
+}
+
+SIMD_INLINE int32_t c_v64_high_s32(c_v64 a) {
+ return a.s32[!CONFIG_BIG_ENDIAN];
+}
+
+SIMD_INLINE c_v64 c_v64_from_32(uint32_t x, uint32_t y) {
+ c_v64 t;
+ t.u32[!CONFIG_BIG_ENDIAN] = x;
+ t.u32[!!CONFIG_BIG_ENDIAN] = y;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_from_64(uint64_t x) {
+ c_v64 t;
+ t.u64 = x;
+ return t;
+}
+
+SIMD_INLINE uint64_t c_v64_u64(c_v64 x) { return x.u64; }
+
+SIMD_INLINE c_v64 c_v64_from_16(uint16_t a, uint16_t b, uint16_t c,
+ uint16_t d) {
+ c_v64 t;
+ if (CONFIG_BIG_ENDIAN) {
+ t.u16[0] = a;
+ t.u16[1] = b;
+ t.u16[2] = c;
+ t.u16[3] = d;
+ } else {
+ t.u16[3] = a;
+ t.u16[2] = b;
+ t.u16[1] = c;
+ t.u16[0] = d;
+ }
+ return t;
+}
+
+SIMD_INLINE uint32_t c_u32_load_unaligned(const void *p) {
+ uint32_t t;
+ uint8_t *pp = (uint8_t *)p;
+ uint8_t *q = (uint8_t *)&t;
+ int c;
+ for (c = 0; c < 4; c++) q[c] = pp[c];
+ return t;
+}
+
+SIMD_INLINE void c_u32_store_unaligned(void *p, uint32_t a) {
+ uint8_t *pp = (uint8_t *)p;
+ uint8_t *q = (uint8_t *)&a;
+ int c;
+ for (c = 0; c < 4; c++) pp[c] = q[c];
+}
+
+SIMD_INLINE uint32_t c_u32_load_aligned(const void *p) {
+ if (SIMD_CHECK && (uintptr_t)p & 3) {
+ fprintf(stderr, "Error: Unaligned u32 load at %p\n", p);
+ abort();
+ }
+ return c_u32_load_unaligned(p);
+}
+
+SIMD_INLINE void c_u32_store_aligned(void *p, uint32_t a) {
+ if (SIMD_CHECK && (uintptr_t)p & 3) {
+ fprintf(stderr, "Error: Unaligned u32 store at %p\n", p);
+ abort();
+ }
+ c_u32_store_unaligned(p, a);
+}
+
+SIMD_INLINE c_v64 c_v64_load_unaligned(const void *p) {
+ c_v64 t;
+ uint8_t *pp = (uint8_t *)p;
+ uint8_t *q = (uint8_t *)&t;
+ int c;
+ for (c = 0; c < 8; c++) q[c] = pp[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_load_aligned(const void *p) {
+ if (SIMD_CHECK && (uintptr_t)p & 7) {
+ fprintf(stderr, "Error: Unaligned c_v64 load at %p\n", p);
+ abort();
+ }
+ return c_v64_load_unaligned(p);
+}
+
+SIMD_INLINE void c_v64_store_unaligned(void *p, c_v64 a) {
+ uint8_t *q = (uint8_t *)p;
+ uint8_t *r = (uint8_t *)&a;
+ int c;
+ for (c = 0; c < 8; c++) q[c] = r[c];
+}
+
+SIMD_INLINE void c_v64_store_aligned(void *p, c_v64 a) {
+ if (SIMD_CHECK && (uintptr_t)p & 7) {
+ fprintf(stderr, "Error: Unaligned c_v64 store at %p\n", p);
+ abort();
+ }
+ c_v64_store_unaligned(p, a);
+}
+
+SIMD_INLINE c_v64 c_v64_zero(void) {
+ c_v64 t;
+ t.u64 = 0;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_dup_8(uint8_t x) {
+ c_v64 t;
+ t.u8[0] = t.u8[1] = t.u8[2] = t.u8[3] = t.u8[4] = t.u8[5] = t.u8[6] =
+ t.u8[7] = x;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_dup_16(uint16_t x) {
+ c_v64 t;
+ t.u16[0] = t.u16[1] = t.u16[2] = t.u16[3] = x;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_dup_32(uint32_t x) {
+ c_v64 t;
+ t.u32[0] = t.u32[1] = x;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_add_8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.u8[c] = (uint8_t)(a.u8[c] + b.u8[c]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_add_16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++) t.u16[c] = (uint16_t)(a.u16[c] + b.u16[c]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_sadd_u8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++)
+ t.u8[c] = SIMD_CLAMP((int16_t)a.u8[c] + (int16_t)b.u8[c], 0, 255);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_sadd_s8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++)
+ t.s8[c] = SIMD_CLAMP((int16_t)a.s8[c] + (int16_t)b.s8[c], -128, 127);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_sadd_s16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++)
+ t.s16[c] = SIMD_CLAMP((int32_t)a.s16[c] + (int32_t)b.s16[c], -32768, 32767);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_add_32(c_v64 a, c_v64 b) {
+ c_v64 t;
+ t.u32[0] = (uint32_t)((uint64_t)a.u32[0] + b.u32[0]);
+ t.u32[1] = (uint32_t)((uint64_t)a.u32[1] + b.u32[1]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_sub_8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.u8[c] = (uint8_t)(a.u8[c] - b.u8[c]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_ssub_u8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] < b.u8[c] ? 0 : a.u8[c] - b.u8[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_ssub_s8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) {
+ int16_t d = (int16_t)a.s8[c] - (int16_t)b.s8[c];
+ t.s8[c] = SIMD_CLAMP(d, -128, 127);
+ }
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_sub_16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++) t.u16[c] = (uint16_t)(a.u16[c] - b.u16[c]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_ssub_s16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++)
+ t.s16[c] = SIMD_CLAMP((int32_t)a.s16[c] - (int32_t)b.s16[c], -32768, 32767);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_ssub_u16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++)
+ t.u16[c] =
+ (int32_t)a.u16[c] - (int32_t)b.u16[c] < 0 ? 0 : a.u16[c] - b.u16[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_sub_32(c_v64 a, c_v64 b) {
+ c_v64 t;
+ t.u32[0] = (uint32_t)((int64_t)a.u32[0] - b.u32[0]);
+ t.u32[1] = (uint32_t)((int64_t)a.u32[1] - b.u32[1]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_abs_s16(c_v64 a) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++)
+ t.u16[c] = (uint16_t)((int16_t)a.u16[c] > 0 ? a.u16[c] : -a.u16[c]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_abs_s8(c_v64 a) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++)
+ t.u8[c] = (uint8_t)((int8_t)a.u8[c] > 0 ? a.u8[c] : -a.u8[c]);
+ return t;
+}
+
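+// Helper for the zip operations: mode 0 interleaves the low halves of a and b,
+// mode 1 the high halves. The big-endian callers swap the operands and the
+// mode so that lane numbering stays consistent across endiannesses.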
+SIMD_INLINE c_v64 _c_v64_zip_8(c_v64 a, c_v64 b, int mode) {
+ c_v64 t;
+ if (mode) {
+ t.u8[7] = a.u8[7];
+ t.u8[6] = b.u8[7];
+ t.u8[5] = a.u8[6];
+ t.u8[4] = b.u8[6];
+ t.u8[3] = a.u8[5];
+ t.u8[2] = b.u8[5];
+ t.u8[1] = a.u8[4];
+ t.u8[0] = b.u8[4];
+ } else {
+ t.u8[7] = a.u8[3];
+ t.u8[6] = b.u8[3];
+ t.u8[5] = a.u8[2];
+ t.u8[4] = b.u8[2];
+ t.u8[3] = a.u8[1];
+ t.u8[2] = b.u8[1];
+ t.u8[1] = a.u8[0];
+ t.u8[0] = b.u8[0];
+ }
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_ziplo_8(c_v64 a, c_v64 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 1) : _c_v64_zip_8(a, b, 0);
+}
+
+SIMD_INLINE c_v64 c_v64_ziphi_8(c_v64 a, c_v64 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v64_zip_8(b, a, 0) : _c_v64_zip_8(a, b, 1);
+}
+
+SIMD_INLINE c_v64 _c_v64_zip_16(c_v64 a, c_v64 b, int mode) {
+ c_v64 t;
+ if (mode) {
+ t.u16[3] = a.u16[3];
+ t.u16[2] = b.u16[3];
+ t.u16[1] = a.u16[2];
+ t.u16[0] = b.u16[2];
+ } else {
+ t.u16[3] = a.u16[1];
+ t.u16[2] = b.u16[1];
+ t.u16[1] = a.u16[0];
+ t.u16[0] = b.u16[0];
+ }
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_ziplo_16(c_v64 a, c_v64 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 1) : _c_v64_zip_16(a, b, 0);
+}
+
+SIMD_INLINE c_v64 c_v64_ziphi_16(c_v64 a, c_v64 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v64_zip_16(b, a, 0) : _c_v64_zip_16(a, b, 1);
+}
+
+SIMD_INLINE c_v64 _c_v64_zip_32(c_v64 a, c_v64 b, int mode) {
+ c_v64 t;
+ if (mode) {
+ t.u32[1] = a.u32[1];
+ t.u32[0] = b.u32[1];
+ } else {
+ t.u32[1] = a.u32[0];
+ t.u32[0] = b.u32[0];
+ }
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_ziplo_32(c_v64 a, c_v64 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 1) : _c_v64_zip_32(a, b, 0);
+}
+
+SIMD_INLINE c_v64 c_v64_ziphi_32(c_v64 a, c_v64 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v64_zip_32(b, a, 0) : _c_v64_zip_32(a, b, 1);
+}
+
+SIMD_INLINE c_v64 _c_v64_unzip_8(c_v64 a, c_v64 b, int mode) {
+ c_v64 t;
+ if (mode) {
+ t.u8[7] = b.u8[7];
+ t.u8[6] = b.u8[5];
+ t.u8[5] = b.u8[3];
+ t.u8[4] = b.u8[1];
+ t.u8[3] = a.u8[7];
+ t.u8[2] = a.u8[5];
+ t.u8[1] = a.u8[3];
+ t.u8[0] = a.u8[1];
+ } else {
+ t.u8[7] = a.u8[6];
+ t.u8[6] = a.u8[4];
+ t.u8[5] = a.u8[2];
+ t.u8[4] = a.u8[0];
+ t.u8[3] = b.u8[6];
+ t.u8[2] = b.u8[4];
+ t.u8[1] = b.u8[2];
+ t.u8[0] = b.u8[0];
+ }
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_unziplo_8(c_v64 a, c_v64 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(a, b, 1) : _c_v64_unzip_8(a, b, 0);
+}
+
+SIMD_INLINE c_v64 c_v64_unziphi_8(c_v64 a, c_v64 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v64_unzip_8(b, a, 0) : _c_v64_unzip_8(b, a, 1);
+}
+
+SIMD_INLINE c_v64 _c_v64_unzip_16(c_v64 a, c_v64 b, int mode) {
+ c_v64 t;
+ if (mode) {
+ t.u16[3] = b.u16[3];
+ t.u16[2] = b.u16[1];
+ t.u16[1] = a.u16[3];
+ t.u16[0] = a.u16[1];
+ } else {
+ t.u16[3] = a.u16[2];
+ t.u16[2] = a.u16[0];
+ t.u16[1] = b.u16[2];
+ t.u16[0] = b.u16[0];
+ }
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_unziplo_16(c_v64 a, c_v64 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(a, b, 1)
+ : _c_v64_unzip_16(a, b, 0);
+}
+
+SIMD_INLINE c_v64 c_v64_unziphi_16(c_v64 a, c_v64 b) {
+ return CONFIG_BIG_ENDIAN ? _c_v64_unzip_16(b, a, 0)
+ : _c_v64_unzip_16(b, a, 1);
+}
+
+SIMD_INLINE c_v64 c_v64_unpacklo_u8_s16(c_v64 a) {
+ c_v64 t;
+ int endian = !!CONFIG_BIG_ENDIAN * 4;
+ t.s16[3] = (int16_t)a.u8[3 + endian];
+ t.s16[2] = (int16_t)a.u8[2 + endian];
+ t.s16[1] = (int16_t)a.u8[1 + endian];
+ t.s16[0] = (int16_t)a.u8[0 + endian];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_unpackhi_u8_s16(c_v64 a) {
+ c_v64 t;
+ int endian = !!CONFIG_BIG_ENDIAN * 4;
+ t.s16[3] = (int16_t)a.u8[7 - endian];
+ t.s16[2] = (int16_t)a.u8[6 - endian];
+ t.s16[1] = (int16_t)a.u8[5 - endian];
+ t.s16[0] = (int16_t)a.u8[4 - endian];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_unpacklo_s8_s16(c_v64 a) {
+ c_v64 t;
+ int endian = !!CONFIG_BIG_ENDIAN * 4;
+ t.s16[3] = (int16_t)a.s8[3 + endian];
+ t.s16[2] = (int16_t)a.s8[2 + endian];
+ t.s16[1] = (int16_t)a.s8[1 + endian];
+ t.s16[0] = (int16_t)a.s8[0 + endian];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_unpackhi_s8_s16(c_v64 a) {
+ c_v64 t;
+ int endian = !!CONFIG_BIG_ENDIAN * 4;
+ t.s16[3] = (int16_t)a.s8[7 - endian];
+ t.s16[2] = (int16_t)a.s8[6 - endian];
+ t.s16[1] = (int16_t)a.s8[5 - endian];
+ t.s16[0] = (int16_t)a.s8[4 - endian];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_pack_s32_s16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ if (CONFIG_BIG_ENDIAN) {
+ c_v64 u = a;
+ a = b;
+ b = u;
+ }
+ t.s16[3] = SIMD_CLAMP(a.s32[1], -32768, 32767);
+ t.s16[2] = SIMD_CLAMP(a.s32[0], -32768, 32767);
+ t.s16[1] = SIMD_CLAMP(b.s32[1], -32768, 32767);
+ t.s16[0] = SIMD_CLAMP(b.s32[0], -32768, 32767);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_pack_s32_u16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ if (CONFIG_BIG_ENDIAN) {
+ c_v64 u = a;
+ a = b;
+ b = u;
+ }
+ t.u16[3] = SIMD_CLAMP(a.s32[1], 0, 65535);
+ t.u16[2] = SIMD_CLAMP(a.s32[0], 0, 65535);
+ t.u16[1] = SIMD_CLAMP(b.s32[1], 0, 65535);
+ t.u16[0] = SIMD_CLAMP(b.s32[0], 0, 65535);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_pack_s16_u8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ if (CONFIG_BIG_ENDIAN) {
+ c_v64 u = a;
+ a = b;
+ b = u;
+ }
+ t.u8[7] = SIMD_CLAMP(a.s16[3], 0, 255);
+ t.u8[6] = SIMD_CLAMP(a.s16[2], 0, 255);
+ t.u8[5] = SIMD_CLAMP(a.s16[1], 0, 255);
+ t.u8[4] = SIMD_CLAMP(a.s16[0], 0, 255);
+ t.u8[3] = SIMD_CLAMP(b.s16[3], 0, 255);
+ t.u8[2] = SIMD_CLAMP(b.s16[2], 0, 255);
+ t.u8[1] = SIMD_CLAMP(b.s16[1], 0, 255);
+ t.u8[0] = SIMD_CLAMP(b.s16[0], 0, 255);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_pack_s16_s8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ if (CONFIG_BIG_ENDIAN) {
+ c_v64 u = a;
+ a = b;
+ b = u;
+ }
+ t.s8[7] = SIMD_CLAMP(a.s16[3], -128, 127);
+ t.s8[6] = SIMD_CLAMP(a.s16[2], -128, 127);
+ t.s8[5] = SIMD_CLAMP(a.s16[1], -128, 127);
+ t.s8[4] = SIMD_CLAMP(a.s16[0], -128, 127);
+ t.s8[3] = SIMD_CLAMP(b.s16[3], -128, 127);
+ t.s8[2] = SIMD_CLAMP(b.s16[2], -128, 127);
+ t.s8[1] = SIMD_CLAMP(b.s16[1], -128, 127);
+ t.s8[0] = SIMD_CLAMP(b.s16[0], -128, 127);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_unpacklo_u16_s32(c_v64 a) {
+ c_v64 t;
+ t.s32[1] = a.u16[1 + !!CONFIG_BIG_ENDIAN * 2];
+ t.s32[0] = a.u16[0 + !!CONFIG_BIG_ENDIAN * 2];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_unpacklo_s16_s32(c_v64 a) {
+ c_v64 t;
+ t.s32[1] = a.s16[1 + !!CONFIG_BIG_ENDIAN * 2];
+ t.s32[0] = a.s16[0 + !!CONFIG_BIG_ENDIAN * 2];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_unpackhi_u16_s32(c_v64 a) {
+ c_v64 t;
+ t.s32[1] = a.u16[3 - !!CONFIG_BIG_ENDIAN * 2];
+ t.s32[0] = a.u16[2 - !!CONFIG_BIG_ENDIAN * 2];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_unpackhi_s16_s32(c_v64 a) {
+ c_v64 t;
+ t.s32[1] = a.s16[3 - !!CONFIG_BIG_ENDIAN * 2];
+ t.s32[0] = a.s16[2 - !!CONFIG_BIG_ENDIAN * 2];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shuffle_8(c_v64 a, c_v64 pattern) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) {
+ if (SIMD_CHECK && (pattern.u8[c] & ~7)) {
+ fprintf(stderr, "Error: Undefined v64_shuffle_8 index %d/%d\n",
+ pattern.u8[c], c);
+ abort();
+ }
+ t.u8[c] =
+ a.u8[CONFIG_BIG_ENDIAN ? 7 - (pattern.u8[c] & 7) : pattern.u8[c] & 7];
+ }
+ return t;
+}
+
+SIMD_INLINE int64_t c_v64_dotp_su8(c_v64 a, c_v64 b) {
+ return a.s8[7] * b.u8[7] + a.s8[6] * b.u8[6] + a.s8[5] * b.u8[5] +
+ a.s8[4] * b.u8[4] + a.s8[3] * b.u8[3] + a.s8[2] * b.u8[2] +
+ a.s8[1] * b.u8[1] + a.s8[0] * b.u8[0];
+}
+
+SIMD_INLINE int64_t c_v64_dotp_s16(c_v64 a, c_v64 b) {
+ return (int64_t)(a.s16[3] * b.s16[3] + a.s16[2] * b.s16[2]) +
+ (int64_t)(a.s16[1] * b.s16[1] + a.s16[0] * b.s16[0]);
+}
+
+SIMD_INLINE uint64_t c_v64_hadd_u8(c_v64 a) {
+ return a.u8[7] + a.u8[6] + a.u8[5] + a.u8[4] + a.u8[3] + a.u8[2] + a.u8[1] +
+ a.u8[0];
+}
+
+SIMD_INLINE int64_t c_v64_hadd_s16(c_v64 a) {
+ return a.s16[3] + a.s16[2] + a.s16[1] + a.s16[0];
+}
+
+typedef struct {
+ uint32_t val;
+ int count;
+} c_sad64_internal;
+
+SIMD_INLINE c_sad64_internal c_v64_sad_u8_init(void) {
+ c_sad64_internal t;
+ t.val = t.count = 0;
+ return t;
+}
+
+/* Implementation dependent return value. Result must be finalised with
+ v64_sad_u8_sum(). The result for more than 32 v64_sad_u8() calls is
+ undefined. */
+SIMD_INLINE c_sad64_internal c_v64_sad_u8(c_sad64_internal s, c_v64 a,
+ c_v64 b) {
+ int c;
+ for (c = 0; c < 8; c++)
+ s.val += a.u8[c] > b.u8[c] ? a.u8[c] - b.u8[c] : b.u8[c] - a.u8[c];
+ s.count++;
+ if (SIMD_CHECK && s.count > 32) {
+ fprintf(stderr,
+            "Error: sad called more than 32 times returning an undefined result\n");
+ abort();
+ }
+ return s;
+}
+
+SIMD_INLINE uint32_t c_v64_sad_u8_sum(c_sad64_internal s) { return s.val; }
+
+typedef uint32_t c_ssd64_internal;
+
+/* Implementation dependent return value. Result must be finalised with
+ * v64_ssd_u8_sum(). */
+SIMD_INLINE c_ssd64_internal c_v64_ssd_u8_init(void) { return 0; }
+
+SIMD_INLINE c_ssd64_internal c_v64_ssd_u8(c_ssd64_internal s, c_v64 a,
+ c_v64 b) {
+ int c;
+ for (c = 0; c < 8; c++) s += (a.u8[c] - b.u8[c]) * (a.u8[c] - b.u8[c]);
+ return s;
+}
+
+SIMD_INLINE uint32_t c_v64_ssd_u8_sum(c_ssd64_internal s) { return s; }
+
+SIMD_INLINE c_v64 c_v64_or(c_v64 a, c_v64 b) {
+ c_v64 t;
+ t.u64 = a.u64 | b.u64;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_xor(c_v64 a, c_v64 b) {
+ c_v64 t;
+ t.u64 = a.u64 ^ b.u64;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_and(c_v64 a, c_v64 b) {
+ c_v64 t;
+ t.u64 = a.u64 & b.u64;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_andn(c_v64 a, c_v64 b) {
+ c_v64 t;
+ t.u64 = a.u64 & ~b.u64;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_mullo_s16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++) t.s16[c] = (int16_t)(a.s16[c] * b.s16[c]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_mulhi_s16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++) t.s16[c] = (a.s16[c] * b.s16[c]) >> 16;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_mullo_s32(c_v64 a, c_v64 b) {
+ c_v64 t;
+ t.s32[0] = (int32_t)((int64_t)a.s32[0] * b.s32[0]);
+ t.s32[1] = (int32_t)((int64_t)a.s32[1] * b.s32[1]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_madd_s16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ t.s32[0] = a.s16[0] * b.s16[0] + a.s16[1] * b.s16[1];
+ t.s32[1] = a.s16[2] * b.s16[2] + a.s16[3] * b.s16[3];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_madd_us8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int32_t u;
+ u = a.u8[0] * b.s8[0] + a.u8[1] * b.s8[1];
+ t.s16[0] = SIMD_CLAMP(u, -32768, 32767);
+ u = a.u8[2] * b.s8[2] + a.u8[3] * b.s8[3];
+ t.s16[1] = SIMD_CLAMP(u, -32768, 32767);
+ u = a.u8[4] * b.s8[4] + a.u8[5] * b.s8[5];
+ t.s16[2] = SIMD_CLAMP(u, -32768, 32767);
+ u = a.u8[6] * b.s8[6] + a.u8[7] * b.s8[7];
+ t.s16[3] = SIMD_CLAMP(u, -32768, 32767);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_avg_u8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c] + 1) >> 1;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_rdavg_u8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.u8[c] = (a.u8[c] + b.u8[c]) >> 1;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_rdavg_u16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c]) >> 1;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_avg_u16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c] + 1) >> 1;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_min_u8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? b.u8[c] : a.u8[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_max_u8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] > b.u8[c] ? a.u8[c] : b.u8[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_min_s8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? b.s8[c] : a.s8[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_max_s8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] > b.s8[c] ? a.s8[c] : b.s8[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_min_s16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? b.s16[c] : a.s16[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_max_s16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] > b.s16[c] ? a.s16[c] : b.s16[c];
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_cmpgt_s8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] > b.s8[c]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_cmplt_s8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.s8[c] = -(a.s8[c] < b.s8[c]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_cmpeq_8(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 8; c++) t.s8[c] = -(a.u8[c] == b.u8[c]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_cmpgt_s16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] > b.s16[c]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_cmplt_s16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++) t.s16[c] = -(a.s16[c] < b.s16[c]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_cmpeq_16(c_v64 a, c_v64 b) {
+ c_v64 t;
+ int c;
+ for (c = 0; c < 4; c++) t.s16[c] = -(a.u16[c] == b.u16[c]);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shl_8(c_v64 a, unsigned int n) {
+ c_v64 t;
+ int c;
+ if (SIMD_CHECK && n > 7) {
+ fprintf(stderr, "Error: Undefined u8 shift left %d\n", n);
+ abort();
+ }
+ for (c = 0; c < 8; c++) t.s8[c] = (int8_t)(a.u8[c] << n);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shr_u8(c_v64 a, unsigned int n) {
+ c_v64 t;
+ int c;
+ if (SIMD_CHECK && n > 7) {
+ fprintf(stderr, "Error: Undefined u8 shift right %d\n", n);
+ abort();
+ }
+ for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] >> n;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shr_s8(c_v64 a, unsigned int n) {
+ c_v64 t;
+ int c;
+ if (SIMD_CHECK && n > 7) {
+ fprintf(stderr, "Error: Undefined s8 shift right %d\n", n);
+ abort();
+ }
+ for (c = 0; c < 8; c++) t.s8[c] = a.s8[c] >> n;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shl_16(c_v64 a, unsigned int n) {
+ c_v64 t;
+ int c;
+ if (SIMD_CHECK && n > 15) {
+ fprintf(stderr, "Error: Undefined u16 shift left %d\n", n);
+ abort();
+ }
+ for (c = 0; c < 4; c++) t.u16[c] = (uint16_t)(a.u16[c] << n);
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shr_u16(c_v64 a, unsigned int n) {
+ c_v64 t;
+ int c;
+ if (SIMD_CHECK && n > 15) {
+ fprintf(stderr, "Error: Undefined u16 shift right %d\n", n);
+ abort();
+ }
+ for (c = 0; c < 4; c++) t.u16[c] = a.u16[c] >> n;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shr_s16(c_v64 a, unsigned int n) {
+ c_v64 t;
+ int c;
+ if (SIMD_CHECK && n > 15) {
+ fprintf(stderr, "Error: undefined s16 shift right %d\n", n);
+ abort();
+ }
+ for (c = 0; c < 4; c++) t.s16[c] = a.s16[c] >> n;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shl_32(c_v64 a, unsigned int n) {
+ c_v64 t;
+ if (SIMD_CHECK && n > 31) {
+ fprintf(stderr, "Error: undefined u32 shift left %d\n", n);
+ abort();
+ }
+ t.u32[1] = a.u32[1] << n;
+ t.u32[0] = a.u32[0] << n;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shr_u32(c_v64 a, unsigned int n) {
+ c_v64 t;
+ if (SIMD_CHECK && n > 31) {
+ fprintf(stderr, "Error: undefined u32 shift right %d\n", n);
+ abort();
+ }
+ t.u32[1] = a.u32[1] >> n;
+ t.u32[0] = a.u32[0] >> n;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shr_s32(c_v64 a, unsigned int n) {
+ c_v64 t;
+ if (SIMD_CHECK && n > 31) {
+ fprintf(stderr, "Error: undefined s32 shift right %d\n", n);
+ abort();
+ }
+ t.s32[1] = a.s32[1] >> n;
+ t.s32[0] = a.s32[0] >> n;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shr_n_byte(c_v64 x, unsigned int i) {
+ c_v64 t;
+ t.u64 = x.u64 >> i * 8;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_shl_n_byte(c_v64 x, unsigned int i) {
+ c_v64 t;
+ t.u64 = x.u64 << i * 8;
+ return t;
+}
+
+SIMD_INLINE c_v64 c_v64_align(c_v64 a, c_v64 b, unsigned int c) {
+ if (SIMD_CHECK && c > 7) {
+ fprintf(stderr, "Error: undefined alignment %d\n", c);
+ abort();
+ }
+ return c ? c_v64_or(c_v64_shr_n_byte(b, c), c_v64_shl_n_byte(a, 8 - c)) : b;
+}
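+
+/* For a non-zero shift c, c_v64_align() evaluates
+   (b.u64 >> (c * 8)) | (a.u64 << ((8 - c) * 8)): the two 64-bit values are
+   treated as one 16-byte sequence and eight bytes are extracted starting c
+   bytes into b. E.g. with c == 3, the result's low five bytes are bytes 3..7
+   of b and its high three bytes are bytes 0..2 of a. */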
+
+SIMD_INLINE c_v64 c_v64_shl_n_8(c_v64 a, unsigned int c) {
+ return c_v64_shl_8(a, c);
+}
+
+SIMD_INLINE c_v64 c_v64_shr_n_u8(c_v64 a, unsigned int c) {
+ return c_v64_shr_u8(a, c);
+}
+
+SIMD_INLINE c_v64 c_v64_shr_n_s8(c_v64 a, unsigned int c) {
+ return c_v64_shr_s8(a, c);
+}
+
+SIMD_INLINE c_v64 c_v64_shl_n_16(c_v64 a, unsigned int c) {
+ return c_v64_shl_16(a, c);
+}
+
+SIMD_INLINE c_v64 c_v64_shr_n_u16(c_v64 a, unsigned int c) {
+ return c_v64_shr_u16(a, c);
+}
+
+SIMD_INLINE c_v64 c_v64_shr_n_s16(c_v64 a, unsigned int c) {
+ return c_v64_shr_s16(a, c);
+}
+
+SIMD_INLINE c_v64 c_v64_shl_n_32(c_v64 a, unsigned int c) {
+ return c_v64_shl_32(a, c);
+}
+
+SIMD_INLINE c_v64 c_v64_shr_n_u32(c_v64 a, unsigned int c) {
+ return c_v64_shr_u32(a, c);
+}
+
+SIMD_INLINE c_v64 c_v64_shr_n_s32(c_v64 a, unsigned int c) {
+ return c_v64_shr_s32(a, c);
+}
+
+#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_C_H_
diff --git a/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h b/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h
new file mode 100644
index 0000000000..ec27a6bf42
--- /dev/null
+++ b/third_party/aom/aom_dsp/simd/v64_intrinsics_x86.h
@@ -0,0 +1,489 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_
+#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_
+
+#include <emmintrin.h>
+#if defined(__SSSE3__)
+#include <tmmintrin.h>
+#endif
+#if defined(__SSE4_1__)
+#include <smmintrin.h>
+#endif
+
+typedef __m128i v64;
+
+SIMD_INLINE uint32_t v64_low_u32(v64 a) {
+ return (uint32_t)_mm_cvtsi128_si32(a);
+}
+
+SIMD_INLINE uint32_t v64_high_u32(v64 a) {
+ return (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4));
+}
+
+SIMD_INLINE int32_t v64_low_s32(v64 a) { return (int32_t)_mm_cvtsi128_si32(a); }
+
+SIMD_INLINE int32_t v64_high_s32(v64 a) {
+ return (int32_t)_mm_cvtsi128_si32(_mm_srli_si128(a, 4));
+}
+
+SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) {
+ return _mm_packs_epi32(
+ _mm_set_epi32((int16_t)a, (int16_t)b, (int16_t)c, (int16_t)d),
+ _mm_setzero_si128());
+}
+
+SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) {
+ return _mm_set_epi32(0, 0, (int32_t)x, (int32_t)y);
+}
+
+SIMD_INLINE v64 v64_from_64(uint64_t x) {
+#ifdef __x86_64__
+ return _mm_cvtsi64_si128((int64_t)x);
+#else
+ return _mm_set_epi32(0, 0, (int32_t)(x >> 32), (int32_t)x);
+#endif
+}
+
+SIMD_INLINE uint64_t v64_u64(v64 x) {
+ return (uint64_t)v64_low_u32(x) | ((uint64_t)v64_high_u32(x) << 32);
+}
+
+SIMD_INLINE uint32_t u32_load_aligned(const void *p) {
+ return *((uint32_t *)p);
+}
+
+SIMD_INLINE uint32_t u32_load_unaligned(const void *p) {
+ return *((uint32_t *)p);
+}
+
+SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) {
+ *((uint32_t *)p) = a;
+}
+
+SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) {
+ *((uint32_t *)p) = a;
+}
+
+SIMD_INLINE v64 v64_load_aligned(const void *p) {
+ return _mm_loadl_epi64((__m128i *)p);
+}
+
+SIMD_INLINE v64 v64_load_unaligned(const void *p) {
+ return _mm_loadl_epi64((__m128i *)p);
+}
+
+SIMD_INLINE void v64_store_aligned(void *p, v64 a) {
+ _mm_storel_epi64((__m128i *)p, a);
+}
+
+SIMD_INLINE void v64_store_unaligned(void *p, v64 a) {
+ _mm_storel_epi64((__m128i *)p, a);
+}
+
+#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
+#define v64_align(a, b, c) \
+ ((c) ? _mm_srli_si128(_mm_unpacklo_epi64(b, a), (c)) : b)
+#else
+#define v64_align(a, b, c) \
+ ((c) ? v64_from_64((v64_u64(b) >> (c)*8) | (v64_u64(a) << (8 - (c)) * 8)) \
+ : (b))
+#endif
+
+SIMD_INLINE v64 v64_zero(void) { return _mm_setzero_si128(); }
+
+SIMD_INLINE v64 v64_dup_8(uint8_t x) { return _mm_set1_epi8((char)x); }
+
+SIMD_INLINE v64 v64_dup_16(uint16_t x) { return _mm_set1_epi16((short)x); }
+
+SIMD_INLINE v64 v64_dup_32(uint32_t x) { return _mm_set1_epi32((int)x); }
+
+SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return _mm_add_epi8(a, b); }
+
+SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return _mm_add_epi16(a, b); }
+
+SIMD_INLINE v64 v64_sadd_u8(v64 a, v64 b) { return _mm_adds_epu8(a, b); }
+
+SIMD_INLINE v64 v64_sadd_s8(v64 a, v64 b) { return _mm_adds_epi8(a, b); }
+
+SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return _mm_adds_epi16(a, b); }
+
+SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return _mm_add_epi32(a, b); }
+
+SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return _mm_sub_epi8(a, b); }
+
+SIMD_INLINE v64 v64_ssub_u8(v64 a, v64 b) { return _mm_subs_epu8(a, b); }
+
+SIMD_INLINE v64 v64_ssub_s8(v64 a, v64 b) { return _mm_subs_epi8(a, b); }
+
+SIMD_INLINE v64 v64_sub_16(v64 a, v64 b) { return _mm_sub_epi16(a, b); }
+
+SIMD_INLINE v64 v64_ssub_s16(v64 a, v64 b) { return _mm_subs_epi16(a, b); }
+
+SIMD_INLINE v64 v64_ssub_u16(v64 a, v64 b) { return _mm_subs_epu16(a, b); }
+
+SIMD_INLINE v64 v64_sub_32(v64 a, v64 b) { return _mm_sub_epi32(a, b); }
+
+SIMD_INLINE v64 v64_abs_s16(v64 a) {
+#if defined(__SSSE3__)
+ return _mm_abs_epi16(a);
+#else
+ return _mm_max_epi16(a, _mm_sub_epi16(_mm_setzero_si128(), a));
+#endif
+}
+
+SIMD_INLINE v64 v64_abs_s8(v64 a) {
+#if defined(__SSSE3__)
+ return _mm_abs_epi8(a);
+#else
+ v64 sign = _mm_cmplt_epi8(a, _mm_setzero_si128());
+ return _mm_xor_si128(sign, _mm_add_epi8(a, sign));
+#endif
+}
+
+SIMD_INLINE v64 v64_ziplo_8(v64 a, v64 b) { return _mm_unpacklo_epi8(b, a); }
+
+SIMD_INLINE v64 v64_ziphi_8(v64 a, v64 b) {
+ return _mm_srli_si128(_mm_unpacklo_epi8(b, a), 8);
+}
+
+SIMD_INLINE v64 v64_ziplo_16(v64 a, v64 b) { return _mm_unpacklo_epi16(b, a); }
+
+SIMD_INLINE v64 v64_ziphi_16(v64 a, v64 b) {
+ return _mm_srli_si128(_mm_unpacklo_epi16(b, a), 8);
+}
+
+SIMD_INLINE v64 v64_ziplo_32(v64 a, v64 b) { return _mm_unpacklo_epi32(b, a); }
+
+SIMD_INLINE v64 v64_ziphi_32(v64 a, v64 b) {
+ return _mm_srli_si128(_mm_unpacklo_epi32(b, a), 8);
+}
+
+SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) {
+ __m128i t = _mm_unpacklo_epi64(b, a);
+ return _mm_packs_epi32(t, t);
+}
+
+SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) {
+#if defined(__SSE4_1__)
+ __m128i t = _mm_unpacklo_epi64(b, a);
+ return _mm_packus_epi32(t, t);
+#else
+ const int32_t ah = SIMD_CLAMP(v64_high_s32(a), 0, 65535);
+ const int32_t al = SIMD_CLAMP(v64_low_s32(a), 0, 65535);
+ const int32_t bh = SIMD_CLAMP(v64_high_s32(b), 0, 65535);
+ const int32_t bl = SIMD_CLAMP(v64_low_s32(b), 0, 65535);
+ return v64_from_16(ah, al, bh, bl);
+#endif
+}
+
+SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) {
+ __m128i t = _mm_unpacklo_epi64(b, a);
+ return _mm_packus_epi16(t, t);
+}
+
+SIMD_INLINE v64 v64_pack_s16_s8(v64 a, v64 b) {
+ __m128i t = _mm_unpacklo_epi64(b, a);
+ return _mm_packs_epi16(t, t);
+}
+
+SIMD_INLINE v64 v64_unziphi_8(v64 a, v64 b) {
+#if defined(__SSSE3__)
+ return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
+ v64_from_64(0x0f0d0b0907050301LL));
+#else
+ return _mm_packus_epi16(
+ _mm_unpacklo_epi64(_mm_srli_epi16(b, 8), _mm_srli_epi16(a, 8)),
+ _mm_setzero_si128());
+#endif
+}
+
+SIMD_INLINE v64 v64_unziplo_8(v64 a, v64 b) {
+#if defined(__SSSE3__)
+ return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
+ v64_from_64(0x0e0c0a0806040200LL));
+#else
+ return v64_unziphi_8(_mm_slli_si128(a, 1), _mm_slli_si128(b, 1));
+#endif
+}
+
+SIMD_INLINE v64 v64_unziphi_16(v64 a, v64 b) {
+#if defined(__SSSE3__)
+ return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
+ v64_from_64(0x0f0e0b0a07060302LL));
+#else
+ return _mm_packs_epi32(
+ _mm_unpacklo_epi64(_mm_srai_epi32(b, 16), _mm_srai_epi32(a, 16)),
+ _mm_setzero_si128());
+#endif
+}
+
+SIMD_INLINE v64 v64_unziplo_16(v64 a, v64 b) {
+#if defined(__SSSE3__)
+ return _mm_shuffle_epi8(_mm_unpacklo_epi64(b, a),
+ v64_from_64(0x0d0c090805040100LL));
+#else
+ return v64_unziphi_16(_mm_slli_si128(a, 2), _mm_slli_si128(b, 2));
+#endif
+}
+
+SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) {
+ return _mm_unpacklo_epi8(a, _mm_setzero_si128());
+}
+
+SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) {
+ return _mm_srli_si128(_mm_unpacklo_epi8(a, _mm_setzero_si128()), 8);
+}
+
+SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) {
+ return _mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8);
+}
+
+SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) {
+ return _mm_srli_si128(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8), 8);
+}
+
+SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 a) {
+ return _mm_unpacklo_epi16(a, _mm_setzero_si128());
+}
+
+SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 a) {
+ return _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16);
+}
+
+SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 a) {
+ return _mm_srli_si128(_mm_unpacklo_epi16(a, _mm_setzero_si128()), 8);
+}
+
+SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 a) {
+ return _mm_srli_si128(
+ _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), a), 16), 8);
+}
+
+SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) {
+#if defined(__SSSE3__)
+ return _mm_shuffle_epi8(x, pattern);
+#else
+ v64 output;
+ unsigned char *input = (unsigned char *)&x;
+ unsigned char *index = (unsigned char *)&pattern;
+ unsigned char *selected = (unsigned char *)&output;
+ int counter;
+
+ for (counter = 0; counter < 8; counter++) {
+ selected[counter] = input[index[counter]];
+ }
+
+ return output;
+#endif
+}
+
+SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) {
+ __m128i t = _mm_madd_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8),
+ _mm_unpacklo_epi8(b, _mm_setzero_si128()));
+ t = _mm_add_epi32(t, _mm_srli_si128(t, 8));
+ t = _mm_add_epi32(t, _mm_srli_si128(t, 4));
+ return (int32_t)v64_low_u32(t);
+}
+
+SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) {
+ __m128i r = _mm_madd_epi16(a, b);
+#if defined(__SSE4_1__) && defined(__x86_64__)
+ __m128i x = _mm_cvtepi32_epi64(r);
+ return _mm_cvtsi128_si64(_mm_add_epi64(x, _mm_srli_si128(x, 8)));
+#else
+ return (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
+ (int64_t)_mm_cvtsi128_si32(r);
+#endif
+}
+
+SIMD_INLINE uint64_t v64_hadd_u8(v64 a) {
+ return v64_low_u32(_mm_sad_epu8(a, _mm_setzero_si128()));
+}
+
+SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
+ return v64_dotp_s16(a, v64_dup_16(1));
+}
+
+typedef v64 sad64_internal;
+
+SIMD_INLINE sad64_internal v64_sad_u8_init(void) { return _mm_setzero_si128(); }
+
+/* Implementation dependent return value. Result must be finalised with
+ v64_sad_u8_sum().
+ The result for more than 32 v64_sad_u8() calls is undefined. */
+SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
+ return _mm_add_epi64(s, _mm_sad_epu8(a, b));
+}
+
+SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) { return v64_low_u32(s); }
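+
+/* Usage sketch (illustrative; `src`, `ref`, `stride` and `h` are assumed to
+   be provided by the caller, with h at most 32 per the note above):
+
+     sad64_internal acc = v64_sad_u8_init();
+     for (int r = 0; r < h; r++)
+       acc = v64_sad_u8(acc, v64_load_unaligned(src + r * stride),
+                        v64_load_unaligned(ref + r * stride));
+     uint32_t sad = v64_sad_u8_sum(acc);
+ */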
+
+typedef v64 ssd64_internal;
+
+SIMD_INLINE ssd64_internal v64_ssd_u8_init(void) { return _mm_setzero_si128(); }
+
+/* Implementation dependent return value. Result must be finalised with
+ * v64_ssd_u8_sum(). */
+SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
+ v64 l = v64_sub_16(v64_ziplo_8(v64_zero(), a), v64_ziplo_8(v64_zero(), b));
+ v64 h = v64_sub_16(v64_ziphi_8(v64_zero(), a), v64_ziphi_8(v64_zero(), b));
+ v64 r = v64_add_32(_mm_madd_epi16(l, l), _mm_madd_epi16(h, h));
+ return _mm_add_epi64(
+ s, v64_ziplo_32(v64_zero(), _mm_add_epi32(r, _mm_srli_si128(r, 4))));
+}
+
+SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) { return v64_low_u32(s); }
+
+SIMD_INLINE v64 v64_or(v64 a, v64 b) { return _mm_or_si128(a, b); }
+
+SIMD_INLINE v64 v64_xor(v64 a, v64 b) { return _mm_xor_si128(a, b); }
+
+SIMD_INLINE v64 v64_and(v64 a, v64 b) { return _mm_and_si128(a, b); }
+
+SIMD_INLINE v64 v64_andn(v64 a, v64 b) { return _mm_andnot_si128(b, a); }
+
+SIMD_INLINE v64 v64_mullo_s16(v64 a, v64 b) { return _mm_mullo_epi16(a, b); }
+
+SIMD_INLINE v64 v64_mulhi_s16(v64 a, v64 b) { return _mm_mulhi_epi16(a, b); }
+
+SIMD_INLINE v64 v64_mullo_s32(v64 a, v64 b) {
+#if defined(__SSE4_1__)
+ return _mm_mullo_epi32(a, b);
+#else
+ return _mm_unpacklo_epi32(
+ _mm_mul_epu32(a, b),
+ _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)));
+#endif
+}
+
+SIMD_INLINE v64 v64_madd_s16(v64 a, v64 b) { return _mm_madd_epi16(a, b); }
+
+SIMD_INLINE v64 v64_madd_us8(v64 a, v64 b) {
+#if defined(__SSSE3__)
+ return _mm_maddubs_epi16(a, b);
+#else
+ __m128i t = _mm_madd_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()),
+ _mm_srai_epi16(_mm_unpacklo_epi8(b, b), 8));
+ return _mm_packs_epi32(t, t);
+#endif
+}
+
+SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return _mm_avg_epu8(a, b); }
+
+SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) {
+ return _mm_sub_epi8(_mm_avg_epu8(a, b),
+ _mm_and_si128(_mm_xor_si128(a, b), v64_dup_8(1)));
+}
+
+SIMD_INLINE v64 v64_rdavg_u16(v64 a, v64 b) {
+ return _mm_sub_epi16(_mm_avg_epu16(a, b),
+ _mm_and_si128(_mm_xor_si128(a, b), v64_dup_16(1)));
+}
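+
+/* The rdavg ("rounding-down average") variants rely on the identity
+   (a + b) >> 1 == ((a + b + 1) >> 1) - ((a ^ b) & 1): _mm_avg_epu8/epu16
+   round up, so subtracting the parity of a ^ b gives the truncating average.
+   E.g. a = 1, b = 2: _mm_avg_epu8 yields 2, (a ^ b) & 1 is 1, so the result
+   is 1 == (1 + 2) >> 1. */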
+
+SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return _mm_avg_epu16(a, b); }
+
+SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return _mm_min_epu8(a, b); }
+
+SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return _mm_max_epu8(a, b); }
+
+SIMD_INLINE v64 v64_min_s8(v64 a, v64 b) {
+#if defined(__SSE4_1__)
+ return _mm_min_epi8(a, b);
+#else
+ v64 mask = _mm_cmplt_epi8(a, b);
+ return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
+#endif
+}
+
+SIMD_INLINE v64 v64_max_s8(v64 a, v64 b) {
+#if defined(__SSE4_1__)
+ return _mm_max_epi8(a, b);
+#else
+ v64 mask = _mm_cmplt_epi8(b, a);
+ return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
+#endif
+}
+
+SIMD_INLINE v64 v64_min_s16(v64 a, v64 b) { return _mm_min_epi16(a, b); }
+
+SIMD_INLINE v64 v64_max_s16(v64 a, v64 b) { return _mm_max_epi16(a, b); }
+
+SIMD_INLINE v64 v64_cmpgt_s8(v64 a, v64 b) { return _mm_cmpgt_epi8(a, b); }
+
+SIMD_INLINE v64 v64_cmplt_s8(v64 a, v64 b) { return _mm_cmplt_epi8(a, b); }
+
+SIMD_INLINE v64 v64_cmpeq_8(v64 a, v64 b) { return _mm_cmpeq_epi8(a, b); }
+
+SIMD_INLINE v64 v64_cmpgt_s16(v64 a, v64 b) { return _mm_cmpgt_epi16(a, b); }
+
+SIMD_INLINE v64 v64_cmplt_s16(v64 a, v64 b) { return _mm_cmplt_epi16(a, b); }
+
+SIMD_INLINE v64 v64_cmpeq_16(v64 a, v64 b) { return _mm_cmpeq_epi16(a, b); }
+
+SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) {
+ return _mm_and_si128(_mm_set1_epi8((char)(0xff << c)),
+ _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c)));
+}
+
+SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) {
+ return _mm_and_si128(_mm_set1_epi8((char)(0xff >> c)),
+ _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c)));
+}
+
+SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) {
+ return _mm_packs_epi16(
+ _mm_sra_epi16(_mm_unpacklo_epi8(a, a), _mm_cvtsi32_si128((int)(c + 8))),
+ a);
+}
+
+SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) {
+ return _mm_sll_epi16(a, _mm_cvtsi32_si128((int)c));
+}
+
+SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) {
+ return _mm_srl_epi16(a, _mm_cvtsi32_si128((int)c));
+}
+
+SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) {
+ return _mm_sra_epi16(a, _mm_cvtsi32_si128((int)c));
+}
+
+SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) {
+ return _mm_sll_epi32(a, _mm_cvtsi32_si128((int)c));
+}
+
+SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) {
+ return _mm_srl_epi32(a, _mm_cvtsi32_si128((int)c));
+}
+
+SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) {
+ return _mm_sra_epi32(a, _mm_cvtsi32_si128((int)c));
+}
+
+/* These intrinsics require immediate values, so we must use #defines
+ to enforce that. */
+#define v64_shl_n_byte(a, c) _mm_slli_si128(a, c)
+#define v64_shr_n_byte(a, c) _mm_srli_si128(_mm_unpacklo_epi64(a, a), c + 8)
+#define v64_shl_n_8(a, c) \
+ _mm_and_si128(_mm_set1_epi8((char)(0xff << (c))), _mm_slli_epi16(a, c))
+#define v64_shr_n_u8(a, c) \
+ _mm_and_si128(_mm_set1_epi8((char)(0xff >> (c))), _mm_srli_epi16(a, c))
+#define v64_shr_n_s8(a, c) \
+ _mm_packs_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), (c) + 8), a)
+#define v64_shl_n_16(a, c) _mm_slli_epi16(a, c)
+#define v64_shr_n_u16(a, c) _mm_srli_epi16(a, c)
+#define v64_shr_n_s16(a, c) _mm_srai_epi16(a, c)
+#define v64_shl_n_32(a, c) _mm_slli_epi32(a, c)
+#define v64_shr_n_u32(a, c) _mm_srli_epi32(a, c)
+#define v64_shr_n_s32(a, c) _mm_srai_epi32(a, c)
+
+#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_X86_H_
diff --git a/third_party/aom/aom_dsp/sse.c b/third_party/aom/aom_dsp/sse.c
new file mode 100644
index 0000000000..bfe76edc39
--- /dev/null
+++ b/third_party/aom/aom_dsp/sse.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+/*
+ * Sum the square of the difference between every corresponding element of the
+ * buffers.
+ */
+
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+int64_t aom_sse_c(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
+ int y, x;
+ int64_t sse = 0;
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) {
+ const int32_t diff = abs(a[x] - b[x]);
+ sse += diff * diff;
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+ return sse;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+int64_t aom_highbd_sse_c(const uint8_t *a8, int a_stride, const uint8_t *b8,
+ int b_stride, int width, int height) {
+ int y, x;
+ int64_t sse = 0;
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x++) {
+ const int32_t diff = (int32_t)(a[x]) - (int32_t)(b[x]);
+ sse += diff * diff;
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+ return sse;
+}
+#endif
diff --git a/third_party/aom/aom_dsp/ssim.c b/third_party/aom/aom_dsp/ssim.c
new file mode 100644
index 0000000000..35d493b038
--- /dev/null
+++ b/third_party/aom/aom_dsp/ssim.c
@@ -0,0 +1,481 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <math.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/ssim.h"
+#include "aom_ports/mem.h"
+
+#if CONFIG_INTERNAL_STATS
+void aom_ssim_parms_16x16_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
+ uint32_t *sum_s, uint32_t *sum_r,
+ uint32_t *sum_sq_s, uint32_t *sum_sq_r,
+ uint32_t *sum_sxr) {
+ int i, j;
+ for (i = 0; i < 16; i++, s += sp, r += rp) {
+ for (j = 0; j < 16; j++) {
+ *sum_s += s[j];
+ *sum_r += r[j];
+ *sum_sq_s += s[j] * s[j];
+ *sum_sq_r += r[j] * r[j];
+ *sum_sxr += s[j] * r[j];
+ }
+ }
+}
+#endif // CONFIG_INTERNAL_STATS
+
+void aom_ssim_parms_8x8_c(const uint8_t *s, int sp, const uint8_t *r, int rp,
+ uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s,
+ uint32_t *sum_sq_r, uint32_t *sum_sxr) {
+ int i, j;
+ for (i = 0; i < 8; i++, s += sp, r += rp) {
+ for (j = 0; j < 8; j++) {
+ *sum_s += s[j];
+ *sum_r += r[j];
+ *sum_sq_s += s[j] * s[j];
+ *sum_sq_r += r[j] * r[j];
+ *sum_sxr += s[j] * r[j];
+ }
+ }
+}
+
+static const int64_t cc1 = 26634;        // 64^2*(.01*255)^2
+static const int64_t cc2 = 239708;       // 64^2*(.03*255)^2
+static const int64_t cc1_10 = 428658;    // 64^2*(.01*1023)^2
+static const int64_t cc2_10 = 3857925;   // 64^2*(.03*1023)^2
+static const int64_t cc1_12 = 6868593;   // 64^2*(.01*4095)^2
+static const int64_t cc2_12 = 61817334;  // 64^2*(.03*4095)^2
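+
+// For example, cc1 = round(64^2 * (0.01 * 255)^2) = round(4096 * 6.5025)
+// = 26634. The `* count * count >> 12` in similarity() below rescales each
+// constant to roughly (k * peak)^2 * count^2 for the actual sample count.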
+
+static double similarity(uint32_t sum_s, uint32_t sum_r, uint32_t sum_sq_s,
+ uint32_t sum_sq_r, uint32_t sum_sxr, int count,
+ uint32_t bd) {
+ double ssim_n, ssim_d;
+ int64_t c1 = 0, c2 = 0;
+ if (bd == 8) {
+ // scale the constants by number of pixels
+ c1 = (cc1 * count * count) >> 12;
+ c2 = (cc2 * count * count) >> 12;
+ } else if (bd == 10) {
+ c1 = (cc1_10 * count * count) >> 12;
+ c2 = (cc2_10 * count * count) >> 12;
+ } else if (bd == 12) {
+ c1 = (cc1_12 * count * count) >> 12;
+ c2 = (cc2_12 * count * count) >> 12;
+ } else {
+ assert(0);
+ // Return similarity as zero for unsupported bit-depth values.
+ return 0;
+ }
+
+ ssim_n = (2.0 * sum_s * sum_r + c1) *
+ (2.0 * count * sum_sxr - 2.0 * sum_s * sum_r + c2);
+
+ ssim_d = ((double)sum_s * sum_s + (double)sum_r * sum_r + c1) *
+ ((double)count * sum_sq_s - (double)sum_s * sum_s +
+ (double)count * sum_sq_r - (double)sum_r * sum_r + c2);
+
+ return ssim_n / ssim_d;
+}
+
+static double ssim_8x8(const uint8_t *s, int sp, const uint8_t *r, int rp) {
+ uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+ aom_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+ &sum_sxr);
+ return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8);
+}
+
+// We use an 8x8 moving window whose starting locations lie on a 4x4 pixel
+// grid. This arrangement lets the windows overlap block boundaries and so
+// penalize blocking artifacts.
+double aom_ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1,
+ int stride_img2, int width, int height) {
+ int i, j;
+ int samples = 0;
+ double ssim_total = 0;
+
+ // sample points start at each 4x4 location
+ for (i = 0; i <= height - 8;
+ i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
+ for (j = 0; j <= width - 8; j += 4) {
+ double v = ssim_8x8(img1 + j, stride_img1, img2 + j, stride_img2);
+ ssim_total += v;
+ samples++;
+ }
+ }
+ ssim_total /= samples;
+ return ssim_total;
+}
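+
+// E.g. for a 64x64 plane the loops above evaluate ((64 - 8) / 4 + 1)^2 = 225
+// overlapping 8x8 windows and return their plain average.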
+
+#if CONFIG_INTERNAL_STATS
+void aom_lowbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *weight,
+ double *fast_ssim) {
+ double abc[3];
+ for (int i = 0; i < 3; ++i) {
+ const int is_uv = i > 0;
+ abc[i] = aom_ssim2(source->buffers[i], dest->buffers[i],
+ source->strides[is_uv], dest->strides[is_uv],
+ source->crop_widths[is_uv], source->crop_heights[is_uv]);
+ }
+
+ *weight = 1;
+ *fast_ssim = abc[0] * .8 + .1 * (abc[1] + abc[2]);
+}
+
+// traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity
+//
+// Reworking the math:
+//
+// ssim(x,y) = (2*mean(x)*mean(y) + c1)*(2*cov(x,y)+c2) /
+// ((mean(x)^2+mean(y)^2+c1)*(var(x)+var(y)+c2))
+//
+// mean(x) = sum(x) / n
+//
+// cov(x,y) = (n*sum(xi*yi)-sum(x)*sum(y))/(n*n)
+//
+// var(x) = (n*sum(xi*xi)-sum(xi)*sum(xi))/(n*n)
+//
+// ssim(x,y) =
+// (2*sum(x)*sum(y)/(n*n) + c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))/(n*n)+c2) /
+// (((sum(x)*sum(x)+sum(y)*sum(y))/(n*n) +c1) *
+// ((n*sum(xi*xi) - sum(xi)*sum(xi))/(n*n)+
+// (n*sum(yi*yi) - sum(yi)*sum(yi))/(n*n)+c2)))
+//
+// factoring out n*n
+//
+// ssim(x,y) =
+// (2*sum(x)*sum(y) + n*n*c1)*(2*(n*sum(xi*yi)-sum(x)*sum(y))+n*n*c2) /
+// (((sum(x)*sum(x)+sum(y)*sum(y)) + n*n*c1) *
+// (n*sum(xi*xi)-sum(xi)*sum(xi)+n*sum(yi*yi)-sum(yi)*sum(yi)+n*n*c2))
+//
+// Replace c1 with n*n * c1 for the final step that leads to this code:
+// The final step scales by 12 bits so we don't lose precision in the constants.
+
+static double ssimv_similarity(const Ssimv *sv, int64_t n) {
+ // Scale the constants by number of pixels.
+ const int64_t c1 = (cc1 * n * n) >> 12;
+ const int64_t c2 = (cc2 * n * n) >> 12;
+
+ const double l = 1.0 * (2 * sv->sum_s * sv->sum_r + c1) /
+ (sv->sum_s * sv->sum_s + sv->sum_r * sv->sum_r + c1);
+
+ // Since these variables are unsigned sums, convert to double so
+ // math is done in double arithmetic.
+ const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) /
+ (n * sv->sum_sq_s - sv->sum_s * sv->sum_s +
+ n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2);
+
+ return l * v;
+}
+
+// The first term of the ssim metric is a luminance factor.
+//
+// (2*mean(x)*mean(y) + c1)/ (mean(x)^2+mean(y)^2+c1)
+//
+// This luminance factor is very sensitive on the dark side of the luminance
+// range and almost completely insensitive on the bright side. Compare the two
+// pairs (1,3) and (250,252): the term gives 2*1*3/(1+9) = .60 for the first,
+// but 2*250*252/(250^2+252^2) = .99997 for the second.
+//
+// As a result, this tweaked version of the calculation takes the luminance as
+// a percentage off from the peak possible value:
+//
+// 255 * 255 - (sum_s - sum_r) / count * (sum_s - sum_r) / count
+//
+static double ssimv_similarity2(const Ssimv *sv, int64_t n) {
+ // Scale the constants by number of pixels.
+ const int64_t c1 = (cc1 * n * n) >> 12;
+ const int64_t c2 = (cc2 * n * n) >> 12;
+
+ const double mean_diff = (1.0 * sv->sum_s - sv->sum_r) / n;
+ const double l = (255 * 255 - mean_diff * mean_diff + c1) / (255 * 255 + c1);
+
+ // Since these variables are unsigned sums, convert to double so
+ // math is done in double arithmetic.
+ const double v = (2.0 * n * sv->sum_sxr - 2 * sv->sum_s * sv->sum_r + c2) /
+ (n * sv->sum_sq_s - sv->sum_s * sv->sum_s +
+ n * sv->sum_sq_r - sv->sum_r * sv->sum_r + c2);
+
+ return l * v;
+}
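+
+// For example, if the source and reference sums over an 8x8 window differ by
+// 128 (a mean difference of 2), the luminance term above is
+// (255^2 - 2^2 + c1) / (255^2 + c1), still very close to 1, whereas the
+// conventional term reacts far more strongly at low luminance.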
+static void ssimv_parms(uint8_t *img1, int img1_pitch, uint8_t *img2,
+ int img2_pitch, Ssimv *sv) {
+ aom_ssim_parms_8x8(img1, img1_pitch, img2, img2_pitch, &sv->sum_s, &sv->sum_r,
+ &sv->sum_sq_s, &sv->sum_sq_r, &sv->sum_sxr);
+}
+
+double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
+ int img2_pitch, int width, int height, Ssimv *sv2,
+ Metrics *m, int do_inconsistency) {
+ double dssim_total = 0;
+ double ssim_total = 0;
+ double ssim2_total = 0;
+ double inconsistency_total = 0;
+ int i, j;
+ int c = 0;
+ double norm;
+ double old_ssim_total = 0;
+ // We can sample points as frequently as we like; start with 1 per 4x4.
+ for (i = 0; i < height;
+ i += 4, img1 += img1_pitch * 4, img2 += img2_pitch * 4) {
+ for (j = 0; j < width; j += 4, ++c) {
+ Ssimv sv = { 0, 0, 0, 0, 0, 0 };
+ double ssim;
+ double ssim2;
+ double dssim;
+ uint32_t var_new;
+ uint32_t var_old;
+ uint32_t mean_new;
+ uint32_t mean_old;
+ double ssim_new;
+ double ssim_old;
+
+ // Not sure there's a great way to handle the edge pixels
+ // in ssim when using a window. Seems biased against edge pixels
+ // however you handle this. This uses only samples that are
+ // fully in the frame.
+ if (j + 8 <= width && i + 8 <= height) {
+ ssimv_parms(img1 + j, img1_pitch, img2 + j, img2_pitch, &sv);
+ }
+
+ ssim = ssimv_similarity(&sv, 64);
+ ssim2 = ssimv_similarity2(&sv, 64);
+
+ sv.ssim = ssim2;
+
+ // dssim is calculated to use as an actual error metric and
+ // is scaled up to the same range as sum square error.
+ // Since we are subsampling every 16th point maybe this should be
+ // *16 ?
+ dssim = 255 * 255 * (1 - ssim2) / 2;
+
+ // Here I introduce a new error metric: consistency-weighted
+ // SSIM-inconsistency. This metric isolates frames where the
+ // SSIM 'suddenly' changes, e.g. if one frame in every 8 is much
+ // sharper or blurrier than the others. Higher values indicate a
+ // temporally inconsistent SSIM. There are two ideas at work:
+ //
+ // 1) 'SSIM-inconsistency': the total inconsistency value
+ // reflects how much SSIM values are changing between this
+ // source / reference frame pair and the previous pair.
+ //
+ // 2) 'consistency-weighted': weights de-emphasize areas in the
+ // frame where the scene content has changed. Changes in scene
+ // content are detected via changes in local variance and local
+ // mean.
+ //
+ // Thus the overall measure reflects how inconsistent the SSIM
+ // values are, over consistent regions of the frame.
+ //
+ // The metric has three terms:
+ //
+ // term 1 -> uses change in scene Variance to weight error score
+ // 2 * var(Fi)*var(Fi-1) / (var(Fi)^2+var(Fi-1)^2)
+ // larger changes from one frame to the next mean we care
+ // less about consistency.
+ //
+ // term 2 -> uses change in local scene luminance to weight error
+ // 2 * avg(Fi)*avg(Fi-1) / (avg(Fi)^2+avg(Fi-1)^2)
+ // larger changes from one frame to the next mean we care
+ // less about consistency.
+ //
+ // term3 -> measures inconsistency in ssim scores between frames
+ // 1 - (2 * ssim(Fi)*ssim(Fi-1) / (ssim(Fi)^2 + ssim(Fi-1)^2)).
+ //
+ // This term compares the ssim score for the same location in 2
+ // subsequent frames.
+ var_new = sv.sum_sq_s - sv.sum_s * sv.sum_s / 64;
+ var_old = sv2[c].sum_sq_s - sv2[c].sum_s * sv2[c].sum_s / 64;
+ mean_new = sv.sum_s;
+ mean_old = sv2[c].sum_s;
+ ssim_new = sv.ssim;
+ ssim_old = sv2[c].ssim;
+
+ if (do_inconsistency) {
+ // We do the metric once for every 4x4 block in the image. Since
+ // we are scaling the error to SSE for use in a psnr calculation
+ // 1.0 = 4x4x255x255 the worst error we can possibly have.
+ static const double kScaling = 4. * 4 * 255 * 255;
+
+ // The constants have to be non-zero to avoid potential divide-by-zero
+ // issues; beyond that they act as a kind of weighting between the terms.
+ // No testing of what the right values should be has been done.
+ static const double c1 = 1, c2 = 1, c3 = 1;
+
+ // This measures how much consistent variance is in two consecutive
+ // source frames. 1.0 means they have exactly the same variance.
+ const double variance_term =
+ (2.0 * var_old * var_new + c1) /
+ (1.0 * var_old * var_old + 1.0 * var_new * var_new + c1);
+
+ // This measures how consistent the local means are between two
+ // consecutive frames. 1.0 means they have exactly the same mean.
+ const double mean_term =
+ (2.0 * mean_old * mean_new + c2) /
+ (1.0 * mean_old * mean_old + 1.0 * mean_new * mean_new + c2);
+
+ // This measures how consistent the ssim scores of two consecutive
+ // frames are. 1.0 means they are exactly the same.
+ double ssim_term =
+ pow((2.0 * ssim_old * ssim_new + c3) /
+ (ssim_old * ssim_old + ssim_new * ssim_new + c3),
+ 5);
+
+ double this_inconsistency;
+
+ // Floating point math sometimes makes this > 1 by a tiny bit.
+ // We want the metric to scale between 0 and 1.0 so we can convert
+ // it to an snr scaled value.
+ if (ssim_term > 1) ssim_term = 1;
+
+ // This converts the consistency metric to an inconsistency metric
+ // (so we can scale it like psnr to something like sum square error).
+ // The reason for the variance and mean terms is the assumption that
+ // if there are big changes in the source we should penalize
+ // inconsistency in the ssim scores less, as it will be less visible
+ // to the user.
+ this_inconsistency = (1 - ssim_term) * variance_term * mean_term;
+
+ this_inconsistency *= kScaling;
+ inconsistency_total += this_inconsistency;
+ }
+ sv2[c] = sv;
+ ssim_total += ssim;
+ ssim2_total += ssim2;
+ dssim_total += dssim;
+
+ old_ssim_total += ssim_old;
+ }
+ old_ssim_total += 0;
+ }
+
+ norm = 1. / (width / 4) / (height / 4);
+ ssim_total *= norm;
+ ssim2_total *= norm;
+ m->ssim2 = ssim2_total;
+ m->ssim = ssim_total;
+ if (old_ssim_total == 0) inconsistency_total = 0;
+
+ m->ssimc = inconsistency_total;
+
+ m->dssim = dssim_total;
+ return inconsistency_total;
+}
+#endif // CONFIG_INTERNAL_STATS
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_ssim_parms_8x8_c(const uint16_t *s, int sp, const uint16_t *r,
+ int rp, uint32_t *sum_s, uint32_t *sum_r,
+ uint32_t *sum_sq_s, uint32_t *sum_sq_r,
+ uint32_t *sum_sxr) {
+ int i, j;
+ for (i = 0; i < 8; i++, s += sp, r += rp) {
+ for (j = 0; j < 8; j++) {
+ *sum_s += s[j];
+ *sum_r += r[j];
+ *sum_sq_s += s[j] * s[j];
+ *sum_sq_r += r[j] * r[j];
+ *sum_sxr += s[j] * r[j];
+ }
+ }
+}
+
+static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r,
+ int rp, uint32_t bd, uint32_t shift) {
+ uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
+ aom_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
+ &sum_sxr);
+ return similarity(sum_s >> shift, sum_r >> shift, sum_sq_s >> (2 * shift),
+ sum_sq_r >> (2 * shift), sum_sxr >> (2 * shift), 64, bd);
+}
+
+double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
+ int stride_img1, int stride_img2, int width, int height,
+ uint32_t bd, uint32_t shift) {
+ int i, j;
+ int samples = 0;
+ double ssim_total = 0;
+
+ // sample points start at each 4x4 location
+ for (i = 0; i <= height - 8;
+ i += 4, img1 += stride_img1 * 4, img2 += stride_img2 * 4) {
+ for (j = 0; j <= width - 8; j += 4) {
+ double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1,
+ CONVERT_TO_SHORTPTR(img2 + j), stride_img2, bd,
+ shift);
+ ssim_total += v;
+ samples++;
+ }
+ }
+ ssim_total /= samples;
+ return ssim_total;
+}
+
+#if CONFIG_INTERNAL_STATS
+void aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *weight,
+ uint32_t bd, uint32_t in_bd, double *fast_ssim) {
+ assert(bd >= in_bd);
+ uint32_t shift = bd - in_bd;
+
+ double abc[3];
+ for (int i = 0; i < 3; ++i) {
+ const int is_uv = i > 0;
+ abc[i] = aom_highbd_ssim2(source->buffers[i], dest->buffers[i],
+ source->strides[is_uv], dest->strides[is_uv],
+ source->crop_widths[is_uv],
+ source->crop_heights[is_uv], in_bd, shift);
+ }
+
+ weight[0] = 1;
+ fast_ssim[0] = abc[0] * .8 + .1 * (abc[1] + abc[2]);
+
+ if (bd > in_bd) {
+ // Compute SSIM based on stream bit depth
+ shift = 0;
+ for (int i = 0; i < 3; ++i) {
+ const int is_uv = i > 0;
+ abc[i] = aom_highbd_ssim2(source->buffers[i], dest->buffers[i],
+ source->strides[is_uv], dest->strides[is_uv],
+ source->crop_widths[is_uv],
+ source->crop_heights[is_uv], bd, shift);
+ }
+
+ weight[1] = 1;
+ fast_ssim[1] = abc[0] * .8 + .1 * (abc[1] + abc[2]);
+ }
+}
+#endif // CONFIG_INTERNAL_STATS
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#if CONFIG_INTERNAL_STATS
+void aom_calc_ssim(const YV12_BUFFER_CONFIG *orig,
+ const YV12_BUFFER_CONFIG *recon, const uint32_t bit_depth,
+ const uint32_t in_bit_depth, int is_hbd, double *weight,
+ double *frame_ssim2) {
+#if CONFIG_AV1_HIGHBITDEPTH
+ if (is_hbd) {
+ aom_highbd_calc_ssim(orig, recon, weight, bit_depth, in_bit_depth,
+ frame_ssim2);
+ return;
+ }
+#else
+ (void)bit_depth;
+ (void)in_bit_depth;
+ (void)is_hbd;
+#endif // CONFIG_AV1_HIGHBITDEPTH
+ aom_lowbd_calc_ssim(orig, recon, weight, frame_ssim2);
+}
+#endif // CONFIG_INTERNAL_STATS
diff --git a/third_party/aom/aom_dsp/ssim.h b/third_party/aom/aom_dsp/ssim.h
new file mode 100644
index 0000000000..fb92556a8c
--- /dev/null
+++ b/third_party/aom/aom_dsp/ssim.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_SSIM_H_
+#define AOM_AOM_DSP_SSIM_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "config/aom_config.h"
+
+#if CONFIG_INTERNAL_STATS
+#include "aom_scale/yv12config.h"
+
+// metrics used for calculating ssim, ssim2, dssim, and ssimc
+typedef struct {
+ // source sum ( over 8x8 region )
+ uint32_t sum_s;
+
+ // reference sum (over 8x8 region )
+ uint32_t sum_r;
+
+ // source sum squared ( over 8x8 region )
+ uint32_t sum_sq_s;
+
+ // reference sum squared (over 8x8 region )
+ uint32_t sum_sq_r;
+
+ // sum of source times reference (over 8x8 region)
+ uint32_t sum_sxr;
+
+ // calculated ssim score between source and reference
+ double ssim;
+} Ssimv;
+
+// metrics collected on a frame basis
+typedef struct {
+ // ssim consistency error metric ( see code for explanation )
+ double ssimc;
+
+ // standard ssim
+ double ssim;
+
+ // revised ssim ( see code for explanation)
+ double ssim2;
+
+ // ssim restated as an error metric like sse
+ double dssim;
+
+ // dssim converted to decibels
+ double dssimd;
+
+ // ssimc converted to decibels
+ double ssimcd;
+} Metrics;
+
+double aom_get_ssim_metrics(uint8_t *img1, int img1_pitch, uint8_t *img2,
+ int img2_pitch, int width, int height, Ssimv *sv2,
+ Metrics *m, int do_inconsistency);
+
+void aom_lowbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *weight,
+ double *fast_ssim);
+
+double aom_calc_fastssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *ssim_y,
+ double *ssim_u, double *ssim_v, uint32_t bd,
+ uint32_t in_bd);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *dest, double *weight,
+ uint32_t bd, uint32_t in_bd, double *fast_ssim);
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+void aom_calc_ssim(const YV12_BUFFER_CONFIG *orig,
+ const YV12_BUFFER_CONFIG *recon, const uint32_t bit_depth,
+ const uint32_t in_bit_depth, int is_hbd, double *weight,
+ double *frame_ssim2);
+#endif // CONFIG_INTERNAL_STATS
+
+double aom_ssim2(const uint8_t *img1, const uint8_t *img2, int stride_img1,
+ int stride_img2, int width, int height);
+
+#if CONFIG_AV1_HIGHBITDEPTH
+double aom_highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
+ int stride_img1, int stride_img2, int width, int height,
+ uint32_t bd, uint32_t shift);
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_SSIM_H_
diff --git a/third_party/aom/aom_dsp/subtract.c b/third_party/aom/aom_dsp/subtract.c
new file mode 100644
index 0000000000..4f47e553d4
--- /dev/null
+++ b/third_party/aom/aom_dsp/subtract.c
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+void aom_subtract_block_c(int rows, int cols, int16_t *diff,
+ ptrdiff_t diff_stride, const uint8_t *src,
+ ptrdiff_t src_stride, const uint8_t *pred,
+ ptrdiff_t pred_stride) {
+ int r, c;
+
+ for (r = 0; r < rows; r++) {
+ for (c = 0; c < cols; c++) diff[c] = src[c] - pred[c];
+
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_subtract_block_c(int rows, int cols, int16_t *diff,
+ ptrdiff_t diff_stride, const uint8_t *src8,
+ ptrdiff_t src_stride, const uint8_t *pred8,
+ ptrdiff_t pred_stride) {
+ int r, c;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+
+ for (r = 0; r < rows; r++) {
+ for (c = 0; c < cols; c++) {
+ diff[c] = src[c] - pred[c];
+ }
+
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ }
+}
+#endif
diff --git a/third_party/aom/aom_dsp/sum_squares.c b/third_party/aom/aom_dsp/sum_squares.c
new file mode 100644
index 0000000000..f58defaa11
--- /dev/null
+++ b/third_party/aom/aom_dsp/sum_squares.c
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+uint64_t aom_sum_squares_2d_i16_c(const int16_t *src, int src_stride, int width,
+ int height) {
+ int r, c;
+ uint64_t ss = 0;
+
+ for (r = 0; r < height; r++) {
+ for (c = 0; c < width; c++) {
+ const int16_t v = src[c];
+ ss += v * v;
+ }
+ src += src_stride;
+ }
+
+ return ss;
+}
+
+uint64_t aom_sum_squares_i16_c(const int16_t *src, uint32_t n) {
+ uint64_t ss = 0;
+ do {
+ const int16_t v = *src++;
+ ss += v * v;
+ } while (--n);
+
+ return ss;
+}
+
+uint64_t aom_var_2d_u8_c(uint8_t *src, int src_stride, int width, int height) {
+ int r, c;
+ uint64_t ss = 0, s = 0;
+
+ for (r = 0; r < height; r++) {
+ for (c = 0; c < width; c++) {
+ const uint8_t v = src[c];
+ ss += v * v;
+ s += v;
+ }
+ src += src_stride;
+ }
+
+ return (ss - s * s / (width * height));
+}
+
+uint64_t aom_var_2d_u16_c(uint8_t *src, int src_stride, int width, int height) {
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ int r, c;
+ uint64_t ss = 0, s = 0;
+
+ for (r = 0; r < height; r++) {
+ for (c = 0; c < width; c++) {
+ const uint16_t v = srcp[c];
+ ss += v * v;
+ s += v;
+ }
+ srcp += src_stride;
+ }
+
+ return (ss - s * s / (width * height));
+}
+
+uint64_t aom_sum_sse_2d_i16_c(const int16_t *src, int src_stride, int width,
+ int height, int *sum) {
+ int r, c;
+ int16_t *srcp = (int16_t *)src;
+ int64_t ss = 0;
+
+ for (r = 0; r < height; r++) {
+ for (c = 0; c < width; c++) {
+ const int16_t v = srcp[c];
+ ss += v * v;
+ *sum += v;
+ }
+ srcp += src_stride;
+ }
+ return ss;
+}
diff --git a/third_party/aom/aom_dsp/txfm_common.h b/third_party/aom/aom_dsp/txfm_common.h
new file mode 100644
index 0000000000..67d9e90ca9
--- /dev/null
+++ b/third_party/aom/aom_dsp/txfm_common.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_TXFM_COMMON_H_
+#define AOM_AOM_DSP_TXFM_COMMON_H_
+
+#include "aom_dsp/aom_dsp_common.h"
+
+// Constants and Macros used by all idct/dct functions
+#define DCT_CONST_BITS 14
+#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))
+
+#define UNIT_QUANT_SHIFT 2
+#define UNIT_QUANT_FACTOR (1 << UNIT_QUANT_SHIFT)
+
+// block transform size
+enum {
+ TX_4X4, // 4x4 transform
+ TX_8X8, // 8x8 transform
+ TX_16X16, // 16x16 transform
+ TX_32X32, // 32x32 transform
+ TX_64X64, // 64x64 transform
+ TX_4X8, // 4x8 transform
+ TX_8X4, // 8x4 transform
+ TX_8X16, // 8x16 transform
+ TX_16X8, // 16x8 transform
+ TX_16X32, // 16x32 transform
+ TX_32X16, // 32x16 transform
+ TX_32X64, // 32x64 transform
+ TX_64X32, // 64x32 transform
+ TX_4X16, // 4x16 transform
+ TX_16X4, // 16x4 transform
+ TX_8X32, // 8x32 transform
+ TX_32X8, // 32x8 transform
+ TX_16X64, // 16x64 transform
+ TX_64X16, // 64x16 transform
+ TX_SIZES_ALL, // Includes rectangular transforms
+ TX_SIZES = TX_4X8, // Does NOT include rectangular transforms
+ TX_SIZES_LARGEST = TX_64X64,
+ TX_INVALID = 255 // Invalid transform size
+} UENUM1BYTE(TX_SIZE);
+
+enum {
+ DCT_DCT, // DCT in both horizontal and vertical
+ ADST_DCT, // ADST in vertical, DCT in horizontal
+ DCT_ADST, // DCT in vertical, ADST in horizontal
+ ADST_ADST, // ADST in both directions
+ FLIPADST_DCT, // FLIPADST in vertical, DCT in horizontal
+ DCT_FLIPADST, // DCT in vertical, FLIPADST in horizontal
+ FLIPADST_FLIPADST, // FLIPADST in both directions
+ ADST_FLIPADST, // ADST in vertical, FLIPADST in horizontal
+ FLIPADST_ADST, // FLIPADST in vertical, ADST in horizontal
+ IDTX, // Identity in both directions
+ V_DCT, // DCT in vertical, identity in horizontal
+ H_DCT, // Identity in vertical, DCT in horizontal
+ V_ADST, // ADST in vertical, identity in horizontal
+ H_ADST, // Identity in vertical, ADST in horizontal
+ V_FLIPADST, // FLIPADST in vertical, identity in horizontal
+ H_FLIPADST, // Identity in vertical, FLIPADST in horizontal
+ TX_TYPES,
+ DCT_ADST_TX_MASK = 0x000F, // Either DCT or ADST in each direction
+ TX_TYPE_INVALID = 255, // Invalid transform type
+} UENUM1BYTE(TX_TYPE);
+
+enum {
+ // DCT only
+ EXT_TX_SET_DCTONLY,
+ // DCT + Identity only
+ EXT_TX_SET_DCT_IDTX,
+ // Discrete Trig transforms w/o flip (4) + Identity (1)
+ EXT_TX_SET_DTT4_IDTX,
+ // Discrete Trig transforms w/o flip (4) + Identity (1) + 1D Hor/vert DCT (2)
+ EXT_TX_SET_DTT4_IDTX_1DDCT,
+ // Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver DCT (2)
+ EXT_TX_SET_DTT9_IDTX_1DDCT,
+ // Discrete Trig transforms w/ flip (9) + Identity (1) + 1D Hor/Ver (6)
+ EXT_TX_SET_ALL16,
+ EXT_TX_SET_TYPES
+} UENUM1BYTE(TxSetType);
+
+typedef struct txfm_param {
+ // for both forward and inverse transforms
+ TX_TYPE tx_type;
+ TX_SIZE tx_size;
+ int lossless;
+ int bd;
+ // are the pixel buffers octets or shorts? This should collapse to
+ // bd==8 implies !is_hbd, but that's not certain right now.
+ int is_hbd;
+ TxSetType tx_set_type;
+ // for inverse transforms only
+ int eob;
+} TxfmParam;
+
+// Constants:
+// for (int i = 1; i< 32; ++i)
+// printf("static const int cospi_%d_64 = %.0f;\n", i,
+// round(16384 * cos(i*PI/64)));
+// Note: sin(k*Pi/64) = cos((32-k)*Pi/64)
+static const tran_high_t cospi_1_64 = 16364;
+static const tran_high_t cospi_2_64 = 16305;
+static const tran_high_t cospi_3_64 = 16207;
+static const tran_high_t cospi_4_64 = 16069;
+static const tran_high_t cospi_5_64 = 15893;
+static const tran_high_t cospi_6_64 = 15679;
+static const tran_high_t cospi_7_64 = 15426;
+static const tran_high_t cospi_8_64 = 15137;
+static const tran_high_t cospi_9_64 = 14811;
+static const tran_high_t cospi_10_64 = 14449;
+static const tran_high_t cospi_11_64 = 14053;
+static const tran_high_t cospi_12_64 = 13623;
+static const tran_high_t cospi_13_64 = 13160;
+static const tran_high_t cospi_14_64 = 12665;
+static const tran_high_t cospi_15_64 = 12140;
+static const tran_high_t cospi_16_64 = 11585;
+static const tran_high_t cospi_17_64 = 11003;
+static const tran_high_t cospi_18_64 = 10394;
+static const tran_high_t cospi_19_64 = 9760;
+static const tran_high_t cospi_20_64 = 9102;
+static const tran_high_t cospi_21_64 = 8423;
+static const tran_high_t cospi_22_64 = 7723;
+static const tran_high_t cospi_23_64 = 7005;
+static const tran_high_t cospi_24_64 = 6270;
+static const tran_high_t cospi_25_64 = 5520;
+static const tran_high_t cospi_26_64 = 4756;
+static const tran_high_t cospi_27_64 = 3981;
+static const tran_high_t cospi_28_64 = 3196;
+static const tran_high_t cospi_29_64 = 2404;
+static const tran_high_t cospi_30_64 = 1606;
+static const tran_high_t cospi_31_64 = 804;
+
+// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3
+static const tran_high_t sinpi_1_9 = 5283;
+static const tran_high_t sinpi_2_9 = 9929;
+static const tran_high_t sinpi_3_9 = 13377;
+static const tran_high_t sinpi_4_9 = 15212;
+
+// 16384 * sqrt(2)
+static const tran_high_t Sqrt2 = 23170;
+static const tran_high_t InvSqrt2 = 11585;
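+
+// Spot check: cospi_16_64 = round(16384 * cos(16 * pi / 64))
+// = round(16384 * 0.7071068) = 11585, which equals InvSqrt2 (16384 / sqrt(2)),
+// while Sqrt2 = round(16384 * sqrt(2)) = 23170.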
+
+static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
+ tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+ return rv;
+}
+
+#endif // AOM_AOM_DSP_TXFM_COMMON_H_
diff --git a/third_party/aom/aom_dsp/variance.c b/third_party/aom/aom_dsp/variance.c
new file mode 100644
index 0000000000..f02c3077ae
--- /dev/null
+++ b/third_party/aom/aom_dsp/variance.c
@@ -0,0 +1,1234 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/variance.h"
+
+#include "av1/common/filter.h"
+#include "av1/common/reconinter.h"
+
+uint32_t aom_get_mb_ss_c(const int16_t *a) {
+ unsigned int i, sum = 0;
+
+ for (i = 0; i < 256; ++i) {
+ sum += a[i] * a[i];
+ }
+
+ return sum;
+}
+
+static void variance(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int w, int h, uint32_t *sse, int *sum) {
+ int i, j;
+
+ *sum = 0;
+ *sse = 0;
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int diff = a[j] - b[j];
+ *sum += diff;
+ *sse += diff * diff;
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+}
+
+uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int w, int h) {
+ uint32_t sse;
+ int sum;
+ variance(a, a_stride, b, b_stride, w, h, &sse, &sum);
+ return sse;
+}
+
+// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
+// or vertical direction to produce the filtered output block. Used to implement
+// the first-pass of 2-D separable filter.
+//
+// Produces int16_t output to retain precision for the next pass. Two filter
+// taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
+// applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
+// It defines the offset required to move from one input to the next.
+void aom_var_filter_block2d_bil_first_pass_c(const uint8_t *a, uint16_t *b,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const uint8_t *filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ b[j] = ROUND_POWER_OF_TWO(
+ (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
+
+ ++a;
+ }
+
+ a += src_pixels_per_line - output_width;
+ b += output_width;
+ }
+}
+
+// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
+// or vertical direction to produce the filtered output block. Used to implement
+// the second-pass of 2-D separable filter.
+//
+// Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two
+// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
+// filter is applied horizontally (pixel_step = 1) or vertically
+// (pixel_step = stride). It defines the offset required to move from one input
+// to the next. Output is 8-bit.
+void aom_var_filter_block2d_bil_second_pass_c(const uint16_t *a, uint8_t *b,
+ unsigned int src_pixels_per_line,
+ unsigned int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const uint8_t *filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ b[j] = ROUND_POWER_OF_TWO(
+ (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1], FILTER_BITS);
+ ++a;
+ }
+
+ a += src_pixels_per_line - output_width;
+ b += output_width;
+ }
+}
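+
+// The two passes are applied back to back: the horizontal pass (pixel_step 1)
+// produces an (H + 1) x W intermediate block so that the vertical pass
+// (pixel_step = W) has the extra row its 2-tap filter needs, as in the
+// SUBPIX_VAR() macros below.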
+
+#define VAR(W, H) \
+ uint32_t aom_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \
+ }
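+
+// VAR() returns sse - sum^2 / (W * H), i.e. W * H times the (biased) variance
+// of the per-pixel differences. E.g. for two differences {1, 3}: sse = 10,
+// sum = 4, so 10 - 16 / 2 = 2, matching (1 - 2)^2 + (3 - 2)^2.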
+
+#define SUBPIX_VAR(W, H) \
+ uint32_t aom_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ \
+ aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
+ bilinear_filters_2t[xoffset]); \
+ aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
+ bilinear_filters_2t[yoffset]); \
+ \
+ return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \
+ }
+
+#define SUBPIX_AVG_VAR(W, H) \
+ uint32_t aom_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
+ \
+ aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
+ bilinear_filters_2t[xoffset]); \
+ aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
+ bilinear_filters_2t[yoffset]); \
+ \
+ aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
+ \
+ return aom_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \
+ } \
+ uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
+ \
+ aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \
+ bilinear_filters_2t[xoffset]); \
+ aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
+ bilinear_filters_2t[yoffset]); \
+ \
+ aom_dist_wtd_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param); \
+ \
+ return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \
+ }
+
+void aom_get_var_sse_sum_8x8_quad_c(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse8x8, int *sum8x8,
+ unsigned int *tot_sse, int *tot_sum,
+ uint32_t *var8x8) {
+  // Loop over four horizontally adjacent 8x8 blocks, i.e. one 32x8 region.
+ for (int k = 0; k < 4; k++) {
+ variance(a + (k * 8), a_stride, b + (k * 8), b_stride, 8, 8, &sse8x8[k],
+ &sum8x8[k]);
+ }
+
+  // Calculate per-8x8 variance and the total sse and sum of the 32x8 region.
+ *tot_sse += sse8x8[0] + sse8x8[1] + sse8x8[2] + sse8x8[3];
+ *tot_sum += sum8x8[0] + sum8x8[1] + sum8x8[2] + sum8x8[3];
+ for (int i = 0; i < 4; i++)
+ var8x8[i] = sse8x8[i] - (uint32_t)(((int64_t)sum8x8[i] * sum8x8[i]) >> 6);
+}
+
+void aom_get_var_sse_sum_16x16_dual_c(const uint8_t *src_ptr, int source_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ uint32_t *sse16x16, unsigned int *tot_sse,
+ int *tot_sum, uint32_t *var16x16) {
+ int sum16x16[2] = { 0 };
+  // Loop over two horizontally adjacent 16x16 blocks, i.e. one 32x16 region.
+ for (int k = 0; k < 2; k++) {
+ variance(src_ptr + (k * 16), source_stride, ref_ptr + (k * 16), ref_stride,
+ 16, 16, &sse16x16[k], &sum16x16[k]);
+ }
+
+  // Calculate per-16x16 variance and the total sse and sum of the 32x16
+  // region.
+ *tot_sse += sse16x16[0] + sse16x16[1];
+ *tot_sum += sum16x16[0] + sum16x16[1];
+ for (int i = 0; i < 2; i++)
+ var16x16[i] =
+ sse16x16[i] - (uint32_t)(((int64_t)sum16x16[i] * sum16x16[i]) >> 8);
+}
+
+/* Identical to the variance call except it does not calculate the
+ * sse - sum^2 / (w*h) term and returns sse in addition to modifying the
+ * passed-in variable.
+ */
+#define MSE(W, H) \
+ uint32_t aom_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ return *sse; \
+ }
+
+/* All three forms of the variance are available in the same sizes. */
+#define VARIANCES(W, H) \
+ VAR(W, H) \
+ SUBPIX_VAR(W, H) \
+ SUBPIX_AVG_VAR(W, H)
+
+VARIANCES(128, 128)
+VARIANCES(128, 64)
+VARIANCES(64, 128)
+VARIANCES(64, 64)
+VARIANCES(64, 32)
+VARIANCES(32, 64)
+VARIANCES(32, 32)
+VARIANCES(32, 16)
+VARIANCES(16, 32)
+VARIANCES(16, 16)
+VARIANCES(16, 8)
+VARIANCES(8, 16)
+VARIANCES(8, 8)
+VARIANCES(8, 4)
+VARIANCES(4, 8)
+VARIANCES(4, 4)
+
+// Realtime mode doesn't use 4x rectangular blocks.
+#if !CONFIG_REALTIME_ONLY
+VARIANCES(4, 16)
+VARIANCES(16, 4)
+VARIANCES(8, 32)
+VARIANCES(32, 8)
+VARIANCES(16, 64)
+VARIANCES(64, 16)
+#endif
+
+MSE(16, 16)
+MSE(16, 8)
+MSE(8, 16)
+MSE(8, 8)
+
+void aom_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride) {
+ int i, j;
+
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int tmp = pred[j] + ref[j];
+ comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+}
+
+void aom_dist_wtd_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, const uint8_t *ref,
+ int ref_stride,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ int i, j;
+ const int fwd_offset = jcp_param->fwd_offset;
+ const int bck_offset = jcp_param->bck_offset;
+
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
+ tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
+ comp_pred[j] = (uint8_t)tmp;
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+}
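The distance-weighted average applies unequal weights to the two predictors; the two offsets are expected to sum to (1 << DIST_PRECISION_BITS), which I assume here to be 16 with 9/7 as one of the standard AV1 weight pairs. A worked scalar sketch of a single sample (the example_ helper is hypothetical, not upstream code):

// Sketch only: one distance-weighted sample. Assuming DIST_PRECISION_BITS is
// 4 and the weights sum to 16, then with fwd_offset = 9, bck_offset = 7,
// ref = 100 and pred = 60:
//   (100 * 9 + 60 * 7 + 8) >> 4 == (900 + 420 + 8) >> 4 == 83
static INLINE uint8_t example_dist_wtd_sample(uint8_t pred_px, uint8_t ref_px,
                                              int fwd_offset, int bck_offset) {
  const int tmp = pred_px * bck_offset + ref_px * fwd_offset;
  return (uint8_t)ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
}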
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static void highbd_variance64(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int w, int h,
+ uint64_t *sse, int64_t *sum) {
+ const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ int64_t tsum = 0;
+ uint64_t tsse = 0;
+ for (int i = 0; i < h; ++i) {
+ int32_t lsum = 0;
+ for (int j = 0; j < w; ++j) {
+ const int diff = a[j] - b[j];
+ lsum += diff;
+ tsse += (uint32_t)(diff * diff);
+ }
+ tsum += lsum;
+ a += a_stride;
+ b += b_stride;
+ }
+ *sum = tsum;
+ *sse = tsse;
+}
+
+uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, int w, int h) {
+ uint64_t sse;
+ int64_t sum;
+ highbd_variance64(a, a_stride, b, b_stride, w, h, &sse, &sum);
+ return sse;
+}
+
+static void highbd_8_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int w, int h,
+ uint32_t *sse, int *sum) {
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+ highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+ *sse = (uint32_t)sse_long;
+ *sum = (int)sum_long;
+}
+
+static void highbd_10_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int w, int h,
+ uint32_t *sse, int *sum) {
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+ highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
+ *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
+}
+
+static void highbd_12_variance(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int w, int h,
+ uint32_t *sse, int *sum) {
+ uint64_t sse_long = 0;
+ int64_t sum_long = 0;
+ highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
+ *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
+}
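The 10- and 12-bit helpers above rescale sse and sum back to 8-bit units before the 32-bit variance arithmetic: at bit depth d each pixel difference carries an extra factor of 2^(d - 8), so sum is shifted down by (d - 8) and sse by 2 * (d - 8). A sketch that generalizes the three helpers (editorial, not part of the patch; the example_ name is hypothetical):

// Sketch only: bit-depth normalization used by highbd_{8,10,12}_variance.
static INLINE void example_normalize_highbd(uint64_t sse_long, int64_t sum_long,
                                            int bit_depth, uint32_t *sse,
                                            int *sum) {
  const int shift = bit_depth - 8;  // 0, 2 or 4 for 8-, 10- or 12-bit input
  if (shift == 0) {
    *sse = (uint32_t)sse_long;
    *sum = (int)sum_long;
  } else {
    *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 2 * shift);
    *sum = (int)ROUND_POWER_OF_TWO(sum_long, shift);
  }
}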
+
+#define HIGHBD_VAR(W, H) \
+ uint32_t aom_highbd_8_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \
+ } \
+ \
+ uint32_t aom_highbd_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t aom_highbd_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+#define HIGHBD_MSE(W, H) \
+ uint32_t aom_highbd_8_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+ return *sse; \
+ } \
+ \
+ uint32_t aom_highbd_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+ return *sse; \
+ } \
+ \
+ uint32_t aom_highbd_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+ const uint8_t *ref, int ref_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+ return *sse; \
+ }
+
+void aom_highbd_var_filter_block2d_bil_first_pass(
+ const uint8_t *src_ptr8, uint16_t *output_ptr,
+ unsigned int src_pixels_per_line, int pixel_step,
+ unsigned int output_height, unsigned int output_width,
+ const uint8_t *filter) {
+ unsigned int i, j;
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ output_ptr[j] = ROUND_POWER_OF_TWO(
+ (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
+ FILTER_BITS);
+
+ ++src_ptr;
+ }
+
+ // Next row...
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+
+void aom_highbd_var_filter_block2d_bil_second_pass(
+ const uint16_t *src_ptr, uint16_t *output_ptr,
+ unsigned int src_pixels_per_line, unsigned int pixel_step,
+ unsigned int output_height, unsigned int output_width,
+ const uint8_t *filter) {
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; ++j) {
+ output_ptr[j] = ROUND_POWER_OF_TWO(
+ (int)src_ptr[0] * filter[0] + (int)src_ptr[pixel_step] * filter[1],
+ FILTER_BITS);
+ ++src_ptr;
+ }
+
+ src_ptr += src_pixels_per_line - output_width;
+ output_ptr += output_width;
+ }
+}
+
+#define HIGHBD_SUBPIX_VAR(W, H) \
+ uint32_t aom_highbd_8_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_12_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, \
+ dst, dst_stride, sse); \
+ }
+
+#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
+ uint32_t aom_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
+ \
+ return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
+ \
+ return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_comp_avg_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W); \
+ \
+ return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ dst, dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_8_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
+ W, H, CONVERT_TO_BYTEPTR(temp2), W, \
+ jcp_param); \
+ \
+ return aom_highbd_8_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+ dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_10_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
+ W, H, CONVERT_TO_BYTEPTR(temp2), W, \
+ jcp_param); \
+ \
+ return aom_highbd_10_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+ dst_stride, sse); \
+ } \
+ \
+ uint32_t aom_highbd_12_dist_wtd_sub_pixel_avg_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_dist_wtd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, \
+ W, H, CONVERT_TO_BYTEPTR(temp2), W, \
+ jcp_param); \
+ \
+ return aom_highbd_12_variance##W##x##H(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+ dst_stride, sse); \
+ }
+
+/* All three forms of the variance are available in the same sizes. */
+#define HIGHBD_VARIANCES(W, H) \
+ HIGHBD_VAR(W, H) \
+ HIGHBD_SUBPIX_VAR(W, H) \
+ HIGHBD_SUBPIX_AVG_VAR(W, H)
+
+HIGHBD_VARIANCES(128, 128)
+HIGHBD_VARIANCES(128, 64)
+HIGHBD_VARIANCES(64, 128)
+HIGHBD_VARIANCES(64, 64)
+HIGHBD_VARIANCES(64, 32)
+HIGHBD_VARIANCES(32, 64)
+HIGHBD_VARIANCES(32, 32)
+HIGHBD_VARIANCES(32, 16)
+HIGHBD_VARIANCES(16, 32)
+HIGHBD_VARIANCES(16, 16)
+HIGHBD_VARIANCES(16, 8)
+HIGHBD_VARIANCES(8, 16)
+HIGHBD_VARIANCES(8, 8)
+HIGHBD_VARIANCES(8, 4)
+HIGHBD_VARIANCES(4, 8)
+HIGHBD_VARIANCES(4, 4)
+
+// Realtime mode doesn't use 4x rectangular blocks.
+#if !CONFIG_REALTIME_ONLY
+HIGHBD_VARIANCES(4, 16)
+HIGHBD_VARIANCES(16, 4)
+HIGHBD_VARIANCES(8, 32)
+HIGHBD_VARIANCES(32, 8)
+HIGHBD_VARIANCES(16, 64)
+HIGHBD_VARIANCES(64, 16)
+#endif
+
+HIGHBD_MSE(16, 16)
+HIGHBD_MSE(16, 8)
+HIGHBD_MSE(8, 16)
+HIGHBD_MSE(8, 8)
+
+void aom_highbd_comp_avg_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride) {
+ int i, j;
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ const int tmp = pred[j] + ref[j];
+ comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+}
+
+void aom_highbd_dist_wtd_comp_avg_pred_c(
+ uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
+ const uint8_t *ref8, int ref_stride,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ int i, j;
+ const int fwd_offset = jcp_param->fwd_offset;
+ const int bck_offset = jcp_param->bck_offset;
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ int tmp = pred[j] * bck_offset + ref[j] * fwd_offset;
+ tmp = ROUND_POWER_OF_TWO(tmp, DIST_PRECISION_BITS);
+ comp_pred[j] = (uint16_t)tmp;
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+void aom_comp_mask_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride,
+ const uint8_t *mask, int mask_stride,
+ int invert_mask) {
+ int i, j;
+ const uint8_t *src0 = invert_mask ? pred : ref;
+ const uint8_t *src1 = invert_mask ? ref : pred;
+ const int stride0 = invert_mask ? width : ref_stride;
+ const int stride1 = invert_mask ? ref_stride : width;
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ comp_pred[j] = AOM_BLEND_A64(mask[j], src0[j], src1[j]);
+ }
+ comp_pred += width;
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ }
+}
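aom_comp_mask_pred_c blends the two sources with a 6-bit per-pixel mask via AOM_BLEND_A64 from aom_dsp/blend.h. A worked scalar sketch, assuming that macro is ROUND_POWER_OF_TWO(m * a + (64 - m) * b, 6) (the example_ helper is illustrative only):

// Sketch only: scalar form of the 6-bit mask blend. With m = 48, a = 200,
// b = 40: (48 * 200 + 16 * 40 + 32) >> 6 == 10272 >> 6 == 160.
static INLINE uint8_t example_blend_a64(int m, uint8_t a, uint8_t b) {
  return (uint8_t)ROUND_POWER_OF_TWO(m * a + (64 - m) * b, 6);
}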
+
+#define MASK_SUBPIX_VAR(W, H) \
+ unsigned int aom_masked_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
+ \
+ aom_var_filter_block2d_bil_first_pass_c(src, fdata3, src_stride, 1, H + 1, \
+ W, bilinear_filters_2t[xoffset]); \
+ aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
+ bilinear_filters_2t[yoffset]); \
+ \
+ aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \
+ invert_mask); \
+ return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse); \
+ }
+
+MASK_SUBPIX_VAR(4, 4)
+MASK_SUBPIX_VAR(4, 8)
+MASK_SUBPIX_VAR(8, 4)
+MASK_SUBPIX_VAR(8, 8)
+MASK_SUBPIX_VAR(8, 16)
+MASK_SUBPIX_VAR(16, 8)
+MASK_SUBPIX_VAR(16, 16)
+MASK_SUBPIX_VAR(16, 32)
+MASK_SUBPIX_VAR(32, 16)
+MASK_SUBPIX_VAR(32, 32)
+MASK_SUBPIX_VAR(32, 64)
+MASK_SUBPIX_VAR(64, 32)
+MASK_SUBPIX_VAR(64, 64)
+MASK_SUBPIX_VAR(64, 128)
+MASK_SUBPIX_VAR(128, 64)
+MASK_SUBPIX_VAR(128, 128)
+
+// Realtime mode doesn't use 4x rectangular blocks.
+#if !CONFIG_REALTIME_ONLY
+MASK_SUBPIX_VAR(4, 16)
+MASK_SUBPIX_VAR(16, 4)
+MASK_SUBPIX_VAR(8, 32)
+MASK_SUBPIX_VAR(32, 8)
+MASK_SUBPIX_VAR(16, 64)
+MASK_SUBPIX_VAR(64, 16)
+#endif
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_comp_mask_pred_c(uint8_t *comp_pred8, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride, const uint8_t *mask,
+ int mask_stride, int invert_mask) {
+ int i, j;
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < width; ++j) {
+ if (!invert_mask)
+ comp_pred[j] = AOM_BLEND_A64(mask[j], ref[j], pred[j]);
+ else
+ comp_pred[j] = AOM_BLEND_A64(mask[j], pred[j], ref[j]);
+ }
+ comp_pred += width;
+ pred += width;
+ ref += ref_stride;
+ mask += mask_stride;
+ }
+}
+
+#define HIGHBD_MASK_SUBPIX_VAR(W, H) \
+ unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
+ invert_mask); \
+ \
+ return aom_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ ref, ref_stride, sse); \
+ } \
+ \
+ unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
+ invert_mask); \
+ \
+ return aom_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ ref, ref_stride, sse); \
+ } \
+ \
+ unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ src, fdata3, src_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_highbd_comp_mask_pred_c(CONVERT_TO_BYTEPTR(temp3), second_pred, W, H, \
+ CONVERT_TO_BYTEPTR(temp2), W, msk, msk_stride, \
+ invert_mask); \
+ \
+ return aom_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, \
+ ref, ref_stride, sse); \
+ }
+
+HIGHBD_MASK_SUBPIX_VAR(4, 4)
+HIGHBD_MASK_SUBPIX_VAR(4, 8)
+HIGHBD_MASK_SUBPIX_VAR(8, 4)
+HIGHBD_MASK_SUBPIX_VAR(8, 8)
+HIGHBD_MASK_SUBPIX_VAR(8, 16)
+HIGHBD_MASK_SUBPIX_VAR(16, 8)
+HIGHBD_MASK_SUBPIX_VAR(16, 16)
+HIGHBD_MASK_SUBPIX_VAR(16, 32)
+HIGHBD_MASK_SUBPIX_VAR(32, 16)
+HIGHBD_MASK_SUBPIX_VAR(32, 32)
+HIGHBD_MASK_SUBPIX_VAR(32, 64)
+HIGHBD_MASK_SUBPIX_VAR(64, 32)
+HIGHBD_MASK_SUBPIX_VAR(64, 64)
+HIGHBD_MASK_SUBPIX_VAR(64, 128)
+HIGHBD_MASK_SUBPIX_VAR(128, 64)
+HIGHBD_MASK_SUBPIX_VAR(128, 128)
+#if !CONFIG_REALTIME_ONLY
+HIGHBD_MASK_SUBPIX_VAR(4, 16)
+HIGHBD_MASK_SUBPIX_VAR(16, 4)
+HIGHBD_MASK_SUBPIX_VAR(8, 32)
+HIGHBD_MASK_SUBPIX_VAR(32, 8)
+HIGHBD_MASK_SUBPIX_VAR(16, 64)
+HIGHBD_MASK_SUBPIX_VAR(64, 16)
+#endif
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#if !CONFIG_REALTIME_ONLY
+static INLINE void obmc_variance(const uint8_t *pre, int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ int w, int h, unsigned int *sse, int *sum) {
+ int i, j;
+
+ *sse = 0;
+ *sum = 0;
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
+ *sum += diff;
+ *sse += diff * diff;
+ }
+
+ pre += pre_stride;
+ wsrc += w;
+ mask += w;
+ }
+}
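obmc_variance works on a pre-weighted source: wsrc and mask are assumed to be in Q12, i.e. scaled by the product of two 6-bit blend weights (64 * 64 = 4096), so the signed rounded shift by 12 brings the weighted difference back to pixel scale. A per-sample sketch under that assumption (the example_ helper is hypothetical):

// Sketch only: one OBMC residual sample, assuming wsrc and mask are scaled by
// 1 << 12 as produced by the AV1 OBMC target-weighting code.
static INLINE int example_obmc_diff(int32_t wsrc_px, int32_t mask_px,
                                    uint8_t pre_px) {
  return ROUND_POWER_OF_TWO_SIGNED(wsrc_px - pre_px * mask_px, 12);
}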
+
+#define OBMC_VAR(W, H) \
+ unsigned int aom_obmc_variance##W##x##H##_c( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
+ }
+
+#define OBMC_SUBPIX_VAR(W, H) \
+ unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ \
+ aom_var_filter_block2d_bil_first_pass_c(pre, fdata3, pre_stride, 1, H + 1, \
+ W, bilinear_filters_2t[xoffset]); \
+ aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \
+ bilinear_filters_2t[yoffset]); \
+ \
+ return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse); \
+ }
+
+OBMC_VAR(4, 4)
+OBMC_SUBPIX_VAR(4, 4)
+
+OBMC_VAR(4, 8)
+OBMC_SUBPIX_VAR(4, 8)
+
+OBMC_VAR(8, 4)
+OBMC_SUBPIX_VAR(8, 4)
+
+OBMC_VAR(8, 8)
+OBMC_SUBPIX_VAR(8, 8)
+
+OBMC_VAR(8, 16)
+OBMC_SUBPIX_VAR(8, 16)
+
+OBMC_VAR(16, 8)
+OBMC_SUBPIX_VAR(16, 8)
+
+OBMC_VAR(16, 16)
+OBMC_SUBPIX_VAR(16, 16)
+
+OBMC_VAR(16, 32)
+OBMC_SUBPIX_VAR(16, 32)
+
+OBMC_VAR(32, 16)
+OBMC_SUBPIX_VAR(32, 16)
+
+OBMC_VAR(32, 32)
+OBMC_SUBPIX_VAR(32, 32)
+
+OBMC_VAR(32, 64)
+OBMC_SUBPIX_VAR(32, 64)
+
+OBMC_VAR(64, 32)
+OBMC_SUBPIX_VAR(64, 32)
+
+OBMC_VAR(64, 64)
+OBMC_SUBPIX_VAR(64, 64)
+
+OBMC_VAR(64, 128)
+OBMC_SUBPIX_VAR(64, 128)
+
+OBMC_VAR(128, 64)
+OBMC_SUBPIX_VAR(128, 64)
+
+OBMC_VAR(128, 128)
+OBMC_SUBPIX_VAR(128, 128)
+
+OBMC_VAR(4, 16)
+OBMC_SUBPIX_VAR(4, 16)
+OBMC_VAR(16, 4)
+OBMC_SUBPIX_VAR(16, 4)
+OBMC_VAR(8, 32)
+OBMC_SUBPIX_VAR(8, 32)
+OBMC_VAR(32, 8)
+OBMC_SUBPIX_VAR(32, 8)
+OBMC_VAR(16, 64)
+OBMC_SUBPIX_VAR(16, 64)
+OBMC_VAR(64, 16)
+OBMC_SUBPIX_VAR(64, 16)
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int w, int h,
+ uint64_t *sse, int64_t *sum) {
+ int i, j;
+ uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+
+ *sse = 0;
+ *sum = 0;
+
+ for (i = 0; i < h; i++) {
+ for (j = 0; j < w; j++) {
+ int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
+ *sum += diff;
+ *sse += diff * diff;
+ }
+
+ pre += pre_stride;
+ wsrc += w;
+ mask += w;
+ }
+}
+
+static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int w, int h,
+ unsigned int *sse, int *sum) {
+ int64_t sum64;
+ uint64_t sse64;
+ highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
+ *sum = (int)sum64;
+ *sse = (unsigned int)sse64;
+}
+
+static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int w, int h,
+ unsigned int *sse, int *sum) {
+ int64_t sum64;
+ uint64_t sse64;
+ highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
+ *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
+}
+
+static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int w, int h,
+ unsigned int *sse, int *sum) {
+ int64_t sum64;
+ uint64_t sse64;
+ highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
+ *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
+}
+
+#define HIGHBD_OBMC_VAR(W, H) \
+ unsigned int aom_highbd_8_obmc_variance##W##x##H##_c( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
+ } \
+ \
+ unsigned int aom_highbd_10_obmc_variance##W##x##H##_c( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ unsigned int aom_highbd_12_obmc_variance##W##x##H##_c( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+#define HIGHBD_OBMC_SUBPIX_VAR(W, H) \
+ unsigned int aom_highbd_8_obmc_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_8_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
+ W, wsrc, mask, sse); \
+ } \
+ \
+ unsigned int aom_highbd_10_obmc_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
+ W, wsrc, mask, sse); \
+ } \
+ \
+ unsigned int aom_highbd_12_obmc_sub_pixel_variance##W##x##H##_c( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint16_t temp2[H * W]; \
+ \
+ aom_highbd_var_filter_block2d_bil_first_pass( \
+ pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_highbd_var_filter_block2d_bil_second_pass( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
+ W, wsrc, mask, sse); \
+ }
+
+HIGHBD_OBMC_VAR(4, 4)
+HIGHBD_OBMC_SUBPIX_VAR(4, 4)
+
+HIGHBD_OBMC_VAR(4, 8)
+HIGHBD_OBMC_SUBPIX_VAR(4, 8)
+
+HIGHBD_OBMC_VAR(8, 4)
+HIGHBD_OBMC_SUBPIX_VAR(8, 4)
+
+HIGHBD_OBMC_VAR(8, 8)
+HIGHBD_OBMC_SUBPIX_VAR(8, 8)
+
+HIGHBD_OBMC_VAR(8, 16)
+HIGHBD_OBMC_SUBPIX_VAR(8, 16)
+
+HIGHBD_OBMC_VAR(16, 8)
+HIGHBD_OBMC_SUBPIX_VAR(16, 8)
+
+HIGHBD_OBMC_VAR(16, 16)
+HIGHBD_OBMC_SUBPIX_VAR(16, 16)
+
+HIGHBD_OBMC_VAR(16, 32)
+HIGHBD_OBMC_SUBPIX_VAR(16, 32)
+
+HIGHBD_OBMC_VAR(32, 16)
+HIGHBD_OBMC_SUBPIX_VAR(32, 16)
+
+HIGHBD_OBMC_VAR(32, 32)
+HIGHBD_OBMC_SUBPIX_VAR(32, 32)
+
+HIGHBD_OBMC_VAR(32, 64)
+HIGHBD_OBMC_SUBPIX_VAR(32, 64)
+
+HIGHBD_OBMC_VAR(64, 32)
+HIGHBD_OBMC_SUBPIX_VAR(64, 32)
+
+HIGHBD_OBMC_VAR(64, 64)
+HIGHBD_OBMC_SUBPIX_VAR(64, 64)
+
+HIGHBD_OBMC_VAR(64, 128)
+HIGHBD_OBMC_SUBPIX_VAR(64, 128)
+
+HIGHBD_OBMC_VAR(128, 64)
+HIGHBD_OBMC_SUBPIX_VAR(128, 64)
+
+HIGHBD_OBMC_VAR(128, 128)
+HIGHBD_OBMC_SUBPIX_VAR(128, 128)
+
+HIGHBD_OBMC_VAR(4, 16)
+HIGHBD_OBMC_SUBPIX_VAR(4, 16)
+HIGHBD_OBMC_VAR(16, 4)
+HIGHBD_OBMC_SUBPIX_VAR(16, 4)
+HIGHBD_OBMC_VAR(8, 32)
+HIGHBD_OBMC_SUBPIX_VAR(8, 32)
+HIGHBD_OBMC_VAR(32, 8)
+HIGHBD_OBMC_SUBPIX_VAR(32, 8)
+HIGHBD_OBMC_VAR(16, 64)
+HIGHBD_OBMC_SUBPIX_VAR(16, 64)
+HIGHBD_OBMC_VAR(64, 16)
+HIGHBD_OBMC_SUBPIX_VAR(64, 16)
+#endif // CONFIG_AV1_HIGHBITDEPTH
+#endif // !CONFIG_REALTIME_ONLY
+
+uint64_t aom_mse_wxh_16bit_c(uint8_t *dst, int dstride, uint16_t *src,
+ int sstride, int w, int h) {
+ uint64_t sum = 0;
+ for (int i = 0; i < h; i++) {
+ for (int j = 0; j < w; j++) {
+ int e = (uint16_t)dst[i * dstride + j] - src[i * sstride + j];
+ sum += e * e;
+ }
+ }
+ return sum;
+}
+
+uint64_t aom_mse_16xh_16bit_c(uint8_t *dst, int dstride, uint16_t *src, int w,
+ int h) {
+ uint16_t *src_temp = src;
+ uint8_t *dst_temp = dst;
+ const int num_blks = 16 / w;
+ int64_t sum = 0;
+ for (int i = 0; i < num_blks; i++) {
+ sum += aom_mse_wxh_16bit_c(dst_temp, dstride, src_temp, w, w, h);
+ dst_temp += w;
+ src_temp += (w * h);
+ }
+ return sum;
+}
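aom_mse_16xh_16bit_c splits a 16-wide strip into 16 / w blocks of w x h: the 8-bit reconstruction is addressed with a common stride while the 16-bit source blocks are packed back to back (w * h samples apart). An illustrative call; the 8x8 split and the example_ wrapper are assumptions for the example, not upstream code:

// Sketch only: accumulate the MSE of a 16-wide strip as two packed 8x8
// source blocks measured against a strided 8-bit reconstruction.
static uint64_t example_mse_16x8_strip(uint8_t *recon, int recon_stride,
                                       uint16_t *packed_src) {
  return aom_mse_16xh_16bit_c(recon, recon_stride, packed_src, /*w=*/8,
                              /*h=*/8);
}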
+
+uint64_t aom_mse_wxh_16bit_highbd_c(uint16_t *dst, int dstride, uint16_t *src,
+ int sstride, int w, int h) {
+ uint64_t sum = 0;
+ for (int i = 0; i < h; i++) {
+ for (int j = 0; j < w; j++) {
+ int e = dst[i * dstride + j] - src[i * sstride + j];
+ sum += e * e;
+ }
+ }
+ return sum;
+}
diff --git a/third_party/aom/aom_dsp/variance.h b/third_party/aom/aom_dsp/variance.h
new file mode 100644
index 0000000000..6603d312b8
--- /dev/null
+++ b/third_party/aom/aom_dsp/variance.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_VARIANCE_H_
+#define AOM_AOM_DSP_VARIANCE_H_
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define FILTER_BITS 7
+#define FILTER_WEIGHT 128
+
+typedef unsigned int (*aom_sad_fn_t)(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride);
+
+typedef unsigned int (*aom_sad_avg_fn_t)(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ const uint8_t *second_pred);
+
+typedef void (*aom_copy32xn_fn_t)(const uint8_t *a, int a_stride, uint8_t *b,
+ int b_stride, int n);
+
+typedef void (*aom_sad_multi_d_fn_t)(const uint8_t *a, int a_stride,
+ const uint8_t *const b_array[],
+ int b_stride, unsigned int *sad_array);
+
+typedef unsigned int (*aom_variance_fn_t)(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse);
+
+typedef unsigned int (*aom_subpixvariance_fn_t)(const uint8_t *a, int a_stride,
+ int xoffset, int yoffset,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse);
+
+typedef unsigned int (*aom_subp_avg_variance_fn_t)(
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
+ int b_stride, unsigned int *sse, const uint8_t *second_pred);
+
+typedef unsigned int (*aom_dist_wtd_sad_avg_fn_t)(
+ const uint8_t *a, int a_stride, const uint8_t *b, int b_stride,
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param);
+
+typedef unsigned int (*aom_dist_wtd_subp_avg_variance_fn_t)(
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, const uint8_t *b,
+ int b_stride, unsigned int *sse, const uint8_t *second_pred,
+ const DIST_WTD_COMP_PARAMS *jcp_param);
+
+typedef unsigned int (*aom_masked_sad_fn_t)(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred,
+ const uint8_t *msk, int msk_stride,
+ int invert_mask);
+typedef unsigned int (*aom_masked_subpixvariance_fn_t)(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred,
+ const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse);
+
+typedef unsigned int (*aom_obmc_sad_fn_t)(const uint8_t *pred, int pred_stride,
+ const int32_t *wsrc,
+ const int32_t *msk);
+typedef unsigned int (*aom_obmc_variance_fn_t)(const uint8_t *pred,
+ int pred_stride,
+ const int32_t *wsrc,
+ const int32_t *msk,
+ unsigned int *sse);
+typedef unsigned int (*aom_obmc_subpixvariance_fn_t)(
+ const uint8_t *pred, int pred_stride, int xoffset, int yoffset,
+ const int32_t *wsrc, const int32_t *msk, unsigned int *sse);
+
+typedef struct aom_variance_vtable {
+ aom_sad_fn_t sdf;
+ // Same as normal sad, but downsample the rows by a factor of 2.
+ aom_sad_fn_t sdsf;
+ aom_sad_avg_fn_t sdaf;
+ aom_variance_fn_t vf;
+ aom_subpixvariance_fn_t svf;
+ aom_subp_avg_variance_fn_t svaf;
+ aom_sad_multi_d_fn_t sdx4df;
+ aom_sad_multi_d_fn_t sdx3df;
+ // Same as sadx4, but downsample the rows by a factor of 2.
+ aom_sad_multi_d_fn_t sdsx4df;
+ aom_masked_sad_fn_t msdf;
+ aom_masked_subpixvariance_fn_t msvf;
+ aom_obmc_sad_fn_t osdf;
+ aom_obmc_variance_fn_t ovf;
+ aom_obmc_subpixvariance_fn_t osvf;
+ aom_dist_wtd_sad_avg_fn_t jsdaf;
+ aom_dist_wtd_subp_avg_variance_fn_t jsvaf;
+} aom_variance_fn_ptr_t;
+
+void aom_highbd_var_filter_block2d_bil_first_pass(
+ const uint8_t *src_ptr8, uint16_t *output_ptr,
+ unsigned int src_pixels_per_line, int pixel_step,
+ unsigned int output_height, unsigned int output_width,
+ const uint8_t *filter);
+
+void aom_highbd_var_filter_block2d_bil_second_pass(
+ const uint16_t *src_ptr, uint16_t *output_ptr,
+ unsigned int src_pixels_per_line, unsigned int pixel_step,
+ unsigned int output_height, unsigned int output_width,
+ const uint8_t *filter);
+
+uint32_t aom_sse_odd_size(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int w, int h);
+
+uint64_t aom_highbd_sse_odd_size(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, int w, int h);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_VARIANCE_H_
diff --git a/third_party/aom/aom_dsp/vmaf.c b/third_party/aom/aom_dsp/vmaf.c
new file mode 100644
index 0000000000..a40e00cb23
--- /dev/null
+++ b/third_party/aom/aom_dsp/vmaf.c
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom_dsp/vmaf.h"
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef _WIN32
+#include <process.h>
+#else
+#include <unistd.h>
+#endif
+
+#include "aom_dsp/blend.h"
+
+static void vmaf_fatal_error(const char *message) {
+ fprintf(stderr, "Fatal error: %s\n", message);
+ exit(EXIT_FAILURE);
+}
+
+void aom_init_vmaf_model(VmafModel **vmaf_model, const char *model_path) {
+ if (*vmaf_model != NULL) return;
+ VmafModelConfig model_cfg;
+ model_cfg.flags = VMAF_MODEL_FLAG_DISABLE_CLIP;
+ model_cfg.name = "vmaf";
+
+ if (vmaf_model_load_from_path(vmaf_model, &model_cfg, model_path)) {
+ vmaf_fatal_error("Failed to load VMAF model.");
+ }
+}
+
+void aom_close_vmaf_model(VmafModel *vmaf_model) {
+ vmaf_model_destroy(vmaf_model);
+}
+
+static void copy_picture(const int bit_depth, const YV12_BUFFER_CONFIG *src,
+ VmafPicture *dst) {
+ const int width = src->y_width;
+ const int height = src->y_height;
+
+ if (bit_depth > 8) {
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src->y_buffer);
+ uint16_t *dst_ptr = dst->data[0];
+
+ for (int row = 0; row < height; ++row) {
+ memcpy(dst_ptr, src_ptr, width * sizeof(dst_ptr[0]));
+ src_ptr += src->y_stride;
+ dst_ptr += dst->stride[0] / 2;
+ }
+ } else {
+ uint8_t *src_ptr = src->y_buffer;
+ uint8_t *dst_ptr = (uint8_t *)dst->data[0];
+
+ for (int row = 0; row < height; ++row) {
+ memcpy(dst_ptr, src_ptr, width * sizeof(dst_ptr[0]));
+ src_ptr += src->y_stride;
+ dst_ptr += dst->stride[0];
+ }
+ }
+}
+
+void aom_init_vmaf_context(VmafContext **vmaf_context, VmafModel *vmaf_model,
+ bool cal_vmaf_neg) {
+ // TODO(sdeng): make them CLI arguments.
+ VmafConfiguration cfg;
+ cfg.log_level = VMAF_LOG_LEVEL_NONE;
+ cfg.n_threads = 0;
+ cfg.n_subsample = 0;
+ cfg.cpumask = 0;
+
+ if (vmaf_init(vmaf_context, cfg)) {
+ vmaf_fatal_error("Failed to init VMAF context.");
+ }
+
+ if (cal_vmaf_neg) {
+ VmafFeatureDictionary *vif_feature = NULL;
+ if (vmaf_feature_dictionary_set(&vif_feature, "vif_enhn_gain_limit",
+ "1.0")) {
+ vmaf_fatal_error("Failed to set vif_enhn_gain_limit.");
+ }
+ if (vmaf_model_feature_overload(vmaf_model, "float_vif", vif_feature)) {
+ vmaf_fatal_error("Failed to use feature float_vif.");
+ }
+
+ VmafFeatureDictionary *adm_feature = NULL;
+ if (vmaf_feature_dictionary_set(&adm_feature, "adm_enhn_gain_limit",
+ "1.0")) {
+ vmaf_fatal_error("Failed to set adm_enhn_gain_limit.");
+ }
+ if (vmaf_model_feature_overload(vmaf_model, "adm", adm_feature)) {
+ vmaf_fatal_error("Failed to use feature float_adm.");
+ }
+ }
+
+ VmafFeatureDictionary *motion_force_zero = NULL;
+ if (vmaf_feature_dictionary_set(&motion_force_zero, "motion_force_zero",
+ "1")) {
+ vmaf_fatal_error("Failed to set motion_force_zero.");
+ }
+ if (vmaf_model_feature_overload(vmaf_model, "float_motion",
+ motion_force_zero)) {
+ vmaf_fatal_error("Failed to use feature float_motion.");
+ }
+
+ if (vmaf_use_features_from_model(*vmaf_context, vmaf_model)) {
+ vmaf_fatal_error("Failed to load feature extractors from VMAF model.");
+ }
+}
+
+void aom_close_vmaf_context(VmafContext *vmaf_context) {
+ if (vmaf_close(vmaf_context)) {
+ vmaf_fatal_error("Failed to close VMAF context.");
+ }
+}
+
+void aom_calc_vmaf(VmafModel *vmaf_model, const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *distorted, int bit_depth,
+ bool cal_vmaf_neg, double *vmaf) {
+ VmafContext *vmaf_context;
+ aom_init_vmaf_context(&vmaf_context, vmaf_model, cal_vmaf_neg);
+ const int frame_index = 0;
+ VmafPicture ref, dist;
+ if (vmaf_picture_alloc(&ref, VMAF_PIX_FMT_YUV420P, bit_depth, source->y_width,
+ source->y_height) ||
+ vmaf_picture_alloc(&dist, VMAF_PIX_FMT_YUV420P, bit_depth,
+ source->y_width, source->y_height)) {
+ vmaf_fatal_error("Failed to alloc VMAF pictures.");
+ }
+ copy_picture(bit_depth, source, &ref);
+ copy_picture(bit_depth, distorted, &dist);
+ if (vmaf_read_pictures(vmaf_context, &ref, &dist,
+ /*picture index=*/frame_index)) {
+ vmaf_fatal_error("Failed to read VMAF pictures.");
+ }
+
+ if (vmaf_read_pictures(vmaf_context, NULL, NULL, 0)) {
+ vmaf_fatal_error("Failed to flush context.");
+ }
+
+ vmaf_picture_unref(&ref);
+ vmaf_picture_unref(&dist);
+
+ vmaf_score_at_index(vmaf_context, vmaf_model, vmaf, frame_index);
+ aom_close_vmaf_context(vmaf_context);
+}
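aom_calc_vmaf bundles the whole single-frame libvmaf flow: context init, picture allocation and copy, read, flush, score. A minimal caller sketch using only the functions declared in aom_dsp/vmaf.h; the model path, the YV12 buffers, and the example_ wrapper name are assumptions, prepared and chosen by the caller:

// Sketch only: score one source/distorted frame pair with a model file.
static double example_single_frame_vmaf(const YV12_BUFFER_CONFIG *source,
                                        const YV12_BUFFER_CONFIG *distorted,
                                        const char *model_path, int bit_depth) {
  VmafModel *model = NULL;
  double score = 0.0;
  aom_init_vmaf_model(&model, model_path);
  aom_calc_vmaf(model, source, distorted, bit_depth, /*cal_vmaf_neg=*/false,
                &score);
  aom_close_vmaf_model(model);
  return score;
}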
+
+void aom_read_vmaf_image(VmafContext *vmaf_context,
+ const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *distorted, int bit_depth,
+ int frame_index) {
+ VmafPicture ref, dist;
+ if (vmaf_picture_alloc(&ref, VMAF_PIX_FMT_YUV420P, bit_depth, source->y_width,
+ source->y_height) ||
+ vmaf_picture_alloc(&dist, VMAF_PIX_FMT_YUV420P, bit_depth,
+ source->y_width, source->y_height)) {
+ vmaf_fatal_error("Failed to alloc VMAF pictures.");
+ }
+ copy_picture(bit_depth, source, &ref);
+ copy_picture(bit_depth, distorted, &dist);
+ if (vmaf_read_pictures(vmaf_context, &ref, &dist,
+ /*picture index=*/frame_index)) {
+ vmaf_fatal_error("Failed to read VMAF pictures.");
+ }
+
+ vmaf_picture_unref(&ref);
+ vmaf_picture_unref(&dist);
+}
+
+double aom_calc_vmaf_at_index(VmafContext *vmaf_context, VmafModel *vmaf_model,
+ int frame_index) {
+ double vmaf;
+ if (vmaf_score_at_index(vmaf_context, vmaf_model, &vmaf, frame_index)) {
+ vmaf_fatal_error("Failed to calc VMAF scores.");
+ }
+ return vmaf;
+}
+
+void aom_flush_vmaf_context(VmafContext *vmaf_context) {
+ if (vmaf_read_pictures(vmaf_context, NULL, NULL, 0)) {
+ vmaf_fatal_error("Failed to flush context.");
+ }
+}
diff --git a/third_party/aom/aom_dsp/vmaf.h b/third_party/aom/aom_dsp/vmaf.h
new file mode 100644
index 0000000000..b539cf8b76
--- /dev/null
+++ b/third_party/aom/aom_dsp/vmaf.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_VMAF_H_
+#define AOM_AOM_DSP_VMAF_H_
+
+#include <libvmaf/libvmaf.h>
+#include <stdbool.h>
+
+#include "aom_scale/yv12config.h"
+
+void aom_init_vmaf_context(VmafContext **vmaf_context, VmafModel *vmaf_model,
+ bool cal_vmaf_neg);
+void aom_close_vmaf_context(VmafContext *vmaf_context);
+
+void aom_init_vmaf_model(VmafModel **vmaf_model, const char *model_path);
+void aom_close_vmaf_model(VmafModel *vmaf_model);
+
+void aom_calc_vmaf(VmafModel *vmaf_model, const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *distorted, int bit_depth,
+ bool cal_vmaf_neg, double *vmaf);
+
+void aom_read_vmaf_image(VmafContext *vmaf_context,
+ const YV12_BUFFER_CONFIG *source,
+ const YV12_BUFFER_CONFIG *distorted, int bit_depth,
+ int frame_index);
+
+double aom_calc_vmaf_at_index(VmafContext *vmaf_context, VmafModel *vmaf_model,
+ int frame_index);
+
+void aom_flush_vmaf_context(VmafContext *vmaf_context);
+
+#endif // AOM_AOM_DSP_VMAF_H_
diff --git a/third_party/aom/aom_dsp/x86/adaptive_quantize_avx2.c b/third_party/aom/aom_dsp/x86/adaptive_quantize_avx2.c
new file mode 100644
index 0000000000..b3dede75d5
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/adaptive_quantize_avx2.c
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/quantize.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+static INLINE void load_b_values_avx2(const int16_t *zbin_ptr, __m256i *zbin,
+ const int16_t *round_ptr, __m256i *round,
+ const int16_t *quant_ptr, __m256i *quant,
+ const int16_t *dequant_ptr,
+ __m256i *dequant,
+ const int16_t *shift_ptr,
+ __m256i *shift) {
+ *zbin = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)zbin_ptr));
+ *zbin = _mm256_permute4x64_epi64(*zbin, 0x54);
+ *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1));
+ *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr));
+ *round = _mm256_permute4x64_epi64(*round, 0x54);
+ *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr));
+ *quant = _mm256_permute4x64_epi64(*quant, 0x54);
+ *dequant =
+ _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr));
+ *dequant = _mm256_permute4x64_epi64(*dequant, 0x54);
+ *shift = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)shift_ptr));
+ *shift = _mm256_permute4x64_epi64(*shift, 0x54);
+}
+
+static INLINE __m256i load_coefficients_avx2(const tran_low_t *coeff_ptr) {
+ const __m256i coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr));
+ const __m256i coeff2 = _mm256_load_si256((__m256i *)(coeff_ptr + 8));
+ return _mm256_packs_epi32(coeff1, coeff2);
+}
+
+static INLINE void update_mask1_avx2(__m256i *cmp_mask,
+ const int16_t *iscan_ptr, int *is_found,
+ __m256i *mask) {
+ __m256i temp_mask = _mm256_setzero_si256();
+ if (_mm256_movemask_epi8(*cmp_mask)) {
+ __m256i iscan = _mm256_loadu_si256((const __m256i *)(iscan_ptr));
+ temp_mask = _mm256_and_si256(*cmp_mask, iscan);
+ *is_found = 1;
+ }
+ *mask = _mm256_max_epi16(temp_mask, *mask);
+}
+
+static INLINE void update_mask0_avx2(__m256i *qcoeff, __m256i *threshold,
+ const int16_t *iscan_ptr, int *is_found,
+ __m256i *mask) {
+ __m256i zero = _mm256_setzero_si256();
+ __m256i coeff[2], cmp_mask0, cmp_mask1;
+ coeff[0] = _mm256_unpacklo_epi16(*qcoeff, zero);
+ coeff[1] = _mm256_unpackhi_epi16(*qcoeff, zero);
+ coeff[0] = _mm256_slli_epi32(coeff[0], AOM_QM_BITS);
+ cmp_mask0 = _mm256_cmpgt_epi32(coeff[0], threshold[0]);
+ coeff[1] = _mm256_slli_epi32(coeff[1], AOM_QM_BITS);
+ cmp_mask1 = _mm256_cmpgt_epi32(coeff[1], threshold[1]);
+ cmp_mask0 =
+ _mm256_permute4x64_epi64(_mm256_packs_epi32(cmp_mask0, cmp_mask1), 0xd8);
+ update_mask1_avx2(&cmp_mask0, iscan_ptr, is_found, mask);
+}
+
+static INLINE void calculate_qcoeff_avx2(__m256i *coeff, const __m256i *round,
+ const __m256i *quant,
+ const __m256i *shift) {
+ __m256i tmp, qcoeff;
+ qcoeff = _mm256_adds_epi16(*coeff, *round);
+ tmp = _mm256_mulhi_epi16(qcoeff, *quant);
+ qcoeff = _mm256_add_epi16(tmp, qcoeff);
+ *coeff = _mm256_mulhi_epi16(qcoeff, *shift);
+}
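calculate_qcoeff_avx2 vectorizes the quantizer's two-multiply step: saturating add of the rounding term, keep the high 16 bits of the product with quant, add that back, then keep the high 16 bits of the product with the shift value. A scalar sketch of one 16-bit lane (editorial; _mm256_mulhi_epi16 keeps bits 31..16 of the signed product, and the example_ helper is hypothetical):

// Sketch only: scalar equivalent of calculate_qcoeff_avx2 for one lane.
// Inputs are non-negative here (absolute coefficient plus rounding term).
static INLINE int16_t example_quantize_lane(int16_t abs_coeff, int16_t round,
                                            int16_t quant, int16_t shift) {
  int32_t t0 = abs_coeff + round;
  if (t0 > INT16_MAX) t0 = INT16_MAX;  // _mm256_adds_epi16 saturates
  const int16_t t1 = (int16_t)(((int32_t)(int16_t)t0 * quant) >> 16);  // mulhi
  const int16_t t2 = (int16_t)(t1 + (int16_t)t0);  // _mm256_add_epi16
  return (int16_t)(((int32_t)t2 * shift) >> 16);   // mulhi
}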
+
+static INLINE __m256i calculate_dqcoeff_avx2(__m256i qcoeff, __m256i dequant) {
+ return _mm256_mullo_epi16(qcoeff, dequant);
+}
+
+static INLINE void store_coefficients_avx2(__m256i coeff_vals,
+ tran_low_t *coeff_ptr) {
+ __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15);
+ __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign);
+ __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign);
+ _mm256_store_si256((__m256i *)(coeff_ptr), coeff_vals_lo);
+ _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi);
+}
+
+void aom_quantize_b_adaptive_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int index = 16;
+ int non_zero_count = 0;
+ int non_zero_count_prescan_add_zero = 0;
+ int is_found0 = 0, is_found1 = 0;
+ int eob = -1;
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i zbin, round, quant, dequant, shift;
+ __m256i coeff, qcoeff;
+ __m256i cmp_mask, mask0 = zero, mask1 = zero;
+ __m128i temp_mask0, temp_mask1;
+ int prescan_add[2];
+ int thresh[2];
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ for (int i = 0; i < 2; ++i) {
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+ thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1;
+ }
+ __m256i threshold[2];
+ threshold[0] = _mm256_set1_epi32(thresh[0]);
+ threshold[1] = _mm256_set1_epi32(thresh[1]);
+ threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe);
+
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;
+#endif
+
+ // Setup global values.
+ load_b_values_avx2(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+ dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+ // Do DC and first 15 AC.
+ coeff = load_coefficients_avx2(coeff_ptr);
+ qcoeff = _mm256_abs_epi16(coeff);
+ update_mask0_avx2(&qcoeff, threshold, iscan, &is_found0, &mask0);
+ __m256i temp0 = _mm256_cmpgt_epi16(qcoeff, zbin);
+ zbin = _mm256_unpackhi_epi64(zbin, zbin);
+ cmp_mask = _mm256_permute4x64_epi64(temp0, 0xd8);
+ update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1);
+ threshold[0] = threshold[1];
+ if (_mm256_movemask_epi8(cmp_mask) == 0) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr), zero);
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero);
+ round = _mm256_unpackhi_epi64(round, round);
+ quant = _mm256_unpackhi_epi64(quant, quant);
+ shift = _mm256_unpackhi_epi64(shift, shift);
+ dequant = _mm256_unpackhi_epi64(dequant, dequant);
+ } else {
+ calculate_qcoeff_avx2(&qcoeff, &round, &quant, &shift);
+ round = _mm256_unpackhi_epi64(round, round);
+ quant = _mm256_unpackhi_epi64(quant, quant);
+ shift = _mm256_unpackhi_epi64(shift, shift);
+ // Reinsert signs
+ qcoeff = _mm256_sign_epi16(qcoeff, coeff);
+ // Mask out zbin threshold coeffs
+ qcoeff = _mm256_and_si256(qcoeff, temp0);
+ store_coefficients_avx2(qcoeff, qcoeff_ptr);
+ coeff = calculate_dqcoeff_avx2(qcoeff, dequant);
+ dequant = _mm256_unpackhi_epi64(dequant, dequant);
+ store_coefficients_avx2(coeff, dqcoeff_ptr);
+ }
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff = load_coefficients_avx2(coeff_ptr + index);
+ qcoeff = _mm256_abs_epi16(coeff);
+ update_mask0_avx2(&qcoeff, threshold, iscan + index, &is_found0, &mask0);
+ temp0 = _mm256_cmpgt_epi16(qcoeff, zbin);
+ cmp_mask = _mm256_permute4x64_epi64(temp0, 0xd8);
+ update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1);
+ if (_mm256_movemask_epi8(cmp_mask) == 0) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero);
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero);
+ index += 16;
+ continue;
+ }
+ calculate_qcoeff_avx2(&qcoeff, &round, &quant, &shift);
+ qcoeff = _mm256_sign_epi16(qcoeff, coeff);
+ qcoeff = _mm256_and_si256(qcoeff, temp0);
+ store_coefficients_avx2(qcoeff, qcoeff_ptr + index);
+ coeff = calculate_dqcoeff_avx2(qcoeff, dequant);
+ store_coefficients_avx2(coeff, dqcoeff_ptr + index);
+ index += 16;
+ }
+ if (is_found0) {
+ temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0),
+ _mm256_extracti128_si256(mask0, 1));
+ non_zero_count = calculate_non_zero_count(temp_mask0);
+ }
+ if (is_found1) {
+ temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1),
+ _mm256_extracti128_si256(mask1, 1));
+ non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1);
+ }
+
+ for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+ const int rc = scan[i];
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+
+ for (int i = non_zero_count - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ if (qcoeff_ptr[rc]) {
+ eob = i;
+ break;
+ }
+ }
+
+ *eob_ptr = eob + 1;
+#if SKIP_EOB_FACTOR_ADJUST
+  // TODO(Aniket): Experiment with implementing the following loop using
+  // intrinsics, combining it with the quantization loop above.
+ for (int i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int qcoeff0 = qcoeff_ptr[rc];
+ if (qcoeff0) {
+ first = i;
+ break;
+ }
+ }
+ if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
+ const int rc = scan[(*eob_ptr - 1)];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const int coeff0 = coeff_ptr[rc] * wt;
+ const int coeff_sign = AOMSIGN(coeff0);
+ const int abs_coeff = (coeff0 ^ coeff_sign) - coeff_sign;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (abs_coeff <
+ (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ *eob_ptr = 0;
+ }
+ }
+ }
+#endif
+}
diff --git a/third_party/aom/aom_dsp/x86/adaptive_quantize_sse2.c b/third_party/aom/aom_dsp/x86/adaptive_quantize_sse2.c
new file mode 100644
index 0000000000..503b9b4682
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/adaptive_quantize_sse2.c
@@ -0,0 +1,633 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/quantize.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+void aom_quantize_b_adaptive_sse2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int index = 16;
+ int non_zero_count = 0;
+ int non_zero_count_prescan_add_zero = 0;
+ int is_found0 = 0, is_found1 = 0;
+ int eob = -1;
+ const __m128i zero = _mm_setzero_si128();
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i all_zero;
+ __m128i mask0 = zero, mask1 = zero;
+
+ int prescan_add[2];
+ int thresh[4];
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ for (int i = 0; i < 2; ++i) {
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+ thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1;
+ }
+ thresh[2] = thresh[3] = thresh[1];
+ __m128i threshold[2];
+ threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
+ threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
+
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;
+#endif
+ // Setup global values.
+ load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+ dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_coefficients(coeff_ptr);
+ coeff1 = load_coefficients(coeff_ptr + 8);
+
+ // Poor man's abs().
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1);
+
+ threshold[0] = threshold[1];
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ // Reinsert signs
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr);
+ store_coefficients(qcoeff1, qcoeff_ptr + 8);
+
+ coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+ store_coefficients(coeff0, dqcoeff_ptr);
+ store_coefficients(coeff1, dqcoeff_ptr + 8);
+ }
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = load_coefficients(coeff_ptr + index);
+ coeff1 = load_coefficients(coeff_ptr + index + 8);
+
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0,
+ &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
+ index += 16;
+ continue;
+ }
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr + index);
+ store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
+
+ coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+ coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+ store_coefficients(coeff0, dqcoeff_ptr + index);
+ store_coefficients(coeff1, dqcoeff_ptr + index + 8);
+
+ index += 16;
+ }
+ if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
+ if (is_found1)
+ non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
+
+ for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+ const int rc = scan[i];
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+
+ for (int i = non_zero_count - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ if (qcoeff_ptr[rc]) {
+ eob = i;
+ break;
+ }
+ }
+
+ *eob_ptr = eob + 1;
+#if SKIP_EOB_FACTOR_ADJUST
+  // TODO(Aniket): Experiment with implementing the following loop with
+  // intrinsics, combining it with the quantization loop above.
+ for (int i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int qcoeff = qcoeff_ptr[rc];
+ if (qcoeff) {
+ first = i;
+ break;
+ }
+ }
+ if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
+ const int rc = scan[(*eob_ptr - 1)];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const int coeff = coeff_ptr[rc] * wt;
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (abs_coeff <
+ (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ *eob_ptr = 0;
+ }
+ }
+ }
+#endif
+}
+
+void aom_quantize_b_32x32_adaptive_sse2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int index = 16;
+ const int log_scale = 1;
+ int non_zero_count = 0;
+ int non_zero_count_prescan_add_zero = 0;
+ int is_found0 = 0, is_found1 = 0;
+ int eob = -1;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i log_scale_vec = _mm_set1_epi16(log_scale);
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i all_zero;
+ __m128i mask0 = zero, mask1 = zero;
+
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+ int prescan_add[2];
+ int thresh[4];
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ for (int i = 0; i < 2; ++i) {
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+ thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
+ }
+ thresh[2] = thresh[3] = thresh[1];
+ __m128i threshold[2];
+ threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
+ threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
+
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;
+#endif
+ // Setup global values.
+ zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+
+ // Shift with rounding.
+ zbin = _mm_add_epi16(zbin, log_scale_vec);
+ round = _mm_add_epi16(round, log_scale_vec);
+ zbin = _mm_srli_epi16(zbin, log_scale);
+ round = _mm_srli_epi16(round, log_scale);
+ zbin = _mm_sub_epi16(zbin, one);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_coefficients(coeff_ptr);
+ coeff1 = load_coefficients(coeff_ptr + 8);
+
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1);
+
+ threshold[0] = threshold[1];
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
+
+ // Reinsert signs
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr);
+ store_coefficients(qcoeff1, qcoeff_ptr + 8);
+
+ calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr,
+ &log_scale);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
+ dqcoeff_ptr + 8, &log_scale);
+ }
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = load_coefficients(coeff_ptr + index);
+ coeff1 = load_coefficients(coeff_ptr + index + 8);
+
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0,
+ &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
+ index += 16;
+ continue;
+ }
+ calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
+ calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
+
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr + index);
+ store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
+
+ calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero,
+ dqcoeff_ptr + index, &log_scale);
+ calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
+ dqcoeff_ptr + index + 8, &log_scale);
+ index += 16;
+ }
+ if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
+ if (is_found1)
+ non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
+
+ for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+ const int rc = scan[i];
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+
+ for (int i = non_zero_count - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ if (qcoeff_ptr[rc]) {
+ eob = i;
+ break;
+ }
+ }
+
+ *eob_ptr = eob + 1;
+#if SKIP_EOB_FACTOR_ADJUST
+  // TODO(Aniket): Experiment with implementing the following loop with
+  // intrinsics, combining it with the quantization loop above.
+ for (int i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int qcoeff = qcoeff_ptr[rc];
+ if (qcoeff) {
+ first = i;
+ break;
+ }
+ }
+ if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
+ const int rc = scan[(*eob_ptr - 1)];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const int coeff = coeff_ptr[rc] * wt;
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ *eob_ptr = 0;
+ }
+ }
+ }
+#endif
+}
+
+void aom_quantize_b_64x64_adaptive_sse2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int index = 16;
+ const int log_scale = 2;
+ int non_zero_count = 0;
+ int non_zero_count_prescan_add_zero = 0;
+ int is_found0 = 0, is_found1 = 0;
+ int eob = -1;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i log_scale_vec = _mm_set1_epi16(log_scale);
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i all_zero;
+ __m128i mask0 = zero, mask1 = zero;
+
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+ int prescan_add[2];
+ int thresh[4];
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ for (int i = 0; i < 2; ++i) {
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+ thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
+ }
+ thresh[2] = thresh[3] = thresh[1];
+ __m128i threshold[2];
+ threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
+ threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
+
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;
+#endif
+ // Setup global values.
+ zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+
+ // Shift with rounding.
+ zbin = _mm_add_epi16(zbin, log_scale_vec);
+ round = _mm_add_epi16(round, log_scale_vec);
+ zbin = _mm_srli_epi16(zbin, log_scale);
+ round = _mm_srli_epi16(round, log_scale);
+ zbin = _mm_sub_epi16(zbin, one);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_coefficients(coeff_ptr);
+ coeff1 = load_coefficients(coeff_ptr + 8);
+
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1);
+
+ threshold[0] = threshold[1];
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
+
+ // Reinsert signs
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr);
+ store_coefficients(qcoeff1, qcoeff_ptr + 8);
+
+ calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr,
+ &log_scale);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
+ dqcoeff_ptr + 8, &log_scale);
+ }
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = load_coefficients(coeff_ptr + index);
+ coeff1 = load_coefficients(coeff_ptr + index + 8);
+
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0,
+ &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
+ index += 16;
+ continue;
+ }
+ calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
+ calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
+
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr + index);
+ store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
+
+ calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero,
+ dqcoeff_ptr + index, &log_scale);
+ calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
+ dqcoeff_ptr + index + 8, &log_scale);
+ index += 16;
+ }
+ if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
+ if (is_found1)
+ non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
+
+ for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+ const int rc = scan[i];
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+
+ for (int i = non_zero_count - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ if (qcoeff_ptr[rc]) {
+ eob = i;
+ break;
+ }
+ }
+
+ *eob_ptr = eob + 1;
+#if SKIP_EOB_FACTOR_ADJUST
+  // TODO(Aniket): Experiment with implementing the following loop with
+  // intrinsics, combining it with the quantization loop above.
+ for (int i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int qcoeff = qcoeff_ptr[rc];
+ if (qcoeff) {
+ first = i;
+ break;
+ }
+ }
+ if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
+ const int rc = scan[(*eob_ptr - 1)];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const int coeff = coeff_ptr[rc] * wt;
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ *eob_ptr = 0;
+ }
+ }
+ }
+#endif
+}
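
The 32x32 and 64x64 variants above differ from the base path mainly in their log_scale-dependent setup: zbin and round are rescaled with rounding, and the prescan thresholds combine the scaled zero-bin with a dequant-derived margin. A scalar sketch of that setup, mirroring the zbins[]/thresh[] code in the functions above, is:

/* Scalar sketch of the per-call setup used by the 32x32 (log_scale == 1) and
 * 64x64 (log_scale == 2) adaptive quantizers: rescale the zero-bin and derive
 * the prescan thresholds that feed the `threshold` vectors above. */
static void adaptive_quant_setup_sketch(const int16_t *zbin_ptr,
                                        const int16_t *dequant_ptr,
                                        int log_scale, int zbins[2],
                                        int thresh[2]) {
  const int wt = 1 << AOM_QM_BITS;
  for (int i = 0; i < 2; ++i) {  // i == 0: DC, i == 1: AC
    zbins[i] = ROUND_POWER_OF_TWO(zbin_ptr[i], log_scale);
    const int prescan_add = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
    thresh[i] = zbins[i] * wt + prescan_add - 1;
  }
}
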
diff --git a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
new file mode 100644
index 0000000000..b08ec2546b
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/convolve.h"
+
+#if HAVE_SSE2
+filter8_1dfunction aom_filter_block1d16_v8_sse2;
+filter8_1dfunction aom_filter_block1d16_h8_sse2;
+filter8_1dfunction aom_filter_block1d8_v8_sse2;
+filter8_1dfunction aom_filter_block1d8_h8_sse2;
+filter8_1dfunction aom_filter_block1d4_v8_sse2;
+filter8_1dfunction aom_filter_block1d4_h8_sse2;
+filter8_1dfunction aom_filter_block1d16_v4_sse2;
+filter8_1dfunction aom_filter_block1d16_h4_sse2;
+
+filter8_1dfunction aom_filter_block1d8_h4_sse2;
+filter8_1dfunction aom_filter_block1d8_v4_sse2;
+filter8_1dfunction aom_filter_block1d4_h4_sse2;
+filter8_1dfunction aom_filter_block1d4_v4_sse2;
+
+filter8_1dfunction aom_filter_block1d16_v2_sse2;
+filter8_1dfunction aom_filter_block1d16_h2_sse2;
+filter8_1dfunction aom_filter_block1d8_v2_sse2;
+filter8_1dfunction aom_filter_block1d8_h2_sse2;
+filter8_1dfunction aom_filter_block1d4_v2_sse2;
+filter8_1dfunction aom_filter_block1d4_h2_sse2;
+
+// void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2)
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2)
+
+#if CONFIG_AV1_HIGHBITDEPTH
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2;
+
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_v4_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_h4_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_v4_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_h4_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_v4_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_h4_sse2;
+
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2;
+
+// void aom_highbd_convolve8_horiz_sse2(const uint8_t *src,
+// ptrdiff_t src_stride,
+// uint8_t *dst,
+// ptrdiff_t dst_stride,
+// const int16_t *filter_x,
+// int x_step_q4,
+// const int16_t *filter_y,
+// int y_step_q4,
+// int w, int h, int bd);
+// void aom_highbd_convolve8_vert_sse2(const uint8_t *src,
+// ptrdiff_t src_stride,
+// uint8_t *dst,
+// ptrdiff_t dst_stride,
+// const int16_t *filter_x,
+// int x_step_q4,
+// const int16_t *filter_y,
+// int y_step_q4,
+// int w, int h, int bd);
+HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2)
+HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2)
+#endif
+#endif // HAVE_SSE2
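
FUN_CONV_1D and HIGH_FUN_CONV_1D (from aom_dsp/x86/convolve.h) expand to the aom_convolve8_horiz_sse2 / aom_convolve8_vert_sse2 wrappers whose prototypes are commented above; they tile the block column-wise and dispatch to the 1-D kernels declared in this file. The sketch below is a hypothetical simplification of such a wrapper, not the actual macro expansion: it shows only the column tiling and assumes the 8-tap kernels, whereas the real macro also selects the 4-tap and 2-tap variants from the filter contents and checks the step_q4 parameters.

/* Hypothetical simplification of a FUN_CONV_1D-generated horizontal wrapper.
 * Assumes the filter8_1dfunction declarations above are in scope; the real
 * macro additionally handles 4-/2-tap selection and step_q4 fallbacks. */
static void convolve8_horiz_sketch(const uint8_t *src, ptrdiff_t src_stride,
                                   uint8_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int w, int h) {
  while (w >= 16) {  // process 16-wide columns with the widest kernel
    aom_filter_block1d16_h8_sse2(src, src_stride, dst, dst_stride, h, filter_x);
    src += 16;
    dst += 16;
    w -= 16;
  }
  if (w == 8) {
    aom_filter_block1d8_h8_sse2(src, src_stride, dst, dst_stride, h, filter_x);
  } else if (w == 4) {
    aom_filter_block1d4_h8_sse2(src, src_stride, dst, dst_stride, h, filter_x);
  }
}
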
diff --git a/third_party/aom/aom_dsp/x86/aom_convolve_copy_avx2.c b/third_party/aom/aom_dsp/x86/aom_convolve_copy_avx2.c
new file mode 100644
index 0000000000..a1043828fe
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_convolve_copy_avx2.c
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+#include <string.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
+ __m256i s[4];
+ s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
+ s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
+ s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 32));
+ s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 32));
+ _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]);
+ _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]);
+ _mm256_storeu_si256((__m256i *)(dst + 2 * 32), s[2]);
+ _mm256_storeu_si256((__m256i *)(dst + 3 * 32), s[3]);
+}
+
+void aom_convolve_copy_avx2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
+ if (w >= 16) {
+ assert(!((intptr_t)dst % 16));
+ assert(!(dst_stride % 16));
+ }
+
+ if (w == 2) {
+ do {
+ memmove(dst, src, 2 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ memmove(dst, src, 2 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 4) {
+ do {
+ memmove(dst, src, 4 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ memmove(dst, src, 4 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 8) {
+ do {
+ __m128i s[2];
+ s[0] = _mm_loadl_epi64((__m128i *)src);
+ src += src_stride;
+ s[1] = _mm_loadl_epi64((__m128i *)src);
+ src += src_stride;
+ _mm_storel_epi64((__m128i *)dst, s[0]);
+ dst += dst_stride;
+ _mm_storel_epi64((__m128i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 16) {
+ do {
+ __m128i s[2];
+ s[0] = _mm_loadu_si128((__m128i *)src);
+ src += src_stride;
+ s[1] = _mm_loadu_si128((__m128i *)src);
+ src += src_stride;
+ _mm_store_si128((__m128i *)dst, s[0]);
+ dst += dst_stride;
+ _mm_store_si128((__m128i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 32) {
+ do {
+ __m256i s[2];
+ s[0] = _mm256_loadu_si256((__m256i *)src);
+ src += src_stride;
+ s[1] = _mm256_loadu_si256((__m256i *)src);
+ src += src_stride;
+ _mm256_storeu_si256((__m256i *)dst, s[0]);
+ dst += dst_stride;
+ _mm256_storeu_si256((__m256i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 64) {
+ do {
+ __m256i s[4];
+ s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
+ s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
+ src += src_stride;
+ s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
+ s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
+ src += src_stride;
+ _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]);
+ _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]);
+ dst += dst_stride;
+ _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[2]);
+ _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[3]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else {
+ do {
+ copy_128(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ copy_128(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+
+static INLINE void highbd_copy_64(const uint16_t *src, uint16_t *dst) {
+ __m256i s[4];
+ s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
+ s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
+ s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
+ s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
+ _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
+ _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
+ _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
+ _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
+}
+
+static INLINE void highbd_copy_128(const uint16_t *src, uint16_t *dst) {
+ __m256i s[8];
+ s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
+ s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
+ s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
+ s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
+ s[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
+ s[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 16));
+ s[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 16));
+ s[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 16));
+
+ _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
+ _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
+ _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
+ _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
+ _mm256_storeu_si256((__m256i *)(dst + 4 * 16), s[4]);
+ _mm256_storeu_si256((__m256i *)(dst + 5 * 16), s[5]);
+ _mm256_storeu_si256((__m256i *)(dst + 6 * 16), s[6]);
+ _mm256_storeu_si256((__m256i *)(dst + 7 * 16), s[7]);
+}
+
+void aom_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride, int w,
+ int h) {
+ if (w >= 16) {
+ assert(!((intptr_t)dst % 16));
+ assert(!(dst_stride % 16));
+ }
+
+ if (w == 2) {
+ do {
+ memmove(dst, src, 2 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ memmove(dst, src, 2 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 4) {
+ do {
+ __m128i s[2];
+ s[0] = _mm_loadl_epi64((__m128i *)src);
+ src += src_stride;
+ s[1] = _mm_loadl_epi64((__m128i *)src);
+ src += src_stride;
+ _mm_storel_epi64((__m128i *)dst, s[0]);
+ dst += dst_stride;
+ _mm_storel_epi64((__m128i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 8) {
+ do {
+ __m128i s[2];
+ s[0] = _mm_loadu_si128((__m128i *)src);
+ src += src_stride;
+ s[1] = _mm_loadu_si128((__m128i *)src);
+ src += src_stride;
+ _mm_store_si128((__m128i *)dst, s[0]);
+ dst += dst_stride;
+ _mm_store_si128((__m128i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 16) {
+ do {
+ __m256i s[2];
+ s[0] = _mm256_loadu_si256((__m256i *)src);
+ src += src_stride;
+ s[1] = _mm256_loadu_si256((__m256i *)src);
+ src += src_stride;
+ _mm256_storeu_si256((__m256i *)dst, s[0]);
+ dst += dst_stride;
+ _mm256_storeu_si256((__m256i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 32) {
+ do {
+ __m256i s[4];
+ s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
+ s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
+ src += src_stride;
+ s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
+ s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
+ src += src_stride;
+ _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
+ _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
+ dst += dst_stride;
+ _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[2]);
+ _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[3]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 64) {
+ do {
+ highbd_copy_64(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ highbd_copy_64(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else {
+ assert(w == 128);
+ do {
+ highbd_copy_128(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ highbd_copy_128(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ }
+}
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
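
aom_convolve_copy is a pure block copy; the AVX2 code above specializes it per width (2 through 128) and, as the asserts require, relies on a 16-byte-aligned dst and dst_stride for widths of 16 and up so it can use aligned stores. A plain scalar reference that the SIMD paths must match is just a row-by-row memcpy, sketched below (assuming <string.h>):

/* Scalar reference sketch for the copy kernels above: one memcpy per row.
 * The SIMD versions produce identical output, just faster. */
static void convolve_copy_ref(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride, int w,
                              int h) {
  for (int y = 0; y < h; ++y) {
    memcpy(dst, src, (size_t)w * sizeof(*src));
    src += src_stride;
    dst += dst_stride;
  }
}
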
diff --git a/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.c b/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.c
new file mode 100644
index 0000000000..e78845e97c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.c
@@ -0,0 +1,308 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+#include <string.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
+ __m128i s[8];
+ s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+ s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+ s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
+ s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
+ s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 16));
+ s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 16));
+ s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 16));
+ s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 16));
+ _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
+ _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
+ _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]);
+ _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]);
+ _mm_store_si128((__m128i *)(dst + 4 * 16), s[4]);
+ _mm_store_si128((__m128i *)(dst + 5 * 16), s[5]);
+ _mm_store_si128((__m128i *)(dst + 6 * 16), s[6]);
+ _mm_store_si128((__m128i *)(dst + 7 * 16), s[7]);
+}
+
+void aom_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
+ if (w >= 16) {
+ assert(!((intptr_t)dst % 16));
+ assert(!(dst_stride % 16));
+ }
+
+ if (w == 2) {
+ do {
+ memmove(dst, src, 2 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ memmove(dst, src, 2 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 4) {
+ do {
+ memmove(dst, src, 4 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ memmove(dst, src, 4 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 8) {
+ do {
+ __m128i s[2];
+ s[0] = _mm_loadl_epi64((__m128i *)src);
+ src += src_stride;
+ s[1] = _mm_loadl_epi64((__m128i *)src);
+ src += src_stride;
+ _mm_storel_epi64((__m128i *)dst, s[0]);
+ dst += dst_stride;
+ _mm_storel_epi64((__m128i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 16) {
+ do {
+ __m128i s[2];
+ s[0] = _mm_loadu_si128((__m128i *)src);
+ src += src_stride;
+ s[1] = _mm_loadu_si128((__m128i *)src);
+ src += src_stride;
+ _mm_store_si128((__m128i *)dst, s[0]);
+ dst += dst_stride;
+ _mm_store_si128((__m128i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 32) {
+ do {
+ __m128i s[4];
+ s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+ s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+ src += src_stride;
+ s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+ s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+ src += src_stride;
+ _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
+ _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
+ dst += dst_stride;
+ _mm_store_si128((__m128i *)(dst + 0 * 16), s[2]);
+ _mm_store_si128((__m128i *)(dst + 1 * 16), s[3]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 64) {
+ do {
+ __m128i s[8];
+ s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+ s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+ s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
+ s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
+ src += src_stride;
+ s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+ s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+ s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
+ s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
+ src += src_stride;
+ _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
+ _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
+ _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]);
+ _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]);
+ dst += dst_stride;
+ _mm_store_si128((__m128i *)(dst + 0 * 16), s[4]);
+ _mm_store_si128((__m128i *)(dst + 1 * 16), s[5]);
+ _mm_store_si128((__m128i *)(dst + 2 * 16), s[6]);
+ _mm_store_si128((__m128i *)(dst + 3 * 16), s[7]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else {
+ do {
+ copy_128(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ copy_128(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ }
+}
+
+static INLINE void highbd_copy_64(const uint16_t *src, uint16_t *dst) {
+ __m128i s[8];
+ s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+ s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+ s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
+ s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
+ s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 8));
+ s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 8));
+ s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 8));
+ s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 8));
+ _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
+ _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
+ _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
+ _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
+ _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]);
+ _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]);
+ _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]);
+ _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]);
+}
+
+static INLINE void highbd_copy_128(const uint16_t *src, uint16_t *dst) {
+ __m128i s[16];
+ s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+ s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+ s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
+ s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
+ s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 8));
+ s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 8));
+ s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 8));
+ s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 8));
+ s[8] = _mm_loadu_si128((__m128i *)(src + 8 * 8));
+ s[9] = _mm_loadu_si128((__m128i *)(src + 9 * 8));
+ s[10] = _mm_loadu_si128((__m128i *)(src + 10 * 8));
+ s[11] = _mm_loadu_si128((__m128i *)(src + 11 * 8));
+ s[12] = _mm_loadu_si128((__m128i *)(src + 12 * 8));
+ s[13] = _mm_loadu_si128((__m128i *)(src + 13 * 8));
+ s[14] = _mm_loadu_si128((__m128i *)(src + 14 * 8));
+ s[15] = _mm_loadu_si128((__m128i *)(src + 15 * 8));
+ _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
+ _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
+ _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
+ _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
+ _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]);
+ _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]);
+ _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]);
+ _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]);
+ _mm_store_si128((__m128i *)(dst + 8 * 8), s[8]);
+ _mm_store_si128((__m128i *)(dst + 9 * 8), s[9]);
+ _mm_store_si128((__m128i *)(dst + 10 * 8), s[10]);
+ _mm_store_si128((__m128i *)(dst + 11 * 8), s[11]);
+ _mm_store_si128((__m128i *)(dst + 12 * 8), s[12]);
+ _mm_store_si128((__m128i *)(dst + 13 * 8), s[13]);
+ _mm_store_si128((__m128i *)(dst + 14 * 8), s[14]);
+ _mm_store_si128((__m128i *)(dst + 15 * 8), s[15]);
+}
+
+void aom_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride, int w,
+ int h) {
+ if (w >= 16) {
+ assert(!((intptr_t)dst % 16));
+ assert(!(dst_stride % 16));
+ }
+
+ if (w == 2) {
+ do {
+ __m128i s = _mm_loadl_epi64((__m128i *)src);
+ *(int *)dst = _mm_cvtsi128_si32(s);
+ src += src_stride;
+ dst += dst_stride;
+ s = _mm_loadl_epi64((__m128i *)src);
+ *(int *)dst = _mm_cvtsi128_si32(s);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 4) {
+ do {
+ __m128i s[2];
+ s[0] = _mm_loadl_epi64((__m128i *)src);
+ src += src_stride;
+ s[1] = _mm_loadl_epi64((__m128i *)src);
+ src += src_stride;
+ _mm_storel_epi64((__m128i *)dst, s[0]);
+ dst += dst_stride;
+ _mm_storel_epi64((__m128i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 8) {
+ do {
+ __m128i s[2];
+ s[0] = _mm_loadu_si128((__m128i *)src);
+ src += src_stride;
+ s[1] = _mm_loadu_si128((__m128i *)src);
+ src += src_stride;
+ _mm_store_si128((__m128i *)dst, s[0]);
+ dst += dst_stride;
+ _mm_store_si128((__m128i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 16) {
+ do {
+ __m128i s[4];
+ s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+ s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+ src += src_stride;
+ s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+ s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+ src += src_stride;
+ _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
+ _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
+ dst += dst_stride;
+ _mm_store_si128((__m128i *)(dst + 0 * 8), s[2]);
+ _mm_store_si128((__m128i *)(dst + 1 * 8), s[3]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 32) {
+ do {
+ __m128i s[8];
+ s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+ s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+ s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
+ s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
+ src += src_stride;
+ s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+ s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+ s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
+ s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
+ src += src_stride;
+ _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
+ _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
+ _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
+ _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
+ dst += dst_stride;
+ _mm_store_si128((__m128i *)(dst + 0 * 8), s[4]);
+ _mm_store_si128((__m128i *)(dst + 1 * 8), s[5]);
+ _mm_store_si128((__m128i *)(dst + 2 * 8), s[6]);
+ _mm_store_si128((__m128i *)(dst + 3 * 8), s[7]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 64) {
+ do {
+ highbd_copy_64(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ highbd_copy_64(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else {
+ do {
+ highbd_copy_128(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ highbd_copy_128(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ }
+}
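
A minimal usage sketch for the SSE2 copy above, checking it against a row-by-row comparison. Buffer sizes, strides and the alignment attribute are illustrative test choices, not part of the library; dst and dst_stride must be 16-byte aligned for w >= 16, per the asserts in the kernel.

/* Illustrative check of aom_convolve_copy_sse2 against the source rows.
 * Assumes <assert.h> and <string.h>; alignment via a GCC/Clang attribute. */
static void copy_check_sketch(void) {
  enum { kW = 64, kH = 32, kStride = 128 };
  static uint8_t src[kH * kStride];
  static uint8_t dst[kH * kStride] __attribute__((aligned(16)));
  for (int i = 0; i < kH * kStride; ++i) src[i] = (uint8_t)(i * 7);
  aom_convolve_copy_sse2(src, kStride, dst, kStride, kW, kH);
  for (int y = 0; y < kH; ++y)
    assert(!memcmp(dst + y * kStride, src + y * kStride, kW));
}
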
diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
new file mode 100644
index 0000000000..d392225906
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
@@ -0,0 +1,613 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+%include "aom_ports/x86_abi_support.asm"
+
+;Note: tap3 and tap4 have to be applied and added after other taps to avoid
+;overflow.
+
+%macro HIGH_GET_FILTERS_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rcx, 0x00000040
+
+ movdqa xmm7, [rdx] ;load filters
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ psrldq xmm7, 8
+ pshuflw xmm4, xmm7, 0b ;k4
+ pshuflw xmm5, xmm7, 01010101b ;k5
+ pshuflw xmm6, xmm7, 10101010b ;k6
+ pshuflw xmm7, xmm7, 11111111b ;k7
+
+ punpcklwd xmm0, xmm6
+ punpcklwd xmm2, xmm5
+ punpcklwd xmm3, xmm4
+ punpcklwd xmm1, xmm7
+
+ movdqa k0k6, xmm0
+ movdqa k2k5, xmm2
+ movdqa k3k4, xmm3
+ movdqa k1k7, xmm1
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0
+ movdqa krd, xmm6
+
+ ;Compute max and min values of a pixel
+ mov rdx, 0x00010001
+ movsxd rcx, DWORD PTR arg(6) ;bps
+ movq xmm0, rdx
+ movq xmm1, rcx
+ pshufd xmm0, xmm0, 0b
+ movdqa xmm2, xmm0
+ psllw xmm0, xmm1
+ psubw xmm0, xmm2
+ pxor xmm1, xmm1
+ movdqa max, xmm0 ;max value (for clamping)
+ movdqa min, xmm1 ;min value (for clamping)
+
+%endm
+
+%macro HIGH_APPLY_FILTER_4 1
+ punpcklwd xmm0, xmm6 ;two row in one register
+ punpcklwd xmm1, xmm7
+ punpcklwd xmm2, xmm5
+ punpcklwd xmm3, xmm4
+
+ pmaddwd xmm0, k0k6 ;multiply the filter factors
+ pmaddwd xmm1, k1k7
+ pmaddwd xmm2, k2k5
+ pmaddwd xmm3, k3k4
+
+ paddd xmm0, xmm1 ;sum
+ paddd xmm0, xmm2
+ paddd xmm0, xmm3
+
+ paddd xmm0, krd ;rounding
+ psrad xmm0, 7 ;shift
+ packssdw xmm0, xmm0 ;pack to word
+
+ ;clamp the values
+ pminsw xmm0, max
+ pmaxsw xmm0, min
+
+%if %1
+ movq xmm1, [rdi]
+ pavgw xmm0, xmm1
+%endif
+ movq [rdi], xmm0
+%endm
+
+%macro HIGH_GET_FILTERS 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x00000040
+
+ movdqa xmm7, [rdx] ;load filters
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ pshufhw xmm4, xmm7, 0b ;k4
+ pshufhw xmm5, xmm7, 01010101b ;k5
+ pshufhw xmm6, xmm7, 10101010b ;k6
+ pshufhw xmm7, xmm7, 11111111b ;k7
+ punpcklqdq xmm2, xmm2
+ punpcklqdq xmm3, xmm3
+ punpcklwd xmm0, xmm1
+ punpckhwd xmm6, xmm7
+ punpckhwd xmm2, xmm5
+ punpckhwd xmm3, xmm4
+
+ movdqa k0k1, xmm0 ;store filter factors on stack
+ movdqa k6k7, xmm6
+ movdqa k2k5, xmm2
+ movdqa k3k4, xmm3
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0
+ movdqa krd, xmm6 ;rounding
+
+ ;Compute max and min values of a pixel
+ mov rdx, 0x00010001
+ movsxd rcx, DWORD PTR arg(6) ;bps
+ movq xmm0, rdx
+ movq xmm1, rcx
+ pshufd xmm0, xmm0, 0b
+ movdqa xmm2, xmm0
+ psllw xmm0, xmm1
+ psubw xmm0, xmm2
+ pxor xmm1, xmm1
+ movdqa max, xmm0 ;max value (for clamping)
+ movdqa min, xmm1 ;min value (for clamping)
+%endm
+
+%macro LOAD_VERT_8 1
+ movdqu xmm0, [rsi + %1] ;0
+ movdqu xmm1, [rsi + rax + %1] ;1
+ movdqu xmm6, [rsi + rdx * 2 + %1] ;6
+ lea rsi, [rsi + rax]
+ movdqu xmm7, [rsi + rdx * 2 + %1] ;7
+ movdqu xmm2, [rsi + rax + %1] ;2
+ movdqu xmm3, [rsi + rax * 2 + %1] ;3
+ movdqu xmm4, [rsi + rdx + %1] ;4
+ movdqu xmm5, [rsi + rax * 4 + %1] ;5
+%endm
+
+%macro HIGH_APPLY_FILTER_8 2
+ movdqu temp, xmm4
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm1
+ punpckhwd xmm4, xmm1
+ movdqa xmm1, xmm6
+ punpcklwd xmm6, xmm7
+ punpckhwd xmm1, xmm7
+ movdqa xmm7, xmm2
+ punpcklwd xmm2, xmm5
+ punpckhwd xmm7, xmm5
+
+ movdqu xmm5, temp
+ movdqu temp, xmm4
+ movdqa xmm4, xmm3
+ punpcklwd xmm3, xmm5
+ punpckhwd xmm4, xmm5
+ movdqu xmm5, temp
+
+ pmaddwd xmm0, k0k1
+ pmaddwd xmm5, k0k1
+ pmaddwd xmm6, k6k7
+ pmaddwd xmm1, k6k7
+ pmaddwd xmm2, k2k5
+ pmaddwd xmm7, k2k5
+ pmaddwd xmm3, k3k4
+ pmaddwd xmm4, k3k4
+
+ paddd xmm0, xmm6
+ paddd xmm0, xmm2
+ paddd xmm0, xmm3
+ paddd xmm5, xmm1
+ paddd xmm5, xmm7
+ paddd xmm5, xmm4
+
+ paddd xmm0, krd ;rounding
+ paddd xmm5, krd
+ psrad xmm0, 7 ;shift
+ psrad xmm5, 7
+ packssdw xmm0, xmm5 ;pack back to word
+
+ ;clamp the values
+ pminsw xmm0, max
+ pmaxsw xmm0, min
+
+%if %1
+ movdqu xmm1, [rdi + %2]
+ pavgw xmm0, xmm1
+%endif
+ movdqu [rdi + %2], xmm0
+%endm
+
+SECTION .text
+
+;void aom_highbd_filter_block1d4_v8_sse2
+;(
+;    uint16_t *src_ptr,
+;    unsigned int src_pitch,
+;    uint16_t *output_ptr,
+;    unsigned int out_pitch,
+;    unsigned int output_height,
+;    short *filter,
+;    int bd
+;)
+globalsym(aom_highbd_filter_block1d4_v8_sse2)
+sym(aom_highbd_filter_block1d4_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 7
+ %define k0k6 [rsp + 16 * 0]
+ %define k2k5 [rsp + 16 * 1]
+ %define k3k4 [rsp + 16 * 2]
+ %define k1k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define max [rsp + 16 * 5]
+ %define min [rsp + 16 * 6]
+
+ HIGH_GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movq xmm0, [rsi] ;load src: row 0
+ movq xmm1, [rsi + rax] ;1
+ movq xmm6, [rsi + rdx * 2] ;6
+ lea rsi, [rsi + rax]
+ movq xmm7, [rsi + rdx * 2] ;7
+ movq xmm2, [rsi + rax] ;2
+ movq xmm3, [rsi + rax * 2] ;3
+ movq xmm4, [rsi + rdx] ;4
+ movq xmm5, [rsi + rax * 4] ;5
+
+ HIGH_APPLY_FILTER_4 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 7
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_highbd_filter_block1d8_v8_sse2
+;(
+;    uint16_t *src_ptr,
+;    unsigned int src_pitch,
+;    uint16_t *output_ptr,
+;    unsigned int out_pitch,
+;    unsigned int output_height,
+;    short *filter,
+;    int bd
+;)
+globalsym(aom_highbd_filter_block1d8_v8_sse2)
+sym(aom_highbd_filter_block1d8_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ HIGH_APPLY_FILTER_8 0, 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_highbd_filter_block1d16_v8_sse2
+;(
+;    uint16_t *src_ptr,
+;    unsigned int src_pitch,
+;    uint16_t *output_ptr,
+;    unsigned int out_pitch,
+;    unsigned int output_height,
+;    short *filter,
+;    int bd
+;)
+globalsym(aom_highbd_filter_block1d16_v8_sse2)
+sym(aom_highbd_filter_block1d16_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ HIGH_APPLY_FILTER_8 0, 0
+ sub rsi, rax
+
+ LOAD_VERT_8 16
+ HIGH_APPLY_FILTER_8 0, 16
+ add rdi, rbx
+
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_highbd_filter_block1d4_h8_sse2
+;(
+;    uint16_t *src_ptr,
+;    unsigned int src_pixels_per_line,
+;    uint16_t *output_ptr,
+;    unsigned int output_pitch,
+;    unsigned int output_height,
+;    short *filter,
+;    int bd
+;)
+globalsym(aom_highbd_filter_block1d4_h8_sse2)
+sym(aom_highbd_filter_block1d4_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 7
+ %define k0k6 [rsp + 16 * 0]
+ %define k2k5 [rsp + 16 * 1]
+ %define k3k4 [rsp + 16 * 2]
+ %define k1k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define max [rsp + 16 * 5]
+ %define min [rsp + 16 * 6]
+
+ HIGH_GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm4, [rsi + 2]
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm4
+
+ psrldq xmm1, 2
+ psrldq xmm6, 4
+ psrldq xmm7, 6
+ psrldq xmm2, 4
+ psrldq xmm3, 6
+ psrldq xmm5, 2
+
+ HIGH_APPLY_FILTER_4 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 7
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_highbd_filter_block1d8_h8_sse2
+;(
+;    uint16_t *src_ptr,
+;    unsigned int src_pixels_per_line,
+;    uint16_t *output_ptr,
+;    unsigned int output_pitch,
+;    unsigned int output_height,
+;    short *filter,
+;    int bd
+;)
+globalsym(aom_highbd_filter_block1d8_h8_sse2)
+sym(aom_highbd_filter_block1d8_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm1, [rsi - 4]
+ movdqu xmm2, [rsi - 2]
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi + 2]
+ movdqu xmm5, [rsi + 4]
+ movdqu xmm6, [rsi + 6]
+ movdqu xmm7, [rsi + 8]
+
+ HIGH_APPLY_FILTER_8 0, 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_highbd_filter_block1d16_h8_sse2
+;(
+;    uint16_t *src_ptr,
+;    unsigned int src_pixels_per_line,
+;    uint16_t *output_ptr,
+;    unsigned int output_pitch,
+;    unsigned int output_height,
+;    short *filter,
+;    int bd
+;)
+globalsym(aom_highbd_filter_block1d16_h8_sse2)
+sym(aom_highbd_filter_block1d16_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm1, [rsi - 4]
+ movdqu xmm2, [rsi - 2]
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi + 2]
+ movdqu xmm5, [rsi + 4]
+ movdqu xmm6, [rsi + 6]
+ movdqu xmm7, [rsi + 8]
+
+ HIGH_APPLY_FILTER_8 0, 0
+
+ movdqu xmm0, [rsi + 10] ;load src
+ movdqu xmm1, [rsi + 12]
+ movdqu xmm2, [rsi + 14]
+ movdqu xmm3, [rsi + 16]
+ movdqu xmm4, [rsi + 18]
+ movdqu xmm5, [rsi + 20]
+ movdqu xmm6, [rsi + 22]
+ movdqu xmm7, [rsi + 24]
+
+ HIGH_APPLY_FILTER_8 0, 16
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
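
The 8-tap macros above pair the filter taps so that tap3 and tap4 are applied and added last (see the note at the top of the file) to keep the 32-bit intermediates in range; every path then adds the rounding constant 0x40, arithmetic-shifts by 7 and clamps to the [0, (1 << bd) - 1] pixel range held in max/min. A scalar model of one output sample is sketched below (an illustration, not the asm's exact intermediate saturation behaviour):

/* Scalar model of one high bit-depth 8-tap output sample: 32-bit accumulate,
 * round by 64, shift by 7 (matches psrad xmm0, 7), clamp to the bd-bit range. */
static uint16_t highbd_filter8_sample_sketch(const uint16_t *src,
                                             const int16_t *filter, int bd) {
  const int32_t max = (1 << bd) - 1;
  int32_t sum = 64;  // rounding, matches the 0x40 constant loaded into krd
  for (int k = 0; k < 8; ++k) sum += (int32_t)src[k] * filter[k];
  sum >>= 7;
  if (sum < 0) sum = 0;
  if (sum > max) sum = max;
  return (uint16_t)sum;
}
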
diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
new file mode 100644
index 0000000000..db4cad9bcb
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
@@ -0,0 +1,367 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+%macro HIGH_GET_PARAM_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x00000040
+
+ movdqa xmm3, [rdx] ;load filters
+ pshuflw xmm4, xmm3, 11111111b ;k3
+ psrldq xmm3, 8
+ pshuflw xmm3, xmm3, 0b ;k4
+ punpcklwd xmm4, xmm3 ;k3k4
+
+ movq xmm3, rcx ;rounding
+ pshufd xmm3, xmm3, 0
+
+ mov rdx, 0x00010001
+ movsxd rcx, DWORD PTR arg(6) ;bps
+ movq xmm5, rdx
+ movq xmm2, rcx
+ pshufd xmm5, xmm5, 0b
+ movdqa xmm1, xmm5
+ psllw xmm5, xmm2
+ psubw xmm5, xmm1 ;max value (for clamping)
+ pxor xmm2, xmm2 ;min value (for clamping)
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro HIGH_APPLY_FILTER_4 1
+
+  punpcklwd xmm0, xmm1 ;two rows in one register
+ pmaddwd xmm0, xmm4 ;multiply the filter factors
+
+ paddd xmm0, xmm3 ;rounding
+ psrad xmm0, 7 ;shift
+ packssdw xmm0, xmm0 ;pack to word
+
+ ;clamp the values
+ pminsw xmm0, xmm5
+ pmaxsw xmm0, xmm2
+
+%if %1
+ movq xmm1, [rdi]
+ pavgw xmm0, xmm1
+%endif
+
+ movq [rdi], xmm0
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ dec rcx
+%endm
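
HIGH_APPLY_FILTER_4 interleaves the two source vectors against the packed k3/k4 taps, multiplies with pmaddwd, adds the 0x40 rounding constant, shifts right by 7, and clamps to the bit-depth range built in HIGH_GET_PARAM_4. A minimal scalar sketch of that per-sample computation (hypothetical helper, illustrative only):

static inline uint16_t highbd_bilinear_sample(uint16_t s0, uint16_t s1,
                                              int k3, int k4, int bd) {
  const int max = (1 << bd) - 1;     // psllw/psubw sequence in HIGH_GET_PARAM_4
  int sum = s0 * k3 + s1 * k4 + 64;  // pmaddwd, then paddd with krd (0x40)
  sum >>= 7;                         // psrad xmm0, 7
  if (sum < 0) sum = 0;              // pmaxsw against the min register
  if (sum > max) sum = max;          // pminsw against the max register
  return (uint16_t)sum;
}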
+
+%macro HIGH_GET_PARAM 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x00000040
+
+ movdqa xmm6, [rdx] ;load filters
+
+ pshuflw xmm7, xmm6, 11111111b ;k3
+ pshufhw xmm6, xmm6, 0b ;k4
+ psrldq xmm6, 8
+ punpcklwd xmm7, xmm6 ;k3k4k3k4k3k4k3k4
+
+ movq xmm4, rcx ;rounding
+ pshufd xmm4, xmm4, 0
+
+ mov rdx, 0x00010001
+ movsxd rcx, DWORD PTR arg(6) ;bps
+ movq xmm3, rdx
+ movq xmm5, rcx
+ pshufd xmm3, xmm3, 0b
+ movdqa xmm1, xmm3
+ psllw xmm3, xmm5
+ psubw xmm3, xmm1 ;max value (for clamping)
+ pxor xmm5, xmm5 ;min value (for clamping)
+
+ movdqa max, xmm3
+ movdqa min, xmm5
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro HIGH_APPLY_FILTER_8 1
+ movdqa xmm6, xmm0
+ punpckhwd xmm6, xmm1
+ punpcklwd xmm0, xmm1
+ pmaddwd xmm6, xmm7
+ pmaddwd xmm0, xmm7
+
+ paddd xmm6, xmm4 ;rounding
+ paddd xmm0, xmm4 ;rounding
+ psrad xmm6, 7 ;shift
+ psrad xmm0, 7 ;shift
+ packssdw xmm0, xmm6 ;pack back to word
+
+ ;clamp the values
+ pminsw xmm0, max
+ pmaxsw xmm0, min
+
+%if %1
+ movdqu xmm1, [rdi]
+ pavgw xmm0, xmm1
+%endif
+ movdqu [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ dec rcx
+%endm
+
+%macro HIGH_APPLY_FILTER_16 1
+ movdqa xmm5, xmm0
+ movdqa xmm6, xmm2
+ punpckhwd xmm5, xmm1
+ punpckhwd xmm6, xmm3
+ punpcklwd xmm0, xmm1
+ punpcklwd xmm2, xmm3
+
+ pmaddwd xmm5, xmm7
+ pmaddwd xmm6, xmm7
+ pmaddwd xmm0, xmm7
+ pmaddwd xmm2, xmm7
+
+ paddd xmm5, xmm4 ;rounding
+ paddd xmm6, xmm4
+ paddd xmm0, xmm4
+ paddd xmm2, xmm4
+
+ psrad xmm5, 7 ;shift
+ psrad xmm6, 7
+ psrad xmm0, 7
+ psrad xmm2, 7
+
+ packssdw xmm0, xmm5 ;pack back to word
+ packssdw xmm2, xmm6 ;pack back to word
+
+ ;clamp the values
+ pminsw xmm0, max
+ pmaxsw xmm0, min
+ pminsw xmm2, max
+ pmaxsw xmm2, min
+
+%if %1
+ movdqu xmm1, [rdi]
+ movdqu xmm3, [rdi + 16]
+ pavgw xmm0, xmm1
+ pavgw xmm2, xmm3
+%endif
+ movdqu [rdi], xmm0 ;store the result
+ movdqu [rdi + 16], xmm2 ;store the result
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ dec rcx
+%endm
+
+SECTION .text
+
+globalsym(aom_highbd_filter_block1d4_v2_sse2)
+sym(aom_highbd_filter_block1d4_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM_4
+.loop:
+ movq xmm0, [rsi] ;load src
+ movq xmm1, [rsi + 2*rax]
+
+ HIGH_APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_highbd_filter_block1d8_v2_sse2)
+sym(aom_highbd_filter_block1d8_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 8
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 2
+ %define max [rsp + 16 * 0]
+ %define min [rsp + 16 * 1]
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + 2*rax] ;1
+
+ HIGH_APPLY_FILTER_8 0
+ jnz .loop
+
+ add rsp, 16 * 2
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_highbd_filter_block1d16_v2_sse2)
+sym(aom_highbd_filter_block1d16_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 9
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 2
+ %define max [rsp + 16 * 0]
+ %define min [rsp + 16 * 1]
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm2, [rsi + 16]
+ movdqu xmm1, [rsi + 2*rax] ;1
+ movdqu xmm3, [rsi + 2*rax + 16]
+
+ HIGH_APPLY_FILTER_16 0
+ jnz .loop
+
+ add rsp, 16 * 2
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_highbd_filter_block1d4_h2_sse2)
+sym(aom_highbd_filter_block1d4_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 2
+
+ HIGH_APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_highbd_filter_block1d8_h2_sse2)
+sym(aom_highbd_filter_block1d8_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 8
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 2
+ %define max [rsp + 16 * 0]
+ %define min [rsp + 16 * 1]
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 2]
+
+ HIGH_APPLY_FILTER_8 0
+ jnz .loop
+
+ add rsp, 16 * 2
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_highbd_filter_block1d16_h2_sse2)
+sym(aom_highbd_filter_block1d16_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 9
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 2
+ %define max [rsp + 16 * 0]
+ %define min [rsp + 16 * 1]
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 2]
+ movdqu xmm2, [rsi + 16]
+ movdqu xmm3, [rsi + 18]
+
+ HIGH_APPLY_FILTER_16 0
+ jnz .loop
+
+ add rsp, 16 * 2
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/aom_dsp/x86/aom_quantize_avx.c b/third_party/aom/aom_dsp/x86/aom_quantize_avx.c
new file mode 100644
index 0000000000..b2d6d4b76d
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_quantize_avx.c
@@ -0,0 +1,282 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/bitdepth_conversion_sse2.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant,
+ tran_low_t *dqcoeff) {
+ const __m128i low = _mm_mullo_epi16(qcoeff, dequant);
+ const __m128i high = _mm_mulhi_epi16(qcoeff, dequant);
+
+ const __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high);
+ const __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high);
+
+ _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0);
+ _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1);
+}
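
The mullo/mulhi pair above recovers the full signed 32-bit product of each 16-bit qcoeff/dequant lane, and the unpacks interleave the low and high halves back into dword order. A scalar view of the same store (illustrative only; in the real call the dequant register carries the DC value in lane 0 and the AC value elsewhere):

static inline void dqcoeff_scalar_8(const int16_t *qcoeff,
                                    const int16_t *dequant,
                                    tran_low_t *dqcoeff) {
  for (int i = 0; i < 8; ++i) {
    // same result as interleaving _mm_mullo_epi16 / _mm_mulhi_epi16
    dqcoeff[i] = (int32_t)qcoeff[i] * dequant[i];
  }
}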
+
+void aom_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m256i big_zero = _mm256_setzero_si256();
+ int index;
+
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i all_zero;
+ __m128i eob = zero, eob0;
+
+ (void)scan;
+
+ *eob_ptr = 0;
+
+ load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+ dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_tran_low(coeff_ptr);
+ coeff1 = load_tran_low(coeff_ptr + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_test_all_zeros(all_zero, all_zero)) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero);
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero);
+
+ if (n_coeffs == 16) return;
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ // Reinsert signs
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr);
+ store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+ calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
+
+ eob =
+ scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ }
+
+ // AC only loop.
+ for (index = 16; index < n_coeffs; index += 16) {
+ coeff0 = load_tran_low(coeff_ptr + index);
+ coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_test_all_zeros(all_zero, all_zero)) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero);
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero);
+ continue;
+ }
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr + index);
+ store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+ calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
+ calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
+
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
+ zero);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
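
As a rough scalar model of the per-coefficient work above (hedged: it mirrors the calculate_qcoeff / calculate_dqcoeff_and_store helpers, ignores the saturating 16-bit intermediates, and is not a drop-in replacement for the library's quantize_b):

static inline int quantize_one(int16_t coeff, int16_t zbin, int16_t round,
                               int16_t quant, int16_t quant_shift,
                               int16_t dequant, tran_low_t *dqcoeff) {
  const int abs_coeff = coeff < 0 ? -coeff : coeff;
  if (abs_coeff < zbin) {  // below the zero-bin threshold: quantized to zero
    *dqcoeff = 0;
    return 0;
  }
  const int tmp = abs_coeff + round;
  const int abs_q = ((((tmp * quant) >> 16) + tmp) * quant_shift) >> 16;
  const int q = coeff < 0 ? -abs_q : abs_q;  // reinsert the sign
  *dqcoeff = q * dequant;
  return q;
}

The eob written at the end is then, in effect, one plus the largest iscan index among the nonzero quantized coefficients (see scan_for_eob and accumulate_eob).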
+
+void aom_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m256i big_zero = _mm256_setzero_si256();
+ int index;
+ const int log_scale = 1;
+
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i all_zero;
+ __m128i eob = zero, eob0;
+
+ (void)scan;
+
+ // Setup global values.
+  // The 32x32 transform halves zbin and round (log_scale = 1).
+ zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ // Shift with rounding.
+ zbin = _mm_add_epi16(zbin, one);
+ zbin = _mm_srli_epi16(zbin, 1);
+ // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so
+ // it is a strict "greater" comparison.
+ zbin = _mm_sub_epi16(zbin, one);
+
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ round = _mm_add_epi16(round, one);
+ round = _mm_srli_epi16(round, 1);
+
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_tran_low(coeff_ptr);
+ coeff1 = load_tran_low(coeff_ptr + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC.
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_test_all_zeros(all_zero, all_zero)) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero);
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
+
+ // Reinsert signs.
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ // Mask out zbin threshold coeffs.
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr);
+ store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+ calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr,
+ &log_scale);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
+ dqcoeff_ptr + 8, &log_scale);
+
+ eob =
+ scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ }
+
+ // AC only loop.
+ for (index = 16; index < n_coeffs; index += 16) {
+ coeff0 = load_tran_low(coeff_ptr + index);
+ coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_test_all_zeros(all_zero, all_zero)) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero);
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero);
+ continue;
+ }
+
+ calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
+ calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
+
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr + index);
+ store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+ calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero,
+ dqcoeff_ptr + index, &log_scale);
+ calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
+ dqcoeff_ptr + index + 8, &log_scale);
+
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
+ zero);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
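
A numeric sketch of the 32x32 threshold setup above (illustrative): with log_scale = 1 the zero-bin is halved with rounding, then reduced by one so that the strict pcmpgtw comparison behaves like greater-or-equal.

static inline int16_t zbin_threshold_32x32(int16_t zbin) {
  const int16_t halved = (zbin + 1) >> 1;  // same as ROUND_POWER_OF_TWO(zbin, 1)
  return halved - 1;                       // compare with ">" instead of ">="
}
// Example: zbin = 21 -> halved = 11 -> the code tests abs(coeff) > 10,
// which is equivalent to abs(coeff) >= 11.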
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
new file mode 100644
index 0000000000..22f2e696d3
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
@@ -0,0 +1,1441 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/convolve.h"
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_ports/mem.h"
+
+#if defined(__clang__)
+#if (__clang_major__ > 0 && __clang_major__ < 3) || \
+ (__clang_major__ == 3 && __clang_minor__ <= 3) || \
+ (defined(__APPLE__) && defined(__apple_build_version__) && \
+ ((__clang_major__ == 4 && __clang_minor__ <= 2) || \
+ (__clang_major__ == 5 && __clang_minor__ == 0)))
+#define MM256_BROADCASTSI128_SI256(x) \
+ _mm_broadcastsi128_si256((__m128i const *)&(x))
+#else // clang > 3.3, and not 5.0 on macosx.
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#endif // clang <= 3.3
+#elif defined(__GNUC__)
+#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
+#define MM256_BROADCASTSI128_SI256(x) \
+ _mm_broadcastsi128_si256((__m128i const *)&(x))
+#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
+#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
+#else // gcc > 4.7
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#endif // gcc <= 4.6
+#else // !(gcc || clang)
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#endif // __clang__
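
The conditional block above only works around old clang/gcc releases in which the 128-to-256-bit broadcast intrinsic was missing or misnamed; on current compilers the macro is plain _mm256_broadcastsi128_si256. In portable terms it yields a 256-bit value with the 128-bit input copied into both lanes, e.g. (illustrative helper):

static inline __m256i broadcast_si128_sketch(__m128i x) {
  // both 128-bit lanes of the result hold x
  return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
}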
+
+static INLINE void xx_storeu2_epi32(const uint8_t *output_ptr,
+ const ptrdiff_t stride, const __m256i *a) {
+ *((int *)(output_ptr)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*a));
+ *((int *)(output_ptr + stride)) =
+ _mm_cvtsi128_si32(_mm256_extracti128_si256(*a, 1));
+}
+
+static INLINE __m256i xx_loadu2_epi64(const void *hi, const void *lo) {
+ __m256i a = _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(lo)));
+ a = _mm256_inserti128_si256(a, _mm_loadl_epi64((const __m128i *)(hi)), 1);
+ return a;
+}
+
+static INLINE void xx_storeu2_epi64(const uint8_t *output_ptr,
+ const ptrdiff_t stride, const __m256i *a) {
+ _mm_storel_epi64((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
+ _mm_storel_epi64((__m128i *)(output_ptr + stride),
+ _mm256_extractf128_si256(*a, 1));
+}
+
+static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) {
+ __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo)));
+ a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1);
+ return a;
+}
+
+static INLINE void xx_store2_mi128(const uint8_t *output_ptr,
+ const ptrdiff_t stride, const __m256i *a) {
+ _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
+ _mm_store_si128((__m128i *)(output_ptr + stride),
+ _mm256_extractf128_si256(*a, 1));
+}
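
These helpers pack two image rows into the low and high 128-bit lanes of a single ymm register, so one 256-bit shuffle/multiply sequence filters both rows at once. A small usage sketch (illustrative; xx_store2_mi128 uses aligned stores, so dst must be 16-byte aligned):

static void copy_two_rows_16(const uint8_t *src, ptrdiff_t src_stride,
                             uint8_t *dst, ptrdiff_t dst_stride) {
  // row 0 goes to the low lane, row 1 to the high lane
  const __m256i two_rows = xx_loadu2_mi128(src + src_stride, src);
  xx_store2_mi128(dst, dst_stride, &two_rows);
}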
+
+static void aom_filter_block1d4_h4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+  // convert the 16 bit (short) taps to 8 bit (byte) and keep the same data
+  // in both halves of the 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ firstFilters =
+ _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
+ filt1Reg = _mm256_load_si256((__m256i const *)(filt4_d4_global_avx2));
+
+  // multiply the size of the source and destination stride by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load the 2 strides of source
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+
+ srcRegFilt32b1_1 =
+ _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve result
+ srcRegFilt32b1_1 =
+ _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
+ output_ptr += dst_stride;
+ }
+
+  // if the number of output rows is odd,
+  // process the last row (4 bytes) separately
+ if (i > 0) {
+ __m128i srcReg1, srcRegFilt1_1;
+
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+ // filter the source buffer
+ srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt1_1 =
+ _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
+
+ srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
+ // shift by 6 bit each 16 bit
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve result
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+ // save 4 bytes
+ *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
+ }
+}
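
A scalar model of the arithmetic in the kernel above (hedged: it ignores the saturating adds of the SIMD path). Because the Q7 taps were pre-shifted right by one, the rounding constant is 32 and the final shift is 6 instead of the usual 64 and 7; only taps 2..5 contribute in this 4-tap variant, and src here is the un-shifted source row (the kernel applies src_ptr -= 3 itself):

static inline uint8_t convolve4_px(const uint8_t *src, const int16_t *filter,
                                   int x) {
  int sum = 32;  // addFilterReg32
  for (int k = 2; k <= 5; ++k) {
    sum += (filter[k] >> 1) * src[x + k - 3];  // taps 0, 1, 6, 7 assumed zero
  }
  sum >>= 6;                                   // srai by 6
  if (sum < 0) sum = 0;                        // packus clamps to [0, 255]
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}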
+
+static void aom_filter_block1d4_h8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt1Reg, filt2Reg;
+ __m256i firstFilters, secondFilters;
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2;
+ __m256i srcReg32b1;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+  // convert the 16 bit (short) taps to 8 bit (byte) and keep the same data
+  // in both halves of the 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the first 32 bits
+ firstFilters = _mm256_shuffle_epi32(filtersReg32, 0);
+ // duplicate only the second 32 bits
+ secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55);
+
+ filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2);
+ filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32));
+
+  // multiply the size of the source and destination stride by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load the 2 strides of source
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+
+ // filter the source buffer
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);
+
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
+
+ srcRegFilt32b1_1 =
+ _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve result
+ srcRegFilt32b1_1 =
+ _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
+ output_ptr += dst_stride;
+ }
+
+  // if the number of output rows is odd,
+  // process the last row (4 bytes) separately
+ if (i > 0) {
+ __m128i srcReg1, srcRegFilt1_1;
+ __m128i srcRegFilt2;
+
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+ // filter the source buffer
+ srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt1_1 =
+ _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
+
+ // filter the source buffer
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
+
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+ srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
+ // shift by 6 bit each 16 bit
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve result
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+ // save 4 bytes
+ *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
+ }
+}
+
+static void aom_filter_block1d8_h4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt2Reg, filt3Reg;
+ __m256i secondFilters, thirdFilters;
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m256i srcReg32b1, filtersReg32;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+  // convert the 16 bit (short) taps to 8 bit (byte) and keep the same data
+  // in both halves of the 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+  // duplicate only the second 16 bits (third and fourth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+
+ filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+
+ // multiply the size of the source and destination stride by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load the 2 strides of source
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+
+    // shrink each 16-bit value to 8 bits
+ srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b1_1);
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);
+ output_ptr += dst_stride;
+ }
+
+  // if the number of output rows is odd,
+  // process the last row (8 bytes) separately
+ if (i > 0) {
+ __m128i srcReg1, srcRegFilt1_1;
+ __m128i srcRegFilt2, srcRegFilt3;
+
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+ // filter the source buffer
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+ srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
+ srcRegFilt3 =
+ _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt2, srcRegFilt3);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+    // shrink each 16-bit value to 8 bits
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+ // save 8 bytes
+ _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
+ }
+}
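
The h4/v4 kernels in this file are only exact for filters whose outer taps are zero, which is why only the second and third tap pairs are loaded above. A sketch of that precondition (hypothetical helper; the dispatch that enforces it lives elsewhere in the library):

static inline int filter8_is_4tap(const int16_t *filter) {
  // only filter[2] .. filter[5] may be nonzero for the 4-tap kernels
  return filter[0] == 0 && filter[1] == 0 && filter[6] == 0 && filter[7] == 0;
}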
+
+static void aom_filter_block1d8_h8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+ __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m256i srcReg32b1;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+  // convert the 16 bit (short) taps to 8 bit (byte) and keep the same data
+  // in both halves of the 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the first 16 bits (first and second byte)
+ // across 256 bit register
+ firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and fourth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits (seventh and eighth byte)
+ // across 256 bit register
+ forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
+
+ filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+  // multiply the size of the source and destination stride by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load the 2 strides of source
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve result
+ srcRegFilt32b1_1 =
+ _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);
+ output_ptr += dst_stride;
+ }
+
+  // if the number of output rows is odd,
+  // process the last row (8 bytes) separately
+ if (i > 0) {
+ __m128i srcReg1, srcRegFilt1_1;
+ __m128i srcRegFilt2, srcRegFilt3;
+
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+ // filter the source buffer
+ srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1_1 =
+ _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+
+ // filter the source buffer
+ srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt3 =
+ _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve
+    // result
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+ // save 8 bytes
+ _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
+ }
+}
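
The 8-tap path above splits the filter into four tap pairs, evaluates one _mm256_maddubs_epi16 per pair, and combines the partial sums with saturating adds. A scalar model of one output pixel (hedged, ignoring the intermediate saturation; src is again the un-shifted row):

static inline uint8_t convolve8_px(const uint8_t *src, const int16_t *filter,
                                   int x) {
  int sum = 32;
  for (int k = 0; k < 8; k += 2) {
    // one maddubs step: pixel * f[k] + next_pixel * f[k + 1]
    sum += (filter[k] >> 1) * src[x + k - 3] +
           (filter[k + 1] >> 1) * src[x + k - 2];
  }
  sum >>= 6;
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}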
+
+static void aom_filter_block1d16_h4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt2Reg, filt3Reg;
+ __m256i secondFilters, thirdFilters;
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m256i srcReg32b1, srcReg32b2, filtersReg32;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+  // convert the 16 bit (short) taps to 8 bit (byte) and keep the same data
+  // in both halves of the 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+  // duplicate only the second 16 bits (third and fourth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+
+ filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+
+ // multiply the size of the source and destination stride by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load the 2 strides of source
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+    // read the next 16 bytes of both rows
+    // (these partially overlap the earlier reads)
+ srcReg32b2 =
+ xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+ srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);
+
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve result
+ srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
+
+ src_ptr += src_stride;
+
+ xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
+ output_ptr += dst_stride;
+ }
+
+  // if the number of output rows is odd,
+  // process the last row (16 bytes) separately
+ if (i > 0) {
+ __m256i srcReg1, srcReg12;
+ __m256i srcRegFilt2, srcRegFilt3, srcRegFilt1_1;
+
+ srcReg1 = _mm256_loadu_si256((const __m256i *)(src_ptr));
+ srcReg12 = _mm256_permute4x64_epi64(srcReg1, 0x94);
+
+ // filter the source buffer
+ srcRegFilt2 = _mm256_shuffle_epi8(srcReg12, filt2Reg);
+ srcRegFilt3 = _mm256_shuffle_epi8(srcReg12, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt2 = _mm256_maddubs_epi16(srcRegFilt2, secondFilters);
+ srcRegFilt3 = _mm256_maddubs_epi16(srcRegFilt3, thirdFilters);
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt2, srcRegFilt3);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt1_1, addFilterReg32);
+ srcRegFilt1_1 = _mm256_srai_epi16(srcRegFilt1_1, 6);
+
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve
+    // result
+ srcRegFilt1_1 = _mm256_packus_epi16(srcRegFilt1_1, srcRegFilt1_1);
+ srcRegFilt1_1 = _mm256_permute4x64_epi64(srcRegFilt1_1, 0x8);
+
+ // save 16 bytes
+ _mm_store_si128((__m128i *)output_ptr,
+ _mm256_castsi256_si128(srcRegFilt1_1));
+ }
+}
+
+static void aom_filter_block1d16_h8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+ __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m256i srcReg32b1, srcReg32b2, filtersReg32;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+  // convert the 16 bit (short) taps to 8 bit (byte) and keep the same data
+  // in both halves of the 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the first 16 bits (first and second byte)
+ // across 256 bit register
+ firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and fourth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits (seventh and eighth byte)
+ // across 256 bit register
+ forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
+
+ filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+  // multiply the size of the source and destination stride by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load the 2 strides of source
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23);
+
+    // read the next 16 bytes of both rows
+    // (these partially overlap the earlier reads)
+ srcReg32b2 =
+ xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
+
+ // filter the source buffer
+ srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b2_1 = _mm256_adds_epi16(
+ srcRegFilt32b2_1, _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2));
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+ srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);
+
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve result
+ srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
+
+ src_ptr += src_stride;
+
+ xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
+ output_ptr += dst_stride;
+ }
+
+  // if the number of output rows is odd,
+  // process the last row (16 bytes) separately
+ if (i > 0) {
+ __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
+ __m128i srcRegFilt2, srcRegFilt3;
+
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+ // filter the source buffer
+ srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1_1 =
+ _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+
+ // filter the source buffer
+ srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt3 =
+ _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
+
+    // read the next 16 bytes
+    // (these partially overlap the earlier read)
+ srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
+
+ // filter the source buffer
+ srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt2_1 =
+ _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
+
+ // add and saturate the results together
+ srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
+
+ // filter the source buffer
+ srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt3 =
+ _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt2_1 =
+ _mm_adds_epi16(srcRegFilt2_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+ srcRegFilt2_1 =
+ _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 6);
+
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve
+    // result
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
+
+ // save 16 bytes
+ _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
+ }
+}
+
+static void aom_filter_block1d8_v4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i filtersReg32, addFilterReg32;
+ __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
+ __m256i srcReg23_34_lo, srcReg45_56_lo;
+ __m256i resReg23_34_lo, resReg45_56_lo;
+ __m256i resReglo, resReg;
+ __m256i secondFilters, thirdFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // convert the 16 bit (short) taps to 8 bit (byte) and keep the
+  // same data in both halves of the 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+  // duplicate only the second 16 bits (third and fourth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+
+  // multiply the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+ srcReg4x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));
+
+  // keep consecutive rows in the same 256 bit register
+ srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
+
+ srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
+
+ for (i = output_height; i > 1; i -= 2) {
+    // load the next 2 rows (8 bytes each) and place every two
+    // consecutive rows in the same 256 bit register
+ srcReg5x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
+ srcReg45 =
+ _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
+
+ srcReg6x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
+ srcReg56 =
+ _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
+
+ // merge every two consecutive registers
+ srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
+ resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
+
+ // add and saturate the results together
+ resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
+
+ // shift by 6 bit each 16 bit
+ resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
+ resReglo = _mm256_srai_epi16(resReglo, 6);
+
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve
+    // result
+ resReg = _mm256_packus_epi16(resReglo, resReglo);
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi64(output_ptr, out_pitch, &resReg);
+
+ output_ptr += dst_stride;
+
+    // save part of the registers for the next iteration
+ srcReg23_34_lo = srcReg45_56_lo;
+ srcReg4x = srcReg6x;
+ }
+}
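
The vertical loop above reuses the interleaved row pairs from the previous iteration, so only two new source rows are loaded per pair of output rows. A scalar model of a single output sample (hedged; it assumes src points at the first row of the 8-tap window, i.e. three rows above the output row, and that only taps 2..5 are nonzero):

static inline uint8_t convolve4_v_px(const uint8_t *src, ptrdiff_t pitch,
                                     const int16_t *filter) {
  int sum = 32;
  for (int k = 2; k <= 5; ++k) {
    sum += (filter[k] >> 1) * src[k * pitch];  // rows 2..5 of the window
  }
  sum >>= 6;
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}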
+
+static void aom_filter_block1d8_v8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32;
+ __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
+ __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
+ __m256i srcReg32b11, srcReg32b12, filtersReg32;
+ __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // convert the 16 bit (short) taps to 8 bit (byte) and keep the
+  // same data in both halves of the 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the first 16 bits (first and second byte)
+ // across 256 bit register
+ firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and fourth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits (seventh and eighth byte)
+ // across 256 bit register
+ forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
+
+  // multiply the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+  // load the first 7 rows (8 bytes each), one src_pitch apart
+ srcReg32b1 = xx_loadu2_epi64(src_ptr + src_pitch, src_ptr);
+ srcReg32b3 =
+ xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+ srcReg32b5 =
+ xx_loadu2_epi64(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
+ srcReg32b7 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
+
+  // keep each pair of consecutive rows in the same 256 bit register
+ srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
+ srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
+ srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
+ // merge every two consecutive registers except the last one
+ srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
+ srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
+ srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
+
+ for (i = output_height; i > 1; i -= 2) {
+    // load the next 2 rows (8 bytes each) and place every two
+    // consecutive rows in the same 256 bit register
+ srcReg32b8 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)));
+ srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
+ _mm256_castsi256_si128(srcReg32b8), 1);
+ srcReg32b9 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 8)));
+ srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
+ _mm256_castsi256_si128(srcReg32b9), 1);
+
+ // merge every two consecutive registers
+ // save
+ srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
+ srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
+
+ // add and saturate the results together
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
+ srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
+
+ // add and saturate the results together
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
+ _mm256_adds_epi16(srcReg32b8, srcReg32b12));
+
+ // shift by 6 bit each 16 bit
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
+ srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);
+
+ // pack each 16-bit value to 8 bits with saturation; the low 128-bit lane
+ // holds the first output row and the high lane holds the second
+ srcReg32b1 = _mm256_packus_epi16(srcReg32b10, _mm256_setzero_si256());
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi64(output_ptr, out_pitch, &srcReg32b1);
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ srcReg32b10 = srcReg32b11;
+ srcReg32b11 = srcReg32b2;
+ srcReg32b2 = srcReg32b4;
+ srcReg32b7 = srcReg32b9;
+ }
+ if (i > 0) {
+ __m128i srcRegFilt1, srcRegFilt4, srcRegFilt6, srcRegFilt8;
+ // load the last row (8 bytes)
+ srcRegFilt8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
+
+ // merge the last 2 results together
+ srcRegFilt4 =
+ _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
+ _mm256_castsi256_si128(firstFilters));
+ srcRegFilt4 =
+ _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
+
+ // add and saturate the results together
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
+ _mm256_castsi256_si128(secondFilters));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
+ _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt1 =
+ _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt1 =
+ _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);
+
+ // pack the 16-bit results to 8 bits with saturation
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, _mm_setzero_si128());
+
+ // save 8 bytes
+ _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1);
+ }
+}
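+
+// The kernels above keep the 8 filter taps pre-halved (the srai by 1 applied
+// when the coefficients are loaded), so the final rounding step is "+32, >> 6"
+// instead of the usual "+64, >> FILTER_BITS". A minimal scalar sketch of the
+// per-pixel arithmetic being vectorized; the helper name and layout are
+// illustrative only, not part of the library:
+#if 0  // illustrative sketch, not compiled
+static uint8_t scalar_vert8_halved_taps(const uint8_t *src, ptrdiff_t pitch,
+                                        const int16_t *halved_taps) {
+  int sum = 0;
+  // 8-tap vertical sum over consecutive rows of the same column.
+  for (int k = 0; k < 8; ++k) sum += halved_taps[k] * src[k * pitch];
+  sum = (sum + 32) >> 6;  // rounding, matching addFilterReg32 and the srai by 6
+  if (sum < 0) sum = 0;   // saturate to [0, 255] like _mm256_packus_epi16
+  if (sum > 255) sum = 255;
+  return (uint8_t)sum;
+}
+#endif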
+
+static void aom_filter_block1d16_v4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i filtersReg32, addFilterReg32;
+ __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
+ __m256i srcReg23_34_lo, srcReg23_34_hi, srcReg45_56_lo, srcReg45_56_hi;
+ __m256i resReg23_34_lo, resReg23_34_hi, resReg45_56_lo, resReg45_56_hi;
+ __m256i resReglo, resReghi, resReg;
+ __m256i secondFilters, thirdFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // convert the 16-bit (short) coefficients to 8-bit (byte) and place the
+ // same data in both lanes of the 128-bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the second 16 bits (third and fourth bytes)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+
+ // multiply the source and destination strides by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg23 = xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+ srcReg4x = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
+
+ // keep consecutive rows in the same 256-bit register
+ srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
+
+ srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
+ srcReg23_34_hi = _mm256_unpackhi_epi8(srcReg23, srcReg34);
+
+ for (i = output_height; i > 1; i -= 2) {
+ // load the next 2 rows and keep every two consecutive rows in the same
+ // 256-bit register
+ srcReg5x = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
+ srcReg45 =
+ _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
+
+ srcReg6x = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
+ srcReg56 =
+ _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
+
+ // merge every two consecutive registers
+ srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
+ srcReg45_56_hi = _mm256_unpackhi_epi8(srcReg45, srcReg56);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
+ resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
+
+ // add and saturate the results together
+ resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReg23_34_hi = _mm256_maddubs_epi16(srcReg23_34_hi, secondFilters);
+ resReg45_56_hi = _mm256_maddubs_epi16(srcReg45_56_hi, thirdFilters);
+
+ // add and saturate the results together
+ resReghi = _mm256_adds_epi16(resReg23_34_hi, resReg45_56_hi);
+
+ // shift by 6 bit each 16 bit
+ resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
+ resReghi = _mm256_adds_epi16(resReghi, addFilterReg32);
+ resReglo = _mm256_srai_epi16(resReglo, 6);
+ resReghi = _mm256_srai_epi16(resReghi, 6);
+
+ // pack each 16-bit value to 8 bits with saturation; the low 128-bit lane
+ // holds the first output row and the high lane holds the second
+ resReg = _mm256_packus_epi16(resReglo, resReghi);
+
+ src_ptr += src_stride;
+
+ xx_store2_mi128(output_ptr, out_pitch, &resReg);
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ srcReg23_34_lo = srcReg45_56_lo;
+ srcReg23_34_hi = srcReg45_56_hi;
+ srcReg4x = srcReg6x;
+ }
+}
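+
+// The *_v4 kernels above form only secondFilters (taps 2/3) and thirdFilters
+// (taps 4/5); they are presumably selected when the outer taps of the 8-tap
+// kernel are zero. A scalar sketch of one output pixel, matching the kernel's
+// initial loads at src_ptr + src_pitch * 2; the helper name is illustrative:
+#if 0  // illustrative sketch, not compiled
+static uint8_t scalar_vert4_halved_taps(const uint8_t *src, ptrdiff_t pitch,
+                                        const int16_t *halved_taps) {
+  // The first output row combines source rows 2..5 relative to src.
+  int sum = halved_taps[2] * src[2 * pitch] + halved_taps[3] * src[3 * pitch] +
+            halved_taps[4] * src[4 * pitch] + halved_taps[5] * src[5 * pitch];
+  sum = (sum + 32) >> 6;
+  return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
+}
+#endif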
+
+static void aom_filter_block1d16_v8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32;
+ __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
+ __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
+ __m256i srcReg32b11, srcReg32b12, filtersReg32;
+ __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // convert the 16-bit (short) coefficients to 8-bit (byte) and place the
+ // same data in both lanes of the 128-bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the first 16 bits (first and second byte)
+ // across 256 bit register
+ firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
+ // duplicate only the second 16 bits (third and fourth bytes)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+ // duplicate only the fourth 16 bits (seventh and eighth bytes)
+ // across 256 bit register
+ forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
+
+ // multiply the source and destination strides by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ // load 16 bytes 7 times in stride of src_pitch
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pitch, src_ptr);
+ srcReg32b3 =
+ xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+ srcReg32b5 =
+ xx_loadu2_mi128(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
+ srcReg32b7 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
+
+ // place each pair of consecutive rows in the same 256-bit register
+ srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
+ srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
+ srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
+ // merge every two consecutive registers except the last one
+ srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
+ srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
+
+ // save
+ srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
+ srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
+ srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
+ srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
+
+ for (i = output_height; i > 1; i -= 2) {
+ // load the next 2 rows and keep every two consecutive rows in the same
+ // 256-bit register
+ srcReg32b8 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
+ srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
+ _mm256_castsi256_si128(srcReg32b8), 1);
+ srcReg32b9 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
+ srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
+ _mm256_castsi256_si128(srcReg32b9), 1);
+
+ // merge every two consecutive registers
+ // save
+ srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
+ srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
+ srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
+
+ // add and saturate the results together
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
+ srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
+
+ // add and saturate the results together
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
+ _mm256_adds_epi16(srcReg32b8, srcReg32b12));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
+ srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
+
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
+ srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
+
+ // add and saturate the results together
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
+ _mm256_adds_epi16(srcReg32b8, srcReg32b12));
+
+ // shift by 6 bit each 16 bit
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg32);
+ srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);
+ srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 6);
+
+ // pack each 16-bit value to 8 bits with saturation; the low 128-bit lane
+ // holds the first output row and the high lane holds the second
+ srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);
+
+ src_ptr += src_stride;
+
+ xx_store2_mi128(output_ptr, out_pitch, &srcReg32b1);
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ srcReg32b10 = srcReg32b11;
+ srcReg32b1 = srcReg32b3;
+ srcReg32b11 = srcReg32b2;
+ srcReg32b3 = srcReg32b5;
+ srcReg32b2 = srcReg32b4;
+ srcReg32b5 = srcReg32b7;
+ srcReg32b7 = srcReg32b9;
+ }
+ if (i > 0) {
+ __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
+ __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
+ // load the last 16 bytes
+ srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
+
+ // merge the last 2 results together
+ srcRegFilt4 =
+ _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
+ srcRegFilt7 =
+ _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
+ _mm256_castsi256_si128(firstFilters));
+ srcRegFilt4 =
+ _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
+ srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
+ _mm256_castsi256_si128(firstFilters));
+ srcRegFilt7 =
+ _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters));
+
+ // add and saturate the results together
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+ srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
+ _mm256_castsi256_si128(secondFilters));
+ srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
+ _mm256_castsi256_si128(secondFilters));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
+ _mm256_castsi256_si128(thirdFilters));
+ srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
+ _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt1 =
+ _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));
+ srcRegFilt3 =
+ _mm_adds_epi16(srcRegFilt3, _mm_adds_epi16(srcRegFilt5, srcRegFilt7));
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt1 =
+ _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt3 =
+ _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);
+ srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 6);
+
+ // pack the 16-bit results to 8 bits with saturation
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
+
+ // save 16 bytes
+ _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
+ }
+}
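+
+// Throughout these vertical kernels each __m256i carries one source row in its
+// low 128-bit lane and the next row in its high lane, so every loop iteration
+// filters and stores two output rows. A sketch of how the xx_loadu2_mi128()
+// helper used above can be expressed with standard AVX2 intrinsics (the actual
+// helper is defined earlier in this file; this is only a presumably equivalent
+// formulation):
+#if 0  // illustrative sketch, not compiled
+static __m256i load_two_rows_128(const uint8_t *row_hi, const uint8_t *row_lo) {
+  const __m128i lo = _mm_loadu_si128((const __m128i *)row_lo);
+  const __m128i hi = _mm_loadu_si128((const __m128i *)row_hi);
+  // row_lo lands in the low lane, row_hi in the high lane.
+  return _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
+}
+#endif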
+
+static void aom_filter_block1d4_v4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i filtersReg32, addFilterReg32;
+ __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
+ __m256i srcReg23_34_lo, srcReg45_56_lo;
+ __m256i srcReg2345_3456_lo;
+ __m256i resReglo, resReg;
+ __m256i firstFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // convert the 16-bit (short) coefficients to 8-bit (byte) and place the
+ // same data in both lanes of the 128-bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ firstFilters =
+ _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
+
+ // multiply the source and destination strides by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+ srcReg4x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));
+
+ // keep consecutive rows in the same 256-bit register
+ srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
+
+ srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
+
+ for (i = output_height; i > 1; i -= 2) {
+ // load the next 2 rows and keep every two consecutive rows in the same
+ // 256-bit register
+ srcReg5x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
+ srcReg45 =
+ _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
+
+ srcReg6x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
+ srcReg56 =
+ _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
+
+ // merge every two consecutive registers
+ srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
+
+ srcReg2345_3456_lo = _mm256_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReglo = _mm256_maddubs_epi16(srcReg2345_3456_lo, firstFilters);
+
+ resReglo = _mm256_hadds_epi16(resReglo, _mm256_setzero_si256());
+
+ // shift by 6 bit each 16 bit
+ resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
+ resReglo = _mm256_srai_epi16(resReglo, 6);
+
+ // pack each 16-bit value to 8 bits with saturation; the low 128-bit lane
+ // holds the first output row and the high lane holds the second
+ resReg = _mm256_packus_epi16(resReglo, resReglo);
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi32(output_ptr, out_pitch, &resReg);
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ srcReg23_34_lo = srcReg45_56_lo;
+ srcReg4x = srcReg6x;
+ }
+}
+
+#if HAVE_AVX2 && HAVE_SSSE3
+filter8_1dfunction aom_filter_block1d4_v8_ssse3;
+filter8_1dfunction aom_filter_block1d16_v2_ssse3;
+filter8_1dfunction aom_filter_block1d16_h2_ssse3;
+filter8_1dfunction aom_filter_block1d8_v2_ssse3;
+filter8_1dfunction aom_filter_block1d8_h2_ssse3;
+filter8_1dfunction aom_filter_block1d4_v2_ssse3;
+filter8_1dfunction aom_filter_block1d4_h2_ssse3;
+#define aom_filter_block1d4_v8_avx2 aom_filter_block1d4_v8_ssse3
+#define aom_filter_block1d16_v2_avx2 aom_filter_block1d16_v2_ssse3
+#define aom_filter_block1d16_h2_avx2 aom_filter_block1d16_h2_ssse3
+#define aom_filter_block1d8_v2_avx2 aom_filter_block1d8_v2_ssse3
+#define aom_filter_block1d8_h2_avx2 aom_filter_block1d8_h2_ssse3
+#define aom_filter_block1d4_v2_avx2 aom_filter_block1d4_v2_ssse3
+#define aom_filter_block1d4_h2_avx2 aom_filter_block1d4_h2_ssse3
+// void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2)
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2)
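+
+// A minimal usage sketch for the aom_convolve8_horiz_avx2() entry point that
+// FUN_CONV_1D expands to above (prototype as in the comment block). The buffer
+// sizes, strides and filter values below are illustrative only; the step
+// parameters are kept at 16 since these kernels presumably handle only unit
+// (16/16) subpel steps.
+#if 0  // illustrative sketch, not compiled
+static void example_convolve8_horiz(void) {
+  static uint8_t src[64 * 64], dst[64 * 64];  // zero-initialized static buffers
+  // An 8-tap kernel in Q7 precision (taps sum to 128).
+  static const int16_t filter_x[8] = { 0, 2, -6, 126, 8, -4, 2, 0 };
+  // Offset by 3 so the kernel can read the pixels to the left of each output.
+  aom_convolve8_horiz_avx2(src + 3, /*src_stride=*/64, dst, /*dst_stride=*/64,
+                           filter_x, /*x_step_q4=*/16, filter_x,
+                           /*y_step_q4=*/16, /*w=*/32, /*h=*/32);
+}
+#endif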
+
+#endif // HAVE_AVX2 && HAVE_SSSE3
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c
new file mode 100644
index 0000000000..5c36b68727
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c
@@ -0,0 +1,569 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/x86/convolve.h"
+#include "aom_ports/mem.h"
+
+void aom_filter_block1d16_h4_sse2(const uint8_t *src_ptr,
+ ptrdiff_t src_pixels_per_line,
+ uint8_t *output_ptr, ptrdiff_t output_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
+ __m128i filtersReg;
+ __m128i addFilterReg32;
+ __m128i secondFilters, thirdFilters;
+ __m128i srcRegFilt32b1_1, srcRegFilt32b1_2, srcRegFilt32b2_1,
+ srcRegFilt32b2_2;
+ __m128i srcReg32b1, srcReg32b2;
+ unsigned int i;
+ src_ptr -= 3;
+ addFilterReg32 = _mm_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+ secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3
+ thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5
+
+ for (i = output_height; i > 0; i -= 1) {
+ srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
+
+ __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2);
+ __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4);
+ __m128i ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
+ __m128i ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
+ __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters);
+ __m128i d2 = _mm_madd_epi16(ss_2_1, thirdFilters);
+ srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);
+
+ __m128i ss_1 = _mm_srli_si128(srcReg32b1, 3);
+ __m128i ss_3 = _mm_srli_si128(srcReg32b1, 5);
+ __m128i ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128());
+ __m128i ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
+ d1 = _mm_madd_epi16(ss_1_2, secondFilters);
+ d2 = _mm_madd_epi16(ss_2_2, thirdFilters);
+ srcRegFilt32b1_2 = _mm_add_epi32(d1, d2);
+
+ __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
+ __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
+ srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi);
+
+ // load the next 16 bytes of the row
+ // (these overlap the bytes loaded above)
+ srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
+
+ ss_2 = _mm_srli_si128(srcReg32b2, 2);
+ ss_4 = _mm_srli_si128(srcReg32b2, 4);
+ ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
+ ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
+ d1 = _mm_madd_epi16(ss_1_1, secondFilters);
+ d2 = _mm_madd_epi16(ss_2_1, thirdFilters);
+ srcRegFilt32b2_1 = _mm_add_epi32(d1, d2);
+
+ ss_1 = _mm_srli_si128(srcReg32b2, 3);
+ ss_3 = _mm_srli_si128(srcReg32b2, 5);
+ ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128());
+ ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
+ d1 = _mm_madd_epi16(ss_1_2, secondFilters);
+ d2 = _mm_madd_epi16(ss_2_2, thirdFilters);
+ srcRegFilt32b2_2 = _mm_add_epi32(d1, d2);
+
+ res_lo = _mm_unpacklo_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2);
+ res_hi = _mm_unpackhi_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2);
+ srcRegFilt32b2_1 = _mm_packs_epi32(res_lo, res_hi);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
+ srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6);
+
+ // pack the 16-bit results to 8 bits with saturation
+ srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
+
+ src_ptr += src_pixels_per_line;
+
+ _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1);
+
+ output_ptr += output_pitch;
+ }
+}
+
+void aom_filter_block1d16_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+ uint8_t *output_ptr, ptrdiff_t out_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
+ __m128i filtersReg;
+ __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
+ __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
+ __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi;
+ __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
+ __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi;
+ __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi;
+ __m128i resReg23_45, resReg34_56;
+ __m128i addFilterReg32, secondFilters, thirdFilters;
+ __m128i tmp_0, tmp_1;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+ secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3
+ thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5
+
+ // multiply the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+ srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3);
+ srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3);
+ __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128());
+ __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128());
+ __m128i resReg23_hi_1 = _mm_unpacklo_epi8(srcReg23_hi, _mm_setzero_si128());
+ __m128i resReg23_hi_2 = _mm_unpackhi_epi8(srcReg23_hi, _mm_setzero_si128());
+
+ srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
+ srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4);
+ srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4);
+ __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128());
+ __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128());
+ __m128i resReg34_hi_1 = _mm_unpacklo_epi8(srcReg34_hi, _mm_setzero_si128());
+ __m128i resReg34_hi_2 = _mm_unpackhi_epi8(srcReg34_hi, _mm_setzero_si128());
+
+ for (i = output_height; i > 1; i -= 2) {
+ srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
+
+ srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5);
+ srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5);
+
+ srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
+
+ srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6);
+ srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+
+ tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters);
+ tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters);
+ resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1);
+
+ tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters);
+ tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters);
+ resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1);
+
+ __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128());
+ __m128i resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128());
+ tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters);
+ tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters);
+ resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1);
+
+ __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128());
+ __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128());
+ tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters);
+ tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters);
+ resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1);
+
+ // add and saturate the results together
+ resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo);
+ resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo);
+
+ // multiply 2 adjacent elements with the filter and add the result
+
+ tmp_0 = _mm_madd_epi16(resReg23_hi_1, secondFilters);
+ tmp_1 = _mm_madd_epi16(resReg23_hi_2, secondFilters);
+ resReg23_hi = _mm_packs_epi32(tmp_0, tmp_1);
+
+ tmp_0 = _mm_madd_epi16(resReg34_hi_1, secondFilters);
+ tmp_1 = _mm_madd_epi16(resReg34_hi_2, secondFilters);
+ resReg34_hi = _mm_packs_epi32(tmp_0, tmp_1);
+
+ __m128i resReg45_hi_1 = _mm_unpacklo_epi8(srcReg45_hi, _mm_setzero_si128());
+ __m128i resReg45_hi_2 = _mm_unpackhi_epi8(srcReg45_hi, _mm_setzero_si128());
+ tmp_0 = _mm_madd_epi16(resReg45_hi_1, thirdFilters);
+ tmp_1 = _mm_madd_epi16(resReg45_hi_2, thirdFilters);
+ resReg45_hi = _mm_packs_epi32(tmp_0, tmp_1);
+
+ __m128i resReg56_hi_1 = _mm_unpacklo_epi8(srcReg56_hi, _mm_setzero_si128());
+ __m128i resReg56_hi_2 = _mm_unpackhi_epi8(srcReg56_hi, _mm_setzero_si128());
+ tmp_0 = _mm_madd_epi16(resReg56_hi_1, thirdFilters);
+ tmp_1 = _mm_madd_epi16(resReg56_hi_2, thirdFilters);
+ resReg56_hi = _mm_packs_epi32(tmp_0, tmp_1);
+
+ // add and saturate the results together
+ resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi);
+ resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi);
+
+ // shift by 6 bit each 16 bit
+ resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32);
+ resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32);
+ resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32);
+ resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32);
+ resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6);
+ resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6);
+ resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6);
+ resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6);
+
+ // pack the 16-bit results to 8 bits with saturation
+ resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi);
+ resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi);
+
+ src_ptr += src_stride;
+
+ _mm_store_si128((__m128i *)output_ptr, (resReg23_45));
+ _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56));
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ resReg23_lo_1 = resReg45_lo_1;
+ resReg23_lo_2 = resReg45_lo_2;
+ resReg23_hi_1 = resReg45_hi_1;
+ resReg23_hi_2 = resReg45_hi_2;
+ resReg34_lo_1 = resReg56_lo_1;
+ resReg34_lo_2 = resReg56_lo_2;
+ resReg34_hi_1 = resReg56_hi_1;
+ resReg34_hi_2 = resReg56_hi_2;
+ srcReg4 = srcReg6;
+ }
+}
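+
+// SSE2 has no pmaddubsw, so the kernels in this file first widen the
+// interleaved bytes to 16 bits (_mm_unpacklo/hi_epi8 against zero) and then
+// use _mm_madd_epi16, which multiplies each (pixel_n, pixel_n+1) pair by a
+// (tap_a, tap_b) pair and adds the two products into one 32-bit lane. That
+// widening is also why the 16-wide kernel above keeps twice as many lo/hi
+// temporaries as its SSSE3 counterpart. A scalar sketch of what one such
+// 32-bit lane holds (the helper name is illustrative):
+#if 0  // illustrative sketch, not compiled
+static int32_t madd_pair(uint8_t p0, uint8_t p1, int16_t tap_a, int16_t tap_b) {
+  return (int32_t)p0 * tap_a + (int32_t)p1 * tap_b;
+}
+#endif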
+
+void aom_filter_block1d8_h4_sse2(const uint8_t *src_ptr,
+ ptrdiff_t src_pixels_per_line,
+ uint8_t *output_ptr, ptrdiff_t output_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
+ __m128i filtersReg;
+ __m128i addFilterReg32;
+ __m128i secondFilters, thirdFilters;
+ __m128i srcRegFilt32b1_1, srcRegFilt32b1_2;
+ __m128i srcReg32b1;
+ unsigned int i;
+ src_ptr -= 3;
+ addFilterReg32 = _mm_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+ secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3
+ thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5
+
+ for (i = output_height; i > 0; i -= 1) {
+ srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
+
+ __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2);
+ __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4);
+ ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
+ ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
+ __m128i d1 = _mm_madd_epi16(ss_2, secondFilters);
+ __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters);
+ srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);
+
+ __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3);
+ __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5);
+ ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
+ ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128());
+ d1 = _mm_madd_epi16(ss_3, secondFilters);
+ d2 = _mm_madd_epi16(ss_5, thirdFilters);
+ srcRegFilt32b1_2 = _mm_add_epi32(d1, d2);
+
+ __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
+ __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
+ srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
+
+ // pack the 16-bit results to 8 bits with saturation
+ srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
+
+ src_ptr += src_pixels_per_line;
+
+ _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1);
+
+ output_ptr += output_pitch;
+ }
+}
+
+void aom_filter_block1d8_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+ uint8_t *output_ptr, ptrdiff_t out_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
+ __m128i filtersReg;
+ __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
+ __m128i srcReg23_lo, srcReg34_lo;
+ __m128i srcReg45_lo, srcReg56_lo;
+ __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
+ __m128i resReg23_45_lo, resReg34_56_lo;
+ __m128i resReg23_45, resReg34_56;
+ __m128i addFilterReg32, secondFilters, thirdFilters;
+ __m128i tmp_0, tmp_1;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+ secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3
+ thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5
+
+ // multiply the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+ srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3);
+ __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128());
+ __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128());
+
+ srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
+ srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4);
+ __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128());
+ __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128());
+
+ for (i = output_height; i > 1; i -= 2) {
+ srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
+ srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5);
+
+ srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
+ srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+
+ tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters);
+ tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters);
+ resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1);
+
+ tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters);
+ tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters);
+ resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1);
+
+ __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128());
+ __m128i resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128());
+ tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters);
+ tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters);
+ resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1);
+
+ __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128());
+ __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128());
+ tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters);
+ tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters);
+ resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1);
+
+ // add and saturate the results together
+ resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo);
+ resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo);
+
+ // shift by 6 bit each 16 bit
+ resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32);
+ resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32);
+ resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6);
+ resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6);
+
+ // pack the 16-bit results to 8 bits with saturation
+ resReg23_45 = _mm_packus_epi16(resReg23_45_lo, _mm_setzero_si128());
+ resReg34_56 = _mm_packus_epi16(resReg34_56_lo, _mm_setzero_si128());
+
+ src_ptr += src_stride;
+
+ _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45));
+ _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56));
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ resReg23_lo_1 = resReg45_lo_1;
+ resReg23_lo_2 = resReg45_lo_2;
+ resReg34_lo_1 = resReg56_lo_1;
+ resReg34_lo_2 = resReg56_lo_2;
+ srcReg4 = srcReg6;
+ }
+}
+
+void aom_filter_block1d4_h4_sse2(const uint8_t *src_ptr,
+ ptrdiff_t src_pixels_per_line,
+ uint8_t *output_ptr, ptrdiff_t output_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
+ __m128i filtersReg;
+ __m128i addFilterReg32;
+ __m128i secondFilters, thirdFilters;
+ __m128i srcRegFilt32b1_1;
+ __m128i srcReg32b1;
+ unsigned int i;
+ src_ptr -= 3;
+ addFilterReg32 = _mm_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+ secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3
+ thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5
+
+ for (i = output_height; i > 0; i -= 1) {
+ srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
+
+ __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2);
+ __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3);
+ __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4);
+ __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5);
+
+ ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
+ ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
+ ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
+ ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128());
+
+ __m128i ss_1_1 = _mm_unpacklo_epi32(ss_2, ss_3);
+ __m128i ss_1_2 = _mm_unpacklo_epi32(ss_4, ss_5);
+
+ __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters);
+ __m128i d2 = _mm_madd_epi16(ss_1_2, thirdFilters);
+ srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);
+
+ srcRegFilt32b1_1 = _mm_packs_epi32(srcRegFilt32b1_1, _mm_setzero_si128());
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
+
+ // pack the 16-bit results to 8 bits with saturation
+ srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
+
+ src_ptr += src_pixels_per_line;
+
+ *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1);
+
+ output_ptr += output_pitch;
+ }
+}
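+
+// In the horizontal kernels above, src_ptr is moved back by 3 and only taps
+// 2..5 are applied, so byte-shifting the loaded row by 2, 3, 4 and 5
+// (_mm_srli_si128) lines up, for every output pixel, the four inputs that the
+// 4-tap filter touches. A scalar sketch with src being the caller's pointer
+// (before the kernel's internal src_ptr -= 3); the helper name is
+// illustrative:
+#if 0  // illustrative sketch, not compiled
+static uint8_t scalar_horiz4_halved_taps(const uint8_t *src, int x,
+                                         const int16_t *halved_taps) {
+  int sum = halved_taps[2] * src[x - 1] + halved_taps[3] * src[x + 0] +
+            halved_taps[4] * src[x + 1] + halved_taps[5] * src[x + 2];
+  sum = (sum + 32) >> 6;
+  return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
+}
+#endif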
+
+void aom_filter_block1d4_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+ uint8_t *output_ptr, ptrdiff_t out_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
+ __m128i filtersReg;
+ __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
+ __m128i srcReg23, srcReg34, srcReg45, srcReg56;
+ __m128i resReg23_34, resReg45_56;
+ __m128i resReg23_34_45_56;
+ __m128i addFilterReg32, secondFilters, thirdFilters;
+ __m128i tmp_0, tmp_1;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+ secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3
+ thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5
+
+ // multiply the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+ srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3);
+ __m128i resReg23 = _mm_unpacklo_epi8(srcReg23, _mm_setzero_si128());
+
+ srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+ srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4);
+ __m128i resReg34 = _mm_unpacklo_epi8(srcReg34, _mm_setzero_si128());
+
+ for (i = output_height; i > 1; i -= 2) {
+ srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+ srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5);
+ srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
+ srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ tmp_0 = _mm_madd_epi16(resReg23, secondFilters);
+ tmp_1 = _mm_madd_epi16(resReg34, secondFilters);
+ resReg23_34 = _mm_packs_epi32(tmp_0, tmp_1);
+
+ __m128i resReg45 = _mm_unpacklo_epi8(srcReg45, _mm_setzero_si128());
+ __m128i resReg56 = _mm_unpacklo_epi8(srcReg56, _mm_setzero_si128());
+
+ tmp_0 = _mm_madd_epi16(resReg45, thirdFilters);
+ tmp_1 = _mm_madd_epi16(resReg56, thirdFilters);
+ resReg45_56 = _mm_packs_epi32(tmp_0, tmp_1);
+
+ // add and saturate the results together
+ resReg23_34_45_56 = _mm_adds_epi16(resReg23_34, resReg45_56);
+
+ // shift by 6 bit each 16 bit
+ resReg23_34_45_56 = _mm_adds_epi16(resReg23_34_45_56, addFilterReg32);
+ resReg23_34_45_56 = _mm_srai_epi16(resReg23_34_45_56, 6);
+
+ // pack the 16-bit results to 8 bits with saturation
+ resReg23_34_45_56 =
+ _mm_packus_epi16(resReg23_34_45_56, _mm_setzero_si128());
+
+ src_ptr += src_stride;
+
+ *((int *)(output_ptr)) = _mm_cvtsi128_si32(resReg23_34_45_56);
+ *((int *)(output_ptr + out_pitch)) =
+ _mm_cvtsi128_si32(_mm_srli_si128(resReg23_34_45_56, 4));
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ resReg23 = resReg45;
+ resReg34 = resReg56;
+ srcReg4 = srcReg6;
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
new file mode 100644
index 0000000000..245fda1e94
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
@@ -0,0 +1,847 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/convolve_ssse3.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/emmintrin_compat.h"
+
+DECLARE_ALIGNED(32, static const uint8_t, filt_h4[]) = {
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1,
+ 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5,
+ 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6,
+ 7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
+ 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11,
+ 12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7,
+ 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filtd4[]) = {
+ 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
+ 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
+};
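+
+// filt_h4 holds four 32-byte shuffle masks, one per tap pair: mask k pairs up
+// the adjacent pixels that taps 2k and 2k+1 touch, so a _mm_shuffle_epi8
+// followed by _mm_maddubs_epi16 yields the two-tap partial sums directly.
+// filtd4 plays the same role for the 4-tap path, gathering four consecutive
+// pixels per output position. A scalar sketch of the pairing done by the
+// second mask (filt_h4 + 32, taps 2 and 3); the helper name is illustrative:
+#if 0  // illustrative sketch, not compiled
+static int16_t pair_sum_taps23(const uint8_t *row, int x, const int8_t *taps8) {
+  // row is the unshifted 16-byte load; the mask selects bytes (x + 2, x + 3).
+  // taps8 is the packed, halved 8-bit filter.
+  return (int16_t)(row[x + 2] * taps8[2] + row[x + 3] * taps8[3]);
+}
+#endif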
+
+static void aom_filter_block1d4_h4_ssse3(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m128i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
+ unsigned int i;
+ src_ptr -= 3;
+ addFilterReg32 = _mm_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ // convert the 16-bit (short) coefficients to 8-bit (byte) and place the
+ // same data in both lanes of the 128-bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+ firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u));
+ filt1Reg = _mm_load_si128((__m128i const *)(filtd4));
+
+ for (i = output_height; i > 0; i -= 1) {
+ // load 16 bytes of the source row
+ srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b1_1 = _mm_shuffle_epi8(srcReg32b1, filt1Reg);
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt32b1_1 = _mm_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+
+ srcRegFilt32b1_1 = _mm_hadds_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
+
+ // pack the 16-bit results to 8 bits with saturation
+ srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
+
+ src_ptr += src_pixels_per_line;
+
+ *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1);
+ output_ptr += output_pitch;
+ }
+}
+
+static void aom_filter_block1d4_v4_ssse3(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m128i addFilterReg32;
+ __m128i srcReg2, srcReg3, srcReg23, srcReg4, srcReg34, srcReg5, srcReg45,
+ srcReg6, srcReg56;
+ __m128i srcReg23_34_lo, srcReg45_56_lo;
+ __m128i srcReg2345_3456_lo, srcReg2345_3456_hi;
+ __m128i resReglo, resReghi;
+ __m128i firstFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // convert the 16-bit (short) coefficients to 8-bit (byte) and place the
+ // same data in both lanes of the 128-bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+ firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u));
+
+ // multiply the source and destination strides by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+ srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg23 = _mm_unpacklo_epi32(srcReg2, srcReg3);
+
+ srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+
+ // interleave rows 3 and 4
+ srcReg34 = _mm_unpacklo_epi32(srcReg3, srcReg4);
+
+ srcReg23_34_lo = _mm_unpacklo_epi8(srcReg23, srcReg34);
+
+ for (i = output_height; i > 1; i -= 2) {
+ srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+ srcReg45 = _mm_unpacklo_epi32(srcReg4, srcReg5);
+
+ srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
+ srcReg56 = _mm_unpacklo_epi32(srcReg5, srcReg6);
+
+ // merge every two consecutive registers
+ srcReg45_56_lo = _mm_unpacklo_epi8(srcReg45, srcReg56);
+
+ srcReg2345_3456_lo = _mm_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo);
+ srcReg2345_3456_hi = _mm_unpackhi_epi16(srcReg23_34_lo, srcReg45_56_lo);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReglo = _mm_maddubs_epi16(srcReg2345_3456_lo, firstFilters);
+ resReghi = _mm_maddubs_epi16(srcReg2345_3456_hi, firstFilters);
+
+ resReglo = _mm_hadds_epi16(resReglo, _mm_setzero_si128());
+ resReghi = _mm_hadds_epi16(resReghi, _mm_setzero_si128());
+
+ // shift by 6 bit each 16 bit
+ resReglo = _mm_adds_epi16(resReglo, addFilterReg32);
+ resReghi = _mm_adds_epi16(resReghi, addFilterReg32);
+ resReglo = _mm_srai_epi16(resReglo, 6);
+ resReghi = _mm_srai_epi16(resReghi, 6);
+
+ // pack the 16-bit results to 8 bits with saturation
+ resReglo = _mm_packus_epi16(resReglo, resReglo);
+ resReghi = _mm_packus_epi16(resReghi, resReghi);
+
+ src_ptr += src_stride;
+
+ *((int *)(output_ptr)) = _mm_cvtsi128_si32(resReglo);
+ *((int *)(output_ptr + out_pitch)) = _mm_cvtsi128_si32(resReghi);
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ srcReg23_34_lo = srcReg45_56_lo;
+ srcReg4 = srcReg6;
+ }
+}
+
+static void aom_filter_block1d8_h4_ssse3(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m128i addFilterReg32, filt2Reg, filt3Reg;
+ __m128i secondFilters, thirdFilters;
+ __m128i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m128i srcReg32b1;
+ unsigned int i;
+ src_ptr -= 3;
+ addFilterReg32 = _mm_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ // convert the 16-bit (short) coefficients to 8-bit (byte) and place the
+ // same data in both lanes of the 128-bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+ // duplicate only the second 16 bits (third and fourth bytes)
+ // across the 128-bit register
+ secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth bytes)
+ // across the 128-bit register
+ thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+
+ filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32));
+ filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2));
+
+ for (i = output_height; i > 0; i -= 1) {
+ srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
+
+ // pack the 16-bit results to 8 bits with saturation
+ srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
+
+ src_ptr += src_pixels_per_line;
+
+ _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1);
+
+ output_ptr += output_pitch;
+ }
+}
+
+static void aom_filter_block1d8_v4_ssse3(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
+ __m128i srcReg23, srcReg34, srcReg45, srcReg56;
+ __m128i resReg23, resReg34, resReg45, resReg56;
+ __m128i resReg23_45, resReg34_56;
+ __m128i addFilterReg32, secondFilters, thirdFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ // convert the 16-bit (short) coefficients to 8-bit (byte) and place the
+ // same data in both lanes of the 128-bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+ // duplicate only the second 16 bits (third and fourth bytes)
+ // across 128 bit register
+ secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 128 bit register
+ thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+
+ // multiply the source and destination strides by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+ srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3);
+
+ srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+
+ // interleave rows 3 and 4
+ srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4);
+
+ for (i = output_height; i > 1; i -= 2) {
+ srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+
+ srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5);
+
+ srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
+
+ srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReg23 = _mm_maddubs_epi16(srcReg23, secondFilters);
+ resReg34 = _mm_maddubs_epi16(srcReg34, secondFilters);
+ resReg45 = _mm_maddubs_epi16(srcReg45, thirdFilters);
+ resReg56 = _mm_maddubs_epi16(srcReg56, thirdFilters);
+
+ // add and saturate the results together
+ resReg23_45 = _mm_adds_epi16(resReg23, resReg45);
+ resReg34_56 = _mm_adds_epi16(resReg34, resReg56);
+
+ // shift by 6 bit each 16 bit
+ resReg23_45 = _mm_adds_epi16(resReg23_45, addFilterReg32);
+ resReg34_56 = _mm_adds_epi16(resReg34_56, addFilterReg32);
+ resReg23_45 = _mm_srai_epi16(resReg23_45, 6);
+ resReg34_56 = _mm_srai_epi16(resReg34_56, 6);
+
+ // pack the 16-bit results to 8 bits with saturation
+ resReg23_45 = _mm_packus_epi16(resReg23_45, _mm_setzero_si128());
+ resReg34_56 = _mm_packus_epi16(resReg34_56, _mm_setzero_si128());
+
+ src_ptr += src_stride;
+
+ _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45));
+ _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56));
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ srcReg23 = srcReg45;
+ srcReg34 = srcReg56;
+ srcReg4 = srcReg6;
+ }
+}
+
+static void aom_filter_block1d16_h4_ssse3(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m128i addFilterReg32, filt2Reg, filt3Reg;
+ __m128i secondFilters, thirdFilters;
+ __m128i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m128i srcReg32b1, srcReg32b2;
+ unsigned int i;
+ src_ptr -= 3;
+ addFilterReg32 = _mm_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ // convert the 16-bit (short) coefficients to 8-bit (byte) and place the
+ // same data in both lanes of the 128-bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+ // duplicate only the second 16 bits (third and fourth bytes)
+ // across the 128-bit register
+ secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth bytes)
+ // across the 128-bit register
+ thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+
+ filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32));
+ filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2));
+
+ for (i = output_height; i > 0; i -= 1) {
+ srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+ // load the next 16 bytes of the row
+ // (these overlap the bytes loaded above)
+ srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b2, filt2Reg);
+ srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b2, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+    // round (+32) and arithmetic-shift each 16-bit value right by 6
+ srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
+ srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6);
+
+    // shrink each 16-bit value to 8 bits with unsigned saturation; the low
+    // lane holds the first 8 results and the high lane the second 8
+ srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
+
+ src_ptr += src_pixels_per_line;
+
+ _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1);
+
+ output_ptr += output_pitch;
+ }
+}
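
For readers less familiar with the intrinsics, here is a minimal scalar sketch of what the 16-wide loop above computes per output pixel. It is illustrative only and not part of the patch: clamp_u8() is an assumed helper, the intermediate 16-bit saturating adds are ignored, and src denotes the row pointer before the src_ptr -= 3 adjustment.

// Hypothetical scalar model of the 4-tap horizontal path: only taps 2..5 of
// the 8-tap kernel are used, pre-halved exactly as _mm_srai_epi16(filtersReg, 1)
// does above, then rounded with +32 and shifted right by 6.
static uint8_t clamp_u8(int v) { return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v); }

static void filter_block1d16_h4_scalar(const uint8_t *src, uint8_t *dst,
                                       const int16_t *filter) {
  for (int j = 0; j < 16; ++j) {
    const int sum = src[j - 1] * (filter[2] >> 1) + src[j + 0] * (filter[3] >> 1) +
                    src[j + 1] * (filter[4] >> 1) + src[j + 2] * (filter[5] >> 1);
    dst[j] = clamp_u8((sum + 32) >> 6);
  }
}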
+
+static void aom_filter_block1d16_v4_ssse3(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
+ __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
+ __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi;
+ __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
+ __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi;
+ __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi;
+ __m128i resReg23_45, resReg34_56;
+ __m128i addFilterReg32, secondFilters, thirdFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // convert the 16-bit (short) filter taps to 8-bit (byte) and replicate the
+  // same data into both lanes of the 128-bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the second 16 bits (third and fourth bytes)
+  // across the 128-bit register
+ secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth bytes)
+  // across the 128-bit register
+ thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+
+  // multiply the source and destination strides by two (two rows per iteration)
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+ srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3);
+ srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3);
+
+ srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
+
+  // interleave rows 3 and 4 for the multiply-add below
+ srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4);
+ srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4);
+
+ for (i = output_height; i > 1; i -= 2) {
+ srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
+
+ srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5);
+ srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5);
+
+ srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
+
+ srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6);
+ srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReg23_lo = _mm_maddubs_epi16(srcReg23_lo, secondFilters);
+ resReg34_lo = _mm_maddubs_epi16(srcReg34_lo, secondFilters);
+ resReg45_lo = _mm_maddubs_epi16(srcReg45_lo, thirdFilters);
+ resReg56_lo = _mm_maddubs_epi16(srcReg56_lo, thirdFilters);
+
+ // add and saturate the results together
+ resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo);
+ resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo);
+
+ // multiply 2 adjacent elements with the filter and add the result
+
+ resReg23_hi = _mm_maddubs_epi16(srcReg23_hi, secondFilters);
+ resReg34_hi = _mm_maddubs_epi16(srcReg34_hi, secondFilters);
+ resReg45_hi = _mm_maddubs_epi16(srcReg45_hi, thirdFilters);
+ resReg56_hi = _mm_maddubs_epi16(srcReg56_hi, thirdFilters);
+
+ // add and saturate the results together
+ resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi);
+ resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi);
+
+    // round (+32) and arithmetic-shift each 16-bit value right by 6
+ resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32);
+ resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32);
+ resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32);
+ resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32);
+ resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6);
+ resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6);
+ resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6);
+ resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6);
+
+    // shrink each 16-bit value to 8 bits with unsigned saturation, combining
+    // the low and high halves of each output row's convolve result
+ resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi);
+ resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi);
+
+ src_ptr += src_stride;
+
+ _mm_store_si128((__m128i *)output_ptr, (resReg23_45));
+ _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56));
+
+ output_ptr += dst_stride;
+
+    // carry the overlapping rows over to the next iteration
+ srcReg23_lo = srcReg45_lo;
+ srcReg34_lo = srcReg56_lo;
+ srcReg23_hi = srcReg45_hi;
+ srcReg34_hi = srcReg56_hi;
+ srcReg4 = srcReg6;
+ }
+}
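
The vertical 4-tap path is the column-wise analog. A hedged scalar sketch of one output row, using the same assumed clamp_u8() helper as above; src_ptr, src_pitch, filter and output_ptr are the function's own parameters, and rows 2..5 feed taps 2..5 exactly as the loads above show:

// Illustrative only: one output row of the vertical 4-tap filter.
for (int x = 0; x < 16; ++x) {
  const int sum = src_ptr[2 * src_pitch + x] * (filter[2] >> 1) +
                  src_ptr[3 * src_pitch + x] * (filter[3] >> 1) +
                  src_ptr[4 * src_pitch + x] * (filter[4] >> 1) +
                  src_ptr[5 * src_pitch + x] * (filter[5] >> 1);
  output_ptr[x] = clamp_u8((sum + 32) >> 6);
}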
+
+static INLINE __m128i shuffle_filter_convolve8_8_ssse3(
+ const __m128i *const s, const int16_t *const filter) {
+ __m128i f[4];
+ shuffle_filter_ssse3(filter, f);
+ return convolve8_8_ssse3(s, f);
+}
+
+static void filter_horiz_w8_ssse3(const uint8_t *const src,
+ const ptrdiff_t src_stride,
+ uint8_t *const dst,
+ const int16_t *const x_filter) {
+ __m128i s[8], ss[4], temp;
+
+ load_8bit_8x8(src, src_stride, s);
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
+ // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73
+ // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75
+ // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77
+ transpose_16bit_4x8(s, ss);
+ temp = shuffle_filter_convolve8_8_ssse3(ss, x_filter);
+  // shrink each 16-bit value to 8 bits
+ temp = _mm_packus_epi16(temp, temp);
+ // save only 8 bytes convolve result
+ _mm_storel_epi64((__m128i *)dst, temp);
+}
+
+static void transpose8x8_to_dst(const uint8_t *const src,
+ const ptrdiff_t src_stride, uint8_t *const dst,
+ const ptrdiff_t dst_stride) {
+ __m128i s[8];
+
+ load_8bit_8x8(src, src_stride, s);
+ transpose_8bit_8x8(s, s);
+ store_8bit_8x8(s, dst, dst_stride);
+}
+
+static void scaledconvolve_horiz_w8(const uint8_t *src,
+ const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride,
+ const InterpKernel *const x_filters,
+ const int x0_q4, const int x_step_q4,
+ const int w, const int h) {
+ DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
+ int x, y, z;
+ src -= SUBPEL_TAPS / 2 - 1;
+
+ // This function processes 8x8 areas. The intermediate height is not always
+ // a multiple of 8, so force it to be a multiple of 8 here.
+ y = h + (8 - (h & 0x7));
+
+ do {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; x += 8) {
+ // process 8 src_x steps
+ for (z = 0; z < 8; ++z) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ if (x_q4 & SUBPEL_MASK) {
+ filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter);
+ } else {
+ int i;
+ for (i = 0; i < 8; ++i) {
+ temp[z * 8 + i] = src_x[i * src_stride + 3];
+ }
+ }
+ x_q4 += x_step_q4;
+ }
+
+      // transpose the 8x8 filtered values back to dst
+ transpose8x8_to_dst(temp, 8, dst + x, dst_stride);
+ }
+
+ src += src_stride * 8;
+ dst += dst_stride * 8;
+ } while (y -= 8);
+}
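
The x_q4 bookkeeping above is 4-bit fixed point (SUBPEL_BITS is 4 here): the integer part selects the source column and the fractional part selects one of the 16 pre-computed kernels. A small illustrative sketch, reusing the function's own parameter names:

// Illustrative only: resolving one x_q4 step to a column and a kernel phase.
int x_q4 = x0_q4 + 3 * x_step_q4;                  // e.g. the fourth output column
const uint8_t *src_x = &src[x_q4 >> SUBPEL_BITS];  // integer source column
const int phase = x_q4 & SUBPEL_MASK;              // sub-pixel phase, 0..15
const int16_t *x_filter = x_filters[phase];        // 8-tap kernel for that phase
// phase == 0 means the sample sits on an integer position, which is why the
// loop above falls back to a plain copy of src_x[i * src_stride + 3].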
+
+static void filter_horiz_w4_ssse3(const uint8_t *const src,
+ const ptrdiff_t src_stride,
+ uint8_t *const dst,
+ const int16_t *const filter) {
+ __m128i s[4];
+ __m128i temp;
+
+ load_8bit_8x4(src, src_stride, s);
+ transpose_16bit_4x4(s, s);
+
+ temp = shuffle_filter_convolve8_8_ssse3(s, filter);
+  // shrink each 16-bit value to 8 bits
+ temp = _mm_packus_epi16(temp, temp);
+ // save only 4 bytes
+ *(int *)dst = _mm_cvtsi128_si32(temp);
+}
+
+static void transpose4x4_to_dst(const uint8_t *const src,
+ const ptrdiff_t src_stride, uint8_t *const dst,
+ const ptrdiff_t dst_stride) {
+ __m128i s[4];
+
+ load_8bit_4x4(src, src_stride, s);
+ s[0] = transpose_8bit_4x4(s);
+ s[1] = _mm_srli_si128(s[0], 4);
+ s[2] = _mm_srli_si128(s[0], 8);
+ s[3] = _mm_srli_si128(s[0], 12);
+ store_8bit_4x4(s, dst, dst_stride);
+}
+
+static void scaledconvolve_horiz_w4(const uint8_t *src,
+ const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride,
+ const InterpKernel *const x_filters,
+ const int x0_q4, const int x_step_q4,
+ const int w, const int h) {
+ DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
+ int x, y, z;
+ src -= SUBPEL_TAPS / 2 - 1;
+
+ for (y = 0; y < h; y += 4) {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; x += 4) {
+ // process 4 src_x steps
+ for (z = 0; z < 4; ++z) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ if (x_q4 & SUBPEL_MASK) {
+ filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter);
+ } else {
+ int i;
+ for (i = 0; i < 4; ++i) {
+ temp[z * 4 + i] = src_x[i * src_stride + 3];
+ }
+ }
+ x_q4 += x_step_q4;
+ }
+
+      // transpose the 4x4 filtered values back to dst
+ transpose4x4_to_dst(temp, 4, dst + x, dst_stride);
+ }
+
+ src += src_stride * 4;
+ dst += dst_stride * 4;
+ }
+}
+
+static __m128i filter_vert_kernel(const __m128i *const s,
+ const int16_t *const filter) {
+ __m128i ss[4];
+ __m128i temp;
+
+ // 00 10 01 11 02 12 03 13
+ ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ // 20 30 21 31 22 32 23 33
+ ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+ // 40 50 41 51 42 52 43 53
+ ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
+ // 60 70 61 71 62 72 63 73
+ ss[3] = _mm_unpacklo_epi8(s[6], s[7]);
+
+ temp = shuffle_filter_convolve8_8_ssse3(ss, filter);
+  // shrink each 16-bit value to 8 bits
+ return _mm_packus_epi16(temp, temp);
+}
+
+static void filter_vert_w4_ssse3(const uint8_t *const src,
+ const ptrdiff_t src_stride, uint8_t *const dst,
+ const int16_t *const filter) {
+ __m128i s[8];
+ __m128i temp;
+
+ load_8bit_4x8(src, src_stride, s);
+ temp = filter_vert_kernel(s, filter);
+ // save only 4 bytes
+ *(int *)dst = _mm_cvtsi128_si32(temp);
+}
+
+static void scaledconvolve_vert_w4(
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+ const int y0_q4, const int y_step_q4, const int w, const int h) {
+ int y;
+ int y_q4 = y0_q4;
+
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ for (y = 0; y < h; ++y) {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+
+ if (y_q4 & SUBPEL_MASK) {
+ filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
+ } else {
+ memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
+ }
+
+ y_q4 += y_step_q4;
+ }
+}
+
+static void filter_vert_w8_ssse3(const uint8_t *const src,
+ const ptrdiff_t src_stride, uint8_t *const dst,
+ const int16_t *const filter) {
+ __m128i s[8], temp;
+
+ load_8bit_8x8(src, src_stride, s);
+ temp = filter_vert_kernel(s, filter);
+ // save only 8 bytes convolve result
+ _mm_storel_epi64((__m128i *)dst, temp);
+}
+
+static void scaledconvolve_vert_w8(
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+ const int y0_q4, const int y_step_q4, const int w, const int h) {
+ int y;
+ int y_q4 = y0_q4;
+
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ for (y = 0; y < h; ++y) {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ if (y_q4 & SUBPEL_MASK) {
+ filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
+ } else {
+ memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
+ }
+ y_q4 += y_step_q4;
+ }
+}
+
+static void filter_vert_w16_ssse3(const uint8_t *src,
+ const ptrdiff_t src_stride,
+ uint8_t *const dst,
+ const int16_t *const filter, const int w) {
+ int i;
+ __m128i f[4];
+ shuffle_filter_ssse3(filter, f);
+
+ for (i = 0; i < w; i += 16) {
+ __m128i s[8], s_lo[4], s_hi[4], temp_lo, temp_hi;
+
+ loadu_8bit_16x8(src, src_stride, s);
+
+    // interleave adjacent rows for the multiply-add
+ s_lo[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ s_hi[0] = _mm_unpackhi_epi8(s[0], s[1]);
+ s_lo[1] = _mm_unpacklo_epi8(s[2], s[3]);
+ s_hi[1] = _mm_unpackhi_epi8(s[2], s[3]);
+ s_lo[2] = _mm_unpacklo_epi8(s[4], s[5]);
+ s_hi[2] = _mm_unpackhi_epi8(s[4], s[5]);
+ s_lo[3] = _mm_unpacklo_epi8(s[6], s[7]);
+ s_hi[3] = _mm_unpackhi_epi8(s[6], s[7]);
+ temp_lo = convolve8_8_ssse3(s_lo, f);
+ temp_hi = convolve8_8_ssse3(s_hi, f);
+
+    // shrink each 16-bit value to 8 bits with unsigned saturation, combining
+    // the low and high 8 pixels of the 16-pixel result
+ temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
+ src += 16;
+ // save 16 bytes convolve result
+ _mm_store_si128((__m128i *)&dst[i], temp_hi);
+ }
+}
+
+static void scaledconvolve_vert_w16(
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+ const int y0_q4, const int y_step_q4, const int w, const int h) {
+ int y;
+ int y_q4 = y0_q4;
+
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ for (y = 0; y < h; ++y) {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ if (y_q4 & SUBPEL_MASK) {
+ filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter,
+ w);
+ } else {
+ memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
+ }
+ y_q4 += y_step_q4;
+ }
+}
+
+void aom_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+ // 2d filtering proceeds in 2 steps:
+ // (1) Interpolate horizontally into an intermediate buffer, temp.
+ // (2) Interpolate temp vertically to derive the sub-pixel result.
+ // Deriving the maximum number of rows in the temp buffer (135):
+ // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+ // --Largest block size is 64x64 pixels.
+ // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+ // original frame (in 1/16th pixel units).
+ // --Must round-up because block may be located at sub-pixel position.
+ // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+ // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+ // --Require an additional 8 rows for the horiz_w8 transpose tail.
+  // When called from the frame-scaling function, the smallest scaling factor
+  // is 1/4 ==> y_step_q4 = 64. Since w and h are at most 16 there, the temp
+  // buffer is still big enough.
+ DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
+ const int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+ assert(w <= 64);
+ assert(h <= 64);
+ assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+ assert(x_step_q4 <= 64);
+
+ if (w >= 8) {
+ scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+ src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+ intermediate_height);
+ } else {
+ scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+ src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+ intermediate_height);
+ }
+
+ if (w >= 16) {
+ scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+ dst_stride, filter, y0_q4, y_step_q4, w, h);
+ } else if (w == 8) {
+ scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+ dst_stride, filter, y0_q4, y_step_q4, w, h);
+ } else {
+ scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+ dst_stride, filter, y0_q4, y_step_q4, w, h);
+ }
+}
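
A worked restatement of the buffer-size bound in the comment above, under the comment's own assumptions (smallest normative scaling step y_step_q4 = 32, largest block 64x64, worst-case sub-pixel offset):

(64 - 1) * 32 + 15 = 2031 sixteenth-pel rows, rounded up to 127 whole rows;
127 + SUBPEL_TAPS (8) = 135 rows for the vertical 8-tap pass, and temp reserves
(135 + 8) rows of 64 bytes, the extra 8 covering the horiz_w8 transpose tail.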
+
+filter8_1dfunction aom_filter_block1d16_v8_ssse3;
+filter8_1dfunction aom_filter_block1d16_h8_ssse3;
+filter8_1dfunction aom_filter_block1d8_v8_ssse3;
+filter8_1dfunction aom_filter_block1d8_h8_ssse3;
+filter8_1dfunction aom_filter_block1d4_v8_ssse3;
+filter8_1dfunction aom_filter_block1d4_h8_ssse3;
+
+filter8_1dfunction aom_filter_block1d16_v2_ssse3;
+filter8_1dfunction aom_filter_block1d16_h2_ssse3;
+filter8_1dfunction aom_filter_block1d8_v2_ssse3;
+filter8_1dfunction aom_filter_block1d8_h2_ssse3;
+filter8_1dfunction aom_filter_block1d4_v2_ssse3;
+filter8_1dfunction aom_filter_block1d4_h2_ssse3;
+
+// void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3)
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3)
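
For orientation, a hypothetical call of the horizontal entry point generated by FUN_CONV_1D above. The kernel bank, phase and dimensions are placeholders rather than names defined by this file, and the argument order simply follows the commented prototype; treat it as a sketch, not the canonical usage.

// Illustrative only: subpel_kernels stands for any bank of 16 8-tap kernels
// (an InterpKernel table); phase selects one of its 16 sub-pixel phases.
const int16_t *kernel = subpel_kernels[phase];
aom_convolve8_horiz_ssse3(src, src_stride, dst, dst_stride,
                          kernel, 16,  // filter_x, x_step_q4 (16 = no scaling)
                          kernel, 16,  // filter_y, y_step_q4 (per the prototype)
                          width, height);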
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm
new file mode 100644
index 0000000000..640c5b2416
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm
@@ -0,0 +1,615 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+%include "aom_ports/x86_abi_support.asm"
+
+;Note: tap3 and tap4 have to be applied and added after other taps to avoid
+;overflow.
+
+%macro GET_FILTERS_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm7, [rdx] ;load filters
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ psrldq xmm7, 8
+ pshuflw xmm4, xmm7, 0b ;k4
+ pshuflw xmm5, xmm7, 01010101b ;k5
+ pshuflw xmm6, xmm7, 10101010b ;k6
+ pshuflw xmm7, xmm7, 11111111b ;k7
+
+ punpcklqdq xmm0, xmm1
+ punpcklqdq xmm2, xmm3
+ punpcklqdq xmm5, xmm4
+ punpcklqdq xmm6, xmm7
+
+ movdqa k0k1, xmm0
+ movdqa k2k3, xmm2
+ movdqa k5k4, xmm5
+ movdqa k6k7, xmm6
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0
+ movdqa krd, xmm6
+
+ pxor xmm7, xmm7
+ movdqa zero, xmm7
+%endm
+
+%macro APPLY_FILTER_4 1
+ punpckldq xmm0, xmm1 ;two row in one register
+ punpckldq xmm6, xmm7
+ punpckldq xmm2, xmm3
+ punpckldq xmm5, xmm4
+
+ punpcklbw xmm0, zero ;unpack to word
+ punpcklbw xmm6, zero
+ punpcklbw xmm2, zero
+ punpcklbw xmm5, zero
+
+ pmullw xmm0, k0k1 ;multiply the filter factors
+ pmullw xmm6, k6k7
+ pmullw xmm2, k2k3
+ pmullw xmm5, k5k4
+
+ paddsw xmm0, xmm6 ;sum
+ movdqa xmm1, xmm0
+ psrldq xmm1, 8
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm2
+ psrldq xmm2, 8
+ paddsw xmm0, xmm5
+ psrldq xmm5, 8
+ paddsw xmm0, xmm2
+ paddsw xmm0, xmm5
+
+ paddsw xmm0, krd ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack to byte
+
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movd [rdi], xmm0
+%endm
+
+%macro GET_FILTERS 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm7, [rdx] ;load filters
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ pshufhw xmm4, xmm7, 0b ;k4
+ pshufhw xmm5, xmm7, 01010101b ;k5
+ pshufhw xmm6, xmm7, 10101010b ;k6
+ pshufhw xmm7, xmm7, 11111111b ;k7
+
+ punpcklwd xmm0, xmm0
+ punpcklwd xmm1, xmm1
+ punpcklwd xmm2, xmm2
+ punpcklwd xmm3, xmm3
+ punpckhwd xmm4, xmm4
+ punpckhwd xmm5, xmm5
+ punpckhwd xmm6, xmm6
+ punpckhwd xmm7, xmm7
+
+ movdqa k0, xmm0 ;store filter factors on stack
+ movdqa k1, xmm1
+ movdqa k2, xmm2
+ movdqa k3, xmm3
+ movdqa k4, xmm4
+ movdqa k5, xmm5
+ movdqa k6, xmm6
+ movdqa k7, xmm7
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0
+ movdqa krd, xmm6 ;rounding
+
+ pxor xmm7, xmm7
+ movdqa zero, xmm7
+%endm
+
+%macro LOAD_VERT_8 1
+ movq xmm0, [rsi + %1] ;0
+ movq xmm1, [rsi + rax + %1] ;1
+ movq xmm6, [rsi + rdx * 2 + %1] ;6
+ lea rsi, [rsi + rax]
+ movq xmm7, [rsi + rdx * 2 + %1] ;7
+ movq xmm2, [rsi + rax + %1] ;2
+ movq xmm3, [rsi + rax * 2 + %1] ;3
+ movq xmm4, [rsi + rdx + %1] ;4
+ movq xmm5, [rsi + rax * 4 + %1] ;5
+%endm
+
+%macro APPLY_FILTER_8 2
+ punpcklbw xmm0, zero
+ punpcklbw xmm1, zero
+ punpcklbw xmm6, zero
+ punpcklbw xmm7, zero
+ punpcklbw xmm2, zero
+ punpcklbw xmm5, zero
+ punpcklbw xmm3, zero
+ punpcklbw xmm4, zero
+
+ pmullw xmm0, k0
+ pmullw xmm1, k1
+ pmullw xmm6, k6
+ pmullw xmm7, k7
+ pmullw xmm2, k2
+ pmullw xmm5, k5
+ pmullw xmm3, k3
+ pmullw xmm4, k4
+
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm6
+ paddsw xmm0, xmm7
+ paddsw xmm0, xmm2
+ paddsw xmm0, xmm5
+ paddsw xmm0, xmm3
+ paddsw xmm0, xmm4
+
+ paddsw xmm0, krd ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack back to byte
+%if %1
+ movq xmm1, [rdi + %2]
+ pavgb xmm0, xmm1
+%endif
+ movq [rdi + %2], xmm0
+%endm
+
+SECTION .text
+
+;void aom_filter_block1d4_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+globalsym(aom_filter_block1d4_v8_sse2)
+sym(aom_filter_block1d4_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movd xmm0, [rsi] ;load src: row 0
+ movd xmm1, [rsi + rax] ;1
+ movd xmm6, [rsi + rdx * 2] ;6
+ lea rsi, [rsi + rax]
+ movd xmm7, [rsi + rdx * 2] ;7
+ movd xmm2, [rsi + rax] ;2
+ movd xmm3, [rsi + rax * 2] ;3
+ movd xmm4, [rsi + rdx] ;4
+ movd xmm5, [rsi + rax * 4] ;5
+
+ APPLY_FILTER_4 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_filter_block1d8_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+globalsym(aom_filter_block1d8_v8_sse2)
+sym(aom_filter_block1d8_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ APPLY_FILTER_8 0, 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_filter_block1d16_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+globalsym(aom_filter_block1d16_v8_sse2)
+sym(aom_filter_block1d16_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ APPLY_FILTER_8 0, 0
+ sub rsi, rax
+
+ LOAD_VERT_8 8
+ APPLY_FILTER_8 0, 8
+ add rdi, rbx
+
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_filter_block1d4_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+globalsym(aom_filter_block1d4_h8_sse2)
+sym(aom_filter_block1d4_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm3, 3
+ psrldq xmm5, 5
+ psrldq xmm4, 4
+
+ APPLY_FILTER_4 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_filter_block1d8_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+globalsym(aom_filter_block1d8_h8_sse2)
+sym(aom_filter_block1d8_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_filter_block1d16_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+globalsym(aom_filter_block1d16_h8_sse2)
+sym(aom_filter_block1d16_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 0
+
+ movdqu xmm0, [rsi + 5] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 8
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
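
A hedged scalar model of what the GET_FILTERS / APPLY_FILTER_8 pairs in this file compute for one output pixel, with src pointing at the earliest of the eight contributing samples (the horizontal loops read from [rsi - 3]; the vertical loops walk down the column) and clamp_u8() an assumed helper:

// Illustrative only, not part of the patch.
// For the vertical variants read src[k * pitch] instead of src[k].
int sum = 0;
for (int k = 0; k < 8; ++k) sum += (int)src[k] * filter[k];  // 8 taps, pmullw/paddsw
dst[0] = clamp_u8((sum + 64) >> 7);                          // krd = 64, psraw 7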
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
new file mode 100644
index 0000000000..e5fafb0302
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
@@ -0,0 +1,870 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_64: times 8 dw 64
+even_byte_mask: times 8 dw 0x00ff
+
+; %define USE_PMULHRSW
+; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss
+; when using this instruction.
+;
+; The add order below (based on ffav1) must be followed to prevent overflow.
+; x = k0k1 + k4k5
+; y = k2k3 + k6k7
+; z = signed SAT(x + y)
+
+SECTION .text
+%define LOCAL_VARS_SIZE 16*6
+
+%macro SETUP_LOCAL_VARS 0
+ ; TODO(slavarnway): using xmm registers for these on AOM_ARCH_X86_64 +
+ ; pmaddubsw has a higher latency on some platforms, this might be eased by
+ ; interleaving the instructions.
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ packsswb m4, m4
+ ; TODO(slavarnway): multiple pshufb instructions had a higher latency on
+ ; some platforms.
+ pshuflw m0, m4, 0b ;k0_k1
+ pshuflw m1, m4, 01010101b ;k2_k3
+ pshuflw m2, m4, 10101010b ;k4_k5
+ pshuflw m3, m4, 11111111b ;k6_k7
+ punpcklqdq m0, m0
+ punpcklqdq m1, m1
+ punpcklqdq m2, m2
+ punpcklqdq m3, m3
+ mova k0k1, m0
+ mova k2k3, m1
+ mova k4k5, m2
+ mova k6k7, m3
+%if AOM_ARCH_X86_64
+ %define krd m12
+ %define tmp0 [rsp + 16*4]
+ %define tmp1 [rsp + 16*5]
+ mova krd, [GLOBAL(pw_64)]
+%else
+ %define krd [rsp + 16*4]
+%if CONFIG_PIC=0
+ mova m6, [GLOBAL(pw_64)]
+%else
+ ; build constants without accessing global memory
+ pcmpeqb m6, m6 ;all ones
+ psrlw m6, 15
+ psllw m6, 6 ;aka pw_64
+%endif
+ mova krd, m6
+%endif
+%endm
+
+;-------------------------------------------------------------------------------
+%if AOM_ARCH_X86_64
+ %define LOCAL_VARS_SIZE_H4 0
+%else
+ %define LOCAL_VARS_SIZE_H4 16*4
+%endif
+
+%macro SUBPIX_HFILTER4 1
+cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq]
+ packsswb m4, m4
+%if AOM_ARCH_X86_64
+ %define k0k1k4k5 m8
+ %define k2k3k6k7 m9
+ %define krd m10
+ mova krd, [GLOBAL(pw_64)]
+ pshuflw k0k1k4k5, m4, 0b ;k0_k1
+ pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5
+ pshuflw k2k3k6k7, m4, 01010101b ;k2_k3
+ pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7
+%else
+ %define k0k1k4k5 [rsp + 16*0]
+ %define k2k3k6k7 [rsp + 16*1]
+ %define krd [rsp + 16*2]
+ pshuflw m6, m4, 0b ;k0_k1
+ pshufhw m6, m6, 10101010b ;k0_k1_k4_k5
+ pshuflw m7, m4, 01010101b ;k2_k3
+ pshufhw m7, m7, 11111111b ;k2_k3_k6_k7
+%if CONFIG_PIC=0
+ mova m1, [GLOBAL(pw_64)]
+%else
+ ; build constants without accessing global memory
+ pcmpeqb m1, m1 ;all ones
+ psrlw m1, 15
+ psllw m1, 6 ;aka pw_64
+%endif
+ mova k0k1k4k5, m6
+ mova k2k3k6k7, m7
+ mova krd, m1
+%endif
+ dec heightd
+
+.loop:
+ ;Do two rows at once
+ movu m4, [srcq - 3]
+ movu m5, [srcq + sstrideq - 3]
+ punpckhbw m1, m4, m4
+ punpcklbw m4, m4
+ punpckhbw m3, m5, m5
+ punpcklbw m5, m5
+ palignr m0, m1, m4, 1
+ pmaddubsw m0, k0k1k4k5
+ palignr m1, m4, 5
+ pmaddubsw m1, k2k3k6k7
+ palignr m2, m3, m5, 1
+ pmaddubsw m2, k0k1k4k5
+ palignr m3, m5, 5
+ pmaddubsw m3, k2k3k6k7
+ punpckhqdq m4, m0, m2
+ punpcklqdq m0, m2
+ punpckhqdq m5, m1, m3
+ punpcklqdq m1, m3
+ paddsw m0, m4
+ paddsw m1, m5
+%ifidn %1, h8_avg
+ movd m4, [dstq]
+ movd m5, [dstq + dstrideq]
+%endif
+ paddsw m0, m1
+ paddsw m0, krd
+ psraw m0, 7
+%ifidn %1, h8_add_src
+ pxor m3, m3
+ movu m4, [srcq]
+ movu m5, [srcq + sstrideq]
+ punpckldq m4, m5 ; Bytes 0,1,2,3 from row 0, then 0,1,2,3 from row 2
+ punpcklbw m4, m3
+ paddsw m0, m4
+%endif
+ packuswb m0, m0
+ psrldq m1, m0, 4
+
+%ifidn %1, h8_avg
+ pavgb m0, m4
+ pavgb m1, m5
+%endif
+ movd [dstq], m0
+ movd [dstq + dstrideq], m1
+
+ lea srcq, [srcq + sstrideq ]
+ prefetcht0 [srcq + 4 * sstrideq - 3]
+ lea srcq, [srcq + sstrideq ]
+ lea dstq, [dstq + 2 * dstrideq ]
+ prefetcht0 [srcq + 2 * sstrideq - 3]
+
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movu m4, [srcq - 3]
+ punpckhbw m1, m4, m4
+ punpcklbw m4, m4
+ palignr m0, m1, m4, 1
+ palignr m1, m4, 5
+ pmaddubsw m0, k0k1k4k5
+ pmaddubsw m1, k2k3k6k7
+ psrldq m2, m0, 8
+ psrldq m3, m1, 8
+ paddsw m0, m2
+ paddsw m1, m3
+ paddsw m0, m1
+ paddsw m0, krd
+ psraw m0, 7
+%ifidn %1, h8_add_src
+ pxor m3, m3
+ movu m4, [srcq]
+ punpcklbw m4, m3
+ paddsw m0, m4
+%endif
+ packuswb m0, m0
+%ifidn %1, h8_avg
+ movd m4, [dstq]
+ pavgb m0, m4
+%endif
+ movd [dstq], m0
+.done:
+ REP_RET
+%endm
+
+;-------------------------------------------------------------------------------
+%macro SUBPIX_HFILTER8 1
+cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq]
+ SETUP_LOCAL_VARS
+ dec heightd
+
+.loop:
+ ;Do two rows at once
+ movu m0, [srcq - 3]
+ movu m4, [srcq + sstrideq - 3]
+ punpckhbw m1, m0, m0
+ punpcklbw m0, m0
+ palignr m5, m1, m0, 13
+ pmaddubsw m5, k6k7
+ palignr m2, m1, m0, 5
+ palignr m3, m1, m0, 9
+ palignr m1, m0, 1
+ pmaddubsw m1, k0k1
+ punpckhbw m6, m4, m4
+ punpcklbw m4, m4
+ pmaddubsw m2, k2k3
+ pmaddubsw m3, k4k5
+
+ palignr m7, m6, m4, 13
+ palignr m0, m6, m4, 5
+ pmaddubsw m7, k6k7
+ paddsw m1, m3
+ paddsw m2, m5
+ paddsw m1, m2
+%ifidn %1, h8_avg
+ movh m2, [dstq]
+ movhps m2, [dstq + dstrideq]
+%endif
+ palignr m5, m6, m4, 9
+ palignr m6, m4, 1
+ pmaddubsw m0, k2k3
+ pmaddubsw m6, k0k1
+ paddsw m1, krd
+ pmaddubsw m5, k4k5
+ psraw m1, 7
+ paddsw m0, m7
+ paddsw m6, m5
+ paddsw m6, m0
+ paddsw m6, krd
+ psraw m6, 7
+%ifidn %1, h8_add_src
+ pxor m3, m3
+ movu m4, [srcq]
+ movu m5, [srcq + sstrideq]
+ punpcklbw m4, m3
+ punpcklbw m5, m3
+ paddsw m1, m4
+ paddsw m6, m5
+%endif
+ packuswb m1, m6
+%ifidn %1, h8_avg
+ pavgb m1, m2
+%endif
+ movh [dstq], m1
+ movhps [dstq + dstrideq], m1
+
+ lea srcq, [srcq + sstrideq ]
+ prefetcht0 [srcq + 4 * sstrideq - 3]
+ lea srcq, [srcq + sstrideq ]
+ lea dstq, [dstq + 2 * dstrideq ]
+ prefetcht0 [srcq + 2 * sstrideq - 3]
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movu m0, [srcq - 3]
+ punpckhbw m3, m0, m0
+ punpcklbw m0, m0
+ palignr m1, m3, m0, 1
+ palignr m2, m3, m0, 5
+ palignr m4, m3, m0, 13
+ palignr m3, m0, 9
+ pmaddubsw m1, k0k1
+ pmaddubsw m2, k2k3
+ pmaddubsw m3, k4k5
+ pmaddubsw m4, k6k7
+ paddsw m1, m3
+ paddsw m4, m2
+ paddsw m1, m4
+ paddsw m1, krd
+ psraw m1, 7
+%ifidn %1, h8_add_src
+ pxor m6, m6
+ movu m5, [srcq]
+ punpcklbw m5, m6
+ paddsw m1, m5
+%endif
+ packuswb m1, m1
+%ifidn %1, h8_avg
+ movh m0, [dstq]
+ pavgb m1, m0
+%endif
+ movh [dstq], m1
+.done:
+ REP_RET
+%endm
+
+;-------------------------------------------------------------------------------
+%macro SUBPIX_HFILTER16 1
+cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq]
+ SETUP_LOCAL_VARS
+
+.loop:
+ prefetcht0 [srcq + 2 * sstrideq -3]
+
+ movu m0, [srcq - 3]
+ movu m4, [srcq - 2]
+ pmaddubsw m0, k0k1
+ pmaddubsw m4, k0k1
+ movu m1, [srcq - 1]
+ movu m5, [srcq + 0]
+ pmaddubsw m1, k2k3
+ pmaddubsw m5, k2k3
+ movu m2, [srcq + 1]
+ movu m6, [srcq + 2]
+ pmaddubsw m2, k4k5
+ pmaddubsw m6, k4k5
+ movu m3, [srcq + 3]
+ movu m7, [srcq + 4]
+ pmaddubsw m3, k6k7
+ pmaddubsw m7, k6k7
+ paddsw m0, m2
+ paddsw m1, m3
+ paddsw m0, m1
+ paddsw m4, m6
+ paddsw m5, m7
+ paddsw m4, m5
+ paddsw m0, krd
+ paddsw m4, krd
+ psraw m0, 7
+ psraw m4, 7
+%ifidn %1, h8_add_src
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+ pcmpeqb m2, m2 ;all ones
+ psrlw m2, 8 ;even_byte_mask
+%else
+ mova m2, [GLOBAL(even_byte_mask)]
+%endif
+ movu m5, [srcq]
+ mova m7, m5
+ pand m5, m2
+ psrlw m7, 8
+ paddsw m0, m5
+ paddsw m4, m7
+%endif
+ packuswb m0, m0
+ packuswb m4, m4
+ punpcklbw m0, m4
+%ifidn %1, h8_avg
+ pavgb m0, [dstq]
+%endif
+ lea srcq, [srcq + sstrideq]
+ mova [dstq], m0
+ lea dstq, [dstq + dstrideq]
+ dec heightd
+ jnz .loop
+ REP_RET
+%endm
+
+INIT_XMM ssse3
+SUBPIX_HFILTER16 h8
+SUBPIX_HFILTER8 h8
+SUBPIX_HFILTER4 h8
+
+;-------------------------------------------------------------------------------
+
+; TODO(Linfeng): Detect cpu type and choose the code with better performance.
+%define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1
+
+%if AOM_ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
+ %define NUM_GENERAL_REG_USED 9
+%else
+ %define NUM_GENERAL_REG_USED 6
+%endif
+
+%macro SUBPIX_VFILTER 2
+cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq]
+ SETUP_LOCAL_VARS
+
+%ifidn %2, 8
+ %define movx movh
+%else
+ %define movx movd
+%endif
+
+ dec heightd
+
+%if AOM_ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
+
+%if AOM_ARCH_X86_64
+ %define src1q r7
+ %define sstride6q r8
+ %define dst_stride dstrideq
+%else
+ %define src1q filterq
+ %define sstride6q dstrideq
+ %define dst_stride dstridemp
+%endif
+ mov src1q, srcq
+ add src1q, sstrideq
+ lea sstride6q, [sstrideq + sstrideq * 4]
+ add sstride6q, sstrideq ;pitch * 6
+
+.loop:
+ ;Do two rows at once
+ movx m0, [srcq ] ;A
+ movx m1, [src1q ] ;B
+ punpcklbw m0, m1 ;A B
+ movx m2, [srcq + sstrideq * 2 ] ;C
+ pmaddubsw m0, k0k1
+ mova m6, m2
+ movx m3, [src1q + sstrideq * 2] ;D
+ punpcklbw m2, m3 ;C D
+ pmaddubsw m2, k2k3
+ movx m4, [srcq + sstrideq * 4 ] ;E
+ mova m7, m4
+ movx m5, [src1q + sstrideq * 4] ;F
+ punpcklbw m4, m5 ;E F
+ pmaddubsw m4, k4k5
+ punpcklbw m1, m6 ;A B next iter
+ movx m6, [srcq + sstride6q ] ;G
+ punpcklbw m5, m6 ;E F next iter
+ punpcklbw m3, m7 ;C D next iter
+ pmaddubsw m5, k4k5
+ movx m7, [src1q + sstride6q ] ;H
+ punpcklbw m6, m7 ;G H
+ pmaddubsw m6, k6k7
+ pmaddubsw m3, k2k3
+ pmaddubsw m1, k0k1
+ paddsw m0, m4
+ paddsw m2, m6
+ movx m6, [srcq + sstrideq * 8 ] ;H next iter
+ punpcklbw m7, m6
+ pmaddubsw m7, k6k7
+ paddsw m0, m2
+ paddsw m0, krd
+ psraw m0, 7
+ paddsw m1, m5
+%ifidn %1, v8_add_src
+ pxor m6, m6
+ movu m4, [srcq]
+ punpcklbw m4, m6
+ paddsw m0, m4
+%endif
+ packuswb m0, m0
+
+ paddsw m3, m7
+ paddsw m1, m3
+ paddsw m1, krd
+ psraw m1, 7
+%ifidn %1, v8_add_src
+ movu m4, [src1q]
+ punpcklbw m4, m6
+ paddsw m1, m4
+%endif
+ lea srcq, [srcq + sstrideq * 2 ]
+ lea src1q, [src1q + sstrideq * 2]
+ packuswb m1, m1
+
+%ifidn %1, v8_avg
+ movx m2, [dstq]
+ pavgb m0, m2
+%endif
+ movx [dstq], m0
+ add dstq, dst_stride
+%ifidn %1, v8_avg
+ movx m3, [dstq]
+ pavgb m1, m3
+%endif
+ movx [dstq], m1
+ add dstq, dst_stride
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movx m0, [srcq ] ;A
+ movx m1, [srcq + sstrideq ] ;B
+ movx m6, [srcq + sstride6q ] ;G
+ punpcklbw m0, m1 ;A B
+ movx m7, [src1q + sstride6q ] ;H
+ pmaddubsw m0, k0k1
+ movx m2, [srcq + sstrideq * 2 ] ;C
+ punpcklbw m6, m7 ;G H
+ movx m3, [src1q + sstrideq * 2] ;D
+ pmaddubsw m6, k6k7
+ movx m4, [srcq + sstrideq * 4 ] ;E
+ punpcklbw m2, m3 ;C D
+ movx m5, [src1q + sstrideq * 4] ;F
+ punpcklbw m4, m5 ;E F
+ pmaddubsw m2, k2k3
+ pmaddubsw m4, k4k5
+ paddsw m2, m6
+ paddsw m0, m4
+ paddsw m0, m2
+ paddsw m0, krd
+ psraw m0, 7
+%ifidn %1, v8_add_src
+ pxor m6, m6
+ movu m4, [srcq]
+ punpcklbw m4, m6
+ paddsw m0, m4
+%endif
+ packuswb m0, m0
+%ifidn %1, v8_avg
+ movx m1, [dstq]
+ pavgb m0, m1
+%endif
+ movx [dstq], m0
+
+%else
+ ; AOM_ARCH_X86_64
+
+ movx m0, [srcq ] ;A
+ movx m1, [srcq + sstrideq ] ;B
+ lea srcq, [srcq + sstrideq * 2 ]
+ movx m2, [srcq] ;C
+ movx m3, [srcq + sstrideq] ;D
+ lea srcq, [srcq + sstrideq * 2 ]
+ movx m4, [srcq] ;E
+ movx m5, [srcq + sstrideq] ;F
+ lea srcq, [srcq + sstrideq * 2 ]
+ movx m6, [srcq] ;G
+ punpcklbw m0, m1 ;A B
+ punpcklbw m1, m2 ;A B next iter
+ punpcklbw m2, m3 ;C D
+ punpcklbw m3, m4 ;C D next iter
+ punpcklbw m4, m5 ;E F
+ punpcklbw m5, m6 ;E F next iter
+
+.loop:
+ ;Do two rows at once
+ movx m7, [srcq + sstrideq] ;H
+ lea srcq, [srcq + sstrideq * 2 ]
+ movx m14, [srcq] ;H next iter
+ punpcklbw m6, m7 ;G H
+ punpcklbw m7, m14 ;G H next iter
+ pmaddubsw m8, m0, k0k1
+ pmaddubsw m9, m1, k0k1
+ mova m0, m2
+ mova m1, m3
+ pmaddubsw m10, m2, k2k3
+ pmaddubsw m11, m3, k2k3
+ mova m2, m4
+ mova m3, m5
+ pmaddubsw m4, k4k5
+ pmaddubsw m5, k4k5
+ paddsw m8, m4
+ paddsw m9, m5
+ mova m4, m6
+ mova m5, m7
+ pmaddubsw m6, k6k7
+ pmaddubsw m7, k6k7
+ paddsw m10, m6
+ paddsw m11, m7
+ paddsw m8, m10
+ paddsw m9, m11
+ mova m6, m14
+ paddsw m8, krd
+ paddsw m9, krd
+ psraw m8, 7
+ psraw m9, 7
+%ifidn %2, 4
+ packuswb m8, m8
+ packuswb m9, m9
+%else
+ packuswb m8, m9
+%endif
+
+%ifidn %1, v8_avg
+ movx m7, [dstq]
+%ifidn %2, 4
+ movx m10, [dstq + dstrideq]
+ pavgb m9, m10
+%else
+ movhpd m7, [dstq + dstrideq]
+%endif
+ pavgb m8, m7
+%endif
+ movx [dstq], m8
+%ifidn %2, 4
+ movx [dstq + dstrideq], m9
+%else
+ movhpd [dstq + dstrideq], m8
+%endif
+
+ lea dstq, [dstq + dstrideq * 2 ]
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movx m7, [srcq + sstrideq] ;H
+ punpcklbw m6, m7 ;G H
+ pmaddubsw m0, k0k1
+ pmaddubsw m2, k2k3
+ pmaddubsw m4, k4k5
+ pmaddubsw m6, k6k7
+ paddsw m0, m4
+ paddsw m2, m6
+ paddsw m0, m2
+ paddsw m0, krd
+ psraw m0, 7
+ packuswb m0, m0
+%ifidn %1, v8_avg
+ movx m1, [dstq]
+ pavgb m0, m1
+%endif
+ movx [dstq], m0
+
+%endif ; AOM_ARCH_X86_64
+
+.done:
+ REP_RET
+
+%endm
+
+;-------------------------------------------------------------------------------
+%macro SUBPIX_VFILTER16 1
+cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq]
+ SETUP_LOCAL_VARS
+
+%if AOM_ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
+
+%if AOM_ARCH_X86_64
+ %define src1q r7
+ %define sstride6q r8
+ %define dst_stride dstrideq
+%else
+ %define src1q filterq
+ %define sstride6q dstrideq
+ %define dst_stride dstridemp
+%endif
+ lea src1q, [srcq + sstrideq]
+ lea sstride6q, [sstrideq + sstrideq * 4]
+ add sstride6q, sstrideq ;pitch * 6
+
+.loop:
+ movh m0, [srcq ] ;A
+ movh m1, [src1q ] ;B
+ movh m2, [srcq + sstrideq * 2 ] ;C
+ movh m3, [src1q + sstrideq * 2] ;D
+ movh m4, [srcq + sstrideq * 4 ] ;E
+ movh m5, [src1q + sstrideq * 4] ;F
+
+ punpcklbw m0, m1 ;A B
+ movh m6, [srcq + sstride6q] ;G
+ punpcklbw m2, m3 ;C D
+ movh m7, [src1q + sstride6q] ;H
+ punpcklbw m4, m5 ;E F
+ pmaddubsw m0, k0k1
+ movh m3, [srcq + 8] ;A
+ pmaddubsw m2, k2k3
+ punpcklbw m6, m7 ;G H
+ movh m5, [srcq + sstrideq + 8] ;B
+ pmaddubsw m4, k4k5
+ punpcklbw m3, m5 ;A B
+ movh m7, [srcq + sstrideq * 2 + 8] ;C
+ pmaddubsw m6, k6k7
+ movh m5, [src1q + sstrideq * 2 + 8] ;D
+ punpcklbw m7, m5 ;C D
+ paddsw m2, m6
+ pmaddubsw m3, k0k1
+ movh m1, [srcq + sstrideq * 4 + 8] ;E
+ paddsw m0, m4
+ pmaddubsw m7, k2k3
+ movh m6, [src1q + sstrideq * 4 + 8] ;F
+ punpcklbw m1, m6 ;E F
+ paddsw m0, m2
+ paddsw m0, krd
+ movh m2, [srcq + sstride6q + 8] ;G
+ pmaddubsw m1, k4k5
+ movh m5, [src1q + sstride6q + 8] ;H
+ psraw m0, 7
+ punpcklbw m2, m5 ;G H
+ pmaddubsw m2, k6k7
+ paddsw m7, m2
+ paddsw m3, m1
+ paddsw m3, m7
+ paddsw m3, krd
+ psraw m3, 7
+%ifidn %1, v8_add_src
+ pxor m6, m6
+ movu m4, [src1q + 2 * sstrideq] ; Fetch from 3 rows down
+ mova m5, m4
+ punpcklbw m4, m6
+ punpckhbw m5, m6
+ paddsw m0, m4
+ paddsw m3, m5
+%endif
+ packuswb m0, m3
+
+ add srcq, sstrideq
+ add src1q, sstrideq
+%ifidn %1, v8_avg
+ pavgb m0, [dstq]
+%endif
+ mova [dstq], m0
+ add dstq, dst_stride
+ dec heightd
+ jnz .loop
+ REP_RET
+
+%else
+ ; AOM_ARCH_X86_64
+ dec heightd
+
+ movu m1, [srcq ] ;A
+ movu m3, [srcq + sstrideq ] ;B
+ lea srcq, [srcq + sstrideq * 2]
+ punpcklbw m0, m1, m3 ;A B
+ punpckhbw m1, m3 ;A B
+ movu m5, [srcq] ;C
+ punpcklbw m2, m3, m5 ;A B next iter
+ punpckhbw m3, m5 ;A B next iter
+ mova tmp0, m2 ;store to stack
+ mova tmp1, m3 ;store to stack
+ movu m7, [srcq + sstrideq] ;D
+ lea srcq, [srcq + sstrideq * 2]
+ punpcklbw m4, m5, m7 ;C D
+ punpckhbw m5, m7 ;C D
+ movu m9, [srcq] ;E
+ punpcklbw m6, m7, m9 ;C D next iter
+ punpckhbw m7, m9 ;C D next iter
+ movu m11, [srcq + sstrideq] ;F
+ lea srcq, [srcq + sstrideq * 2]
+ punpcklbw m8, m9, m11 ;E F
+ punpckhbw m9, m11 ;E F
+ movu m2, [srcq] ;G
+ punpcklbw m10, m11, m2 ;E F next iter
+ punpckhbw m11, m2 ;E F next iter
+
+.loop:
+ ;Do two rows at once
+ pmaddubsw m13, m0, k0k1
+ mova m0, m4
+ pmaddubsw m14, m8, k4k5
+ pmaddubsw m15, m4, k2k3
+ mova m4, m8
+ paddsw m13, m14
+ movu m3, [srcq + sstrideq] ;H
+ lea srcq, [srcq + sstrideq * 2]
+ punpcklbw m14, m2, m3 ;G H
+ mova m8, m14
+ pmaddubsw m14, k6k7
+ paddsw m15, m14
+ paddsw m13, m15
+ paddsw m13, krd
+ psraw m13, 7
+
+ pmaddubsw m14, m1, k0k1
+ pmaddubsw m1, m9, k4k5
+ pmaddubsw m15, m5, k2k3
+ paddsw m14, m1
+ mova m1, m5
+ mova m5, m9
+ punpckhbw m2, m3 ;G H
+ mova m9, m2
+ pmaddubsw m2, k6k7
+ paddsw m15, m2
+ paddsw m14, m15
+ paddsw m14, krd
+ psraw m14, 7
+ packuswb m13, m14
+%ifidn %1, v8_avg
+ pavgb m13, [dstq]
+%endif
+ mova [dstq], m13
+
+ ; next iter
+ pmaddubsw m15, tmp0, k0k1
+ pmaddubsw m14, m10, k4k5
+ pmaddubsw m13, m6, k2k3
+ paddsw m15, m14
+ mova tmp0, m6
+ mova m6, m10
+ movu m2, [srcq] ;G next iter
+ punpcklbw m14, m3, m2 ;G H next iter
+ mova m10, m14
+ pmaddubsw m14, k6k7
+ paddsw m13, m14
+ paddsw m15, m13
+ paddsw m15, krd
+ psraw m15, 7
+
+ pmaddubsw m14, tmp1, k0k1
+ mova tmp1, m7
+ pmaddubsw m13, m7, k2k3
+ mova m7, m11
+ pmaddubsw m11, k4k5
+ paddsw m14, m11
+ punpckhbw m3, m2 ;G H next iter
+ mova m11, m3
+ pmaddubsw m3, k6k7
+ paddsw m13, m3
+ paddsw m14, m13
+ paddsw m14, krd
+ psraw m14, 7
+ packuswb m15, m14
+%ifidn %1, v8_avg
+ pavgb m15, [dstq + dstrideq]
+%endif
+ mova [dstq + dstrideq], m15
+ lea dstq, [dstq + dstrideq * 2]
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movu m3, [srcq + sstrideq] ;H
+ punpcklbw m6, m2, m3 ;G H
+ punpckhbw m2, m3 ;G H
+ pmaddubsw m0, k0k1
+ pmaddubsw m1, k0k1
+ pmaddubsw m4, k2k3
+ pmaddubsw m5, k2k3
+ pmaddubsw m8, k4k5
+ pmaddubsw m9, k4k5
+ pmaddubsw m6, k6k7
+ pmaddubsw m2, k6k7
+ paddsw m0, m8
+ paddsw m1, m9
+ paddsw m4, m6
+ paddsw m5, m2
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m0, krd
+ paddsw m1, krd
+ psraw m0, 7
+ psraw m1, 7
+ packuswb m0, m1
+%ifidn %1, v8_avg
+ pavgb m0, [dstq]
+%endif
+ mova [dstq], m0
+
+.done:
+ REP_RET
+
+%endif ; AOM_ARCH_X86_64
+
+%endm
+
+INIT_XMM ssse3
+SUBPIX_VFILTER16 v8
+SUBPIX_VFILTER v8, 8
+SUBPIX_VFILTER v8, 4
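
The add order called out near the top of this file (x = k0k1 + k4k5, y = k2k3 + k6k7, z = signed SAT(x + y)) is what the paddsw sequences in the macros above implement. A small C sketch of the same pairing, with sat16() an assumed helper and each *_term standing for one pmaddubsw tap-pair result:

// Illustrative only (INT16_MAX / INT16_MIN from <stdint.h>).
static int16_t sat16(int v) {
  return v > INT16_MAX ? INT16_MAX : (v < INT16_MIN ? INT16_MIN : (int16_t)v);
}
const int16_t x = sat16(k0k1_term + k4k5_term);  // paddsw
const int16_t y = sat16(k2k3_term + k6k7_term);  // paddsw
const int16_t z = sat16(x + y);                  // signed SAT(x + y)
// z is then rounded with pw_64 and shifted right by 7 before packing to bytes.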
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
new file mode 100644
index 0000000000..90dd55a4be
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
@@ -0,0 +1,295 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+%macro GET_PARAM_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm3, [rdx] ;load filters
+ pshuflw xmm4, xmm3, 11111111b ;k3
+ psrldq xmm3, 8
+ pshuflw xmm3, xmm3, 0b ;k4
+ punpcklqdq xmm4, xmm3 ;k3k4
+
+ movq xmm3, rcx ;rounding
+ pshufd xmm3, xmm3, 0
+
+ pxor xmm2, xmm2
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro APPLY_FILTER_4 1
+
+ punpckldq xmm0, xmm1 ;two row in one register
+ punpcklbw xmm0, xmm2 ;unpack to word
+ pmullw xmm0, xmm4 ;multiply the filter factors
+
+ movdqa xmm1, xmm0
+ psrldq xmm1, 8
+ paddsw xmm0, xmm1
+
+ paddsw xmm0, xmm3 ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack to byte
+
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+
+ movd [rdi], xmm0
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+%macro GET_PARAM 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm7, [rdx] ;load filters
+
+ pshuflw xmm6, xmm7, 11111111b ;k3
+ pshufhw xmm7, xmm7, 0b ;k4
+ punpcklwd xmm6, xmm6
+ punpckhwd xmm7, xmm7
+
+ movq xmm4, rcx ;rounding
+ pshufd xmm4, xmm4, 0
+
+ pxor xmm5, xmm5
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro APPLY_FILTER_8 1
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+
+ pmullw xmm0, xmm6
+ pmullw xmm1, xmm7
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm4 ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack back to byte
+%if %1
+ movq xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movq [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+%macro APPLY_FILTER_16 1
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpckhbw xmm2, xmm5
+ punpckhbw xmm3, xmm5
+
+ pmullw xmm0, xmm6
+ pmullw xmm1, xmm7
+ pmullw xmm2, xmm6
+ pmullw xmm3, xmm7
+
+ paddsw xmm0, xmm1
+ paddsw xmm2, xmm3
+
+ paddsw xmm0, xmm4 ;rounding
+ paddsw xmm2, xmm4
+ psraw xmm0, 7 ;shift
+ psraw xmm2, 7
+ packuswb xmm0, xmm2 ;pack back to byte
+%if %1
+ movdqu xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movdqu [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+SECTION .text
+
+globalsym(aom_filter_block1d4_v2_sse2)
+sym(aom_filter_block1d4_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movd xmm0, [rsi] ;load src
+ movd xmm1, [rsi + rax]
+
+ APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_filter_block1d8_v2_sse2)
+sym(aom_filter_block1d8_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movq xmm0, [rsi] ;0
+ movq xmm1, [rsi + rax] ;1
+
+ APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_filter_block1d16_v2_sse2)
+sym(aom_filter_block1d16_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + rax] ;1
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_filter_block1d4_h2_sse2)
+sym(aom_filter_block1d4_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_filter_block1d8_h2_sse2)
+sym(aom_filter_block1d8_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_filter_block1d16_h2_sse2)
+sym(aom_filter_block1d16_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 1]
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
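
The bilinear kernels in this file keep only taps 3 and 4 of the 8-tap filter. A hedged scalar model of the APPLY_FILTER_* macros above, with clamp_u8() an assumed helper and s0/s1 the two contributing samples (src[x] and src[x + 1] for the horizontal variants, src[x] and src[x + pitch] for the vertical ones):

// Illustrative only.
const int sum = s0 * filter[3] + s1 * filter[4];
dst[x] = clamp_u8((sum + 64) >> 7);  // 0x0040 rounding, psraw 7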
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
new file mode 100644
index 0000000000..253bc26d38
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
@@ -0,0 +1,267 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+%macro GET_PARAM_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov ecx, 0x01000100
+
+ movdqa xmm3, [rdx] ;load filters
+ psrldq xmm3, 6
+ packsswb xmm3, xmm3
+ pshuflw xmm3, xmm3, 0b ;k3_k4
+
+ movd xmm2, ecx ;rounding_shift
+ pshufd xmm2, xmm2, 0
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro APPLY_FILTER_4 1
+ punpcklbw xmm0, xmm1
+ pmaddubsw xmm0, xmm3
+
+ pmulhrsw xmm0, xmm2 ;rounding(+64)+shift(>>7)
+ packuswb xmm0, xmm0 ;pack to byte
+
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movd [rdi], xmm0
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+%macro GET_PARAM 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov ecx, 0x01000100
+
+ movdqa xmm7, [rdx] ;load filters
+ psrldq xmm7, 6
+ packsswb xmm7, xmm7
+ pshuflw xmm7, xmm7, 0b ;k3_k4
+ punpcklwd xmm7, xmm7
+
+ movd xmm6, ecx ;rounding_shift
+ pshufd xmm6, xmm6, 0
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro APPLY_FILTER_8 1
+ punpcklbw xmm0, xmm1
+ pmaddubsw xmm0, xmm7
+
+ pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7)
+ packuswb xmm0, xmm0 ;pack back to byte
+
+%if %1
+ movq xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movq [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+%macro APPLY_FILTER_16 1
+ punpcklbw xmm0, xmm1
+ punpckhbw xmm2, xmm1
+ pmaddubsw xmm0, xmm7
+ pmaddubsw xmm2, xmm7
+
+ pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7)
+ pmulhrsw xmm2, xmm6
+ packuswb xmm0, xmm2 ;pack back to byte
+
+%if %1
+ movdqu xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movdqu [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
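+; Note on APPLY_FILTER_4/8/16: pmulhrsw against the 0x0100 words loaded in
+; GET_PARAM/GET_PARAM_4 computes (((x * 256) >> 14) + 1) >> 1, which for the
+; non-negative filter sums produced by pmaddubsw equals (x + 64) >> 7, i.e.
+; the rounding-and-shift called out in the comments above.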
+
+SECTION .text
+
+globalsym(aom_filter_block1d4_v2_ssse3)
+sym(aom_filter_block1d4_v2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movd xmm0, [rsi] ;load src
+ movd xmm1, [rsi + rax]
+
+ APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_filter_block1d8_v2_ssse3)
+sym(aom_filter_block1d8_v2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movq xmm0, [rsi] ;0
+ movq xmm1, [rsi + rax] ;1
+
+ APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_filter_block1d16_v2_ssse3)
+sym(aom_filter_block1d16_v2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + rax] ;1
+ movdqa xmm2, xmm0
+
+ APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_filter_block1d4_h2_ssse3)
+sym(aom_filter_block1d4_h2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_filter_block1d8_h2_ssse3)
+sym(aom_filter_block1d8_h2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_filter_block1d16_h2_ssse3)
+sym(aom_filter_block1d16_h2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 1]
+ movdqa xmm2, xmm0
+
+ APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/aom_dsp/x86/avg_intrin_avx2.c b/third_party/aom/aom_dsp/x86/avg_intrin_avx2.c
new file mode 100644
index 0000000000..49fcd72098
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/avg_intrin_avx2.c
@@ -0,0 +1,897 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/bitdepth_conversion_avx2.h"
+#include "aom_ports/mem.h"
+
+static INLINE void sign_extend_16bit_to_32bit_avx2(__m256i in, __m256i zero,
+ __m256i *out_lo,
+ __m256i *out_hi) {
+ const __m256i sign_bits = _mm256_cmpgt_epi16(zero, in);
+ *out_lo = _mm256_unpacklo_epi16(in, sign_bits);
+ *out_hi = _mm256_unpackhi_epi16(in, sign_bits);
+}
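+// Note: _mm256_cmpgt_epi16(zero, in) is all-ones exactly for negative
+// elements, so interleaving each 16-bit value with that mask yields its
+// 32-bit sign extension.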
+
+static void hadamard_col8x2_avx2(__m256i *in, int iter) {
+ __m256i a0 = in[0];
+ __m256i a1 = in[1];
+ __m256i a2 = in[2];
+ __m256i a3 = in[3];
+ __m256i a4 = in[4];
+ __m256i a5 = in[5];
+ __m256i a6 = in[6];
+ __m256i a7 = in[7];
+
+ __m256i b0 = _mm256_add_epi16(a0, a1);
+ __m256i b1 = _mm256_sub_epi16(a0, a1);
+ __m256i b2 = _mm256_add_epi16(a2, a3);
+ __m256i b3 = _mm256_sub_epi16(a2, a3);
+ __m256i b4 = _mm256_add_epi16(a4, a5);
+ __m256i b5 = _mm256_sub_epi16(a4, a5);
+ __m256i b6 = _mm256_add_epi16(a6, a7);
+ __m256i b7 = _mm256_sub_epi16(a6, a7);
+
+ a0 = _mm256_add_epi16(b0, b2);
+ a1 = _mm256_add_epi16(b1, b3);
+ a2 = _mm256_sub_epi16(b0, b2);
+ a3 = _mm256_sub_epi16(b1, b3);
+ a4 = _mm256_add_epi16(b4, b6);
+ a5 = _mm256_add_epi16(b5, b7);
+ a6 = _mm256_sub_epi16(b4, b6);
+ a7 = _mm256_sub_epi16(b5, b7);
+
+ if (iter == 0) {
+ b0 = _mm256_add_epi16(a0, a4);
+ b7 = _mm256_add_epi16(a1, a5);
+ b3 = _mm256_add_epi16(a2, a6);
+ b4 = _mm256_add_epi16(a3, a7);
+ b2 = _mm256_sub_epi16(a0, a4);
+ b6 = _mm256_sub_epi16(a1, a5);
+ b1 = _mm256_sub_epi16(a2, a6);
+ b5 = _mm256_sub_epi16(a3, a7);
+
+ a0 = _mm256_unpacklo_epi16(b0, b1);
+ a1 = _mm256_unpacklo_epi16(b2, b3);
+ a2 = _mm256_unpackhi_epi16(b0, b1);
+ a3 = _mm256_unpackhi_epi16(b2, b3);
+ a4 = _mm256_unpacklo_epi16(b4, b5);
+ a5 = _mm256_unpacklo_epi16(b6, b7);
+ a6 = _mm256_unpackhi_epi16(b4, b5);
+ a7 = _mm256_unpackhi_epi16(b6, b7);
+
+ b0 = _mm256_unpacklo_epi32(a0, a1);
+ b1 = _mm256_unpacklo_epi32(a4, a5);
+ b2 = _mm256_unpackhi_epi32(a0, a1);
+ b3 = _mm256_unpackhi_epi32(a4, a5);
+ b4 = _mm256_unpacklo_epi32(a2, a3);
+ b5 = _mm256_unpacklo_epi32(a6, a7);
+ b6 = _mm256_unpackhi_epi32(a2, a3);
+ b7 = _mm256_unpackhi_epi32(a6, a7);
+
+ in[0] = _mm256_unpacklo_epi64(b0, b1);
+ in[1] = _mm256_unpackhi_epi64(b0, b1);
+ in[2] = _mm256_unpacklo_epi64(b2, b3);
+ in[3] = _mm256_unpackhi_epi64(b2, b3);
+ in[4] = _mm256_unpacklo_epi64(b4, b5);
+ in[5] = _mm256_unpackhi_epi64(b4, b5);
+ in[6] = _mm256_unpacklo_epi64(b6, b7);
+ in[7] = _mm256_unpackhi_epi64(b6, b7);
+ } else {
+ in[0] = _mm256_add_epi16(a0, a4);
+ in[7] = _mm256_add_epi16(a1, a5);
+ in[3] = _mm256_add_epi16(a2, a6);
+ in[4] = _mm256_add_epi16(a3, a7);
+ in[2] = _mm256_sub_epi16(a0, a4);
+ in[6] = _mm256_sub_epi16(a1, a5);
+ in[1] = _mm256_sub_epi16(a2, a6);
+ in[5] = _mm256_sub_epi16(a3, a7);
+ }
+}
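+// Note: each 128-bit lane carries an independent 8x8 block. The iter == 0
+// branch finishes the first butterfly pass and transposes each lane's 8x8
+// block, so the second call applies the butterflies along the other dimension.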
+
+void aom_hadamard_lp_8x8_dual_avx2(const int16_t *src_diff,
+ ptrdiff_t src_stride, int16_t *coeff) {
+ __m256i src[8];
+ src[0] = _mm256_loadu_si256((const __m256i *)src_diff);
+ src[1] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[2] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[3] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[7] = _mm256_loadu_si256((const __m256i *)(src_diff + src_stride));
+
+ hadamard_col8x2_avx2(src, 0);
+ hadamard_col8x2_avx2(src, 1);
+
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[0], src[1], 0x20));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[2], src[3], 0x20));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[4], src[5], 0x20));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[6], src[7], 0x20));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[0], src[1], 0x31));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[2], src[3], 0x31));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[4], src[5], 0x31));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[6], src[7], 0x31));
+}
+
+static INLINE void hadamard_16x16_avx2(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff,
+ int is_final) {
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
+ int16_t *t_coeff = temp_coeff;
+ int16_t *coeff16 = (int16_t *)coeff;
+ int idx;
+ for (idx = 0; idx < 2; ++idx) {
+ const int16_t *src_ptr = src_diff + idx * 8 * src_stride;
+ aom_hadamard_lp_8x8_dual_avx2(src_ptr, src_stride,
+ t_coeff + (idx * 64 * 2));
+ }
+
+ for (idx = 0; idx < 64; idx += 16) {
+ const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+ const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
+ const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
+ const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));
+
+ __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
+ __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
+ __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
+ __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);
+
+ b0 = _mm256_srai_epi16(b0, 1);
+ b1 = _mm256_srai_epi16(b1, 1);
+ b2 = _mm256_srai_epi16(b2, 1);
+ b3 = _mm256_srai_epi16(b3, 1);
+ if (is_final) {
+ store_tran_low(_mm256_add_epi16(b0, b2), coeff);
+ store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64);
+ store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128);
+ store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192);
+ coeff += 16;
+ } else {
+ _mm256_storeu_si256((__m256i *)coeff16, _mm256_add_epi16(b0, b2));
+ _mm256_storeu_si256((__m256i *)(coeff16 + 64), _mm256_add_epi16(b1, b3));
+ _mm256_storeu_si256((__m256i *)(coeff16 + 128), _mm256_sub_epi16(b0, b2));
+ _mm256_storeu_si256((__m256i *)(coeff16 + 192), _mm256_sub_epi16(b1, b3));
+ coeff16 += 16;
+ }
+ t_coeff += 16;
+ }
+}
+
+void aom_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ hadamard_16x16_avx2(src_diff, src_stride, coeff, 1);
+}
+
+void aom_hadamard_lp_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+ int16_t *coeff) {
+ int16_t *t_coeff = coeff;
+ for (int idx = 0; idx < 2; ++idx) {
+ const int16_t *src_ptr = src_diff + idx * 8 * src_stride;
+ aom_hadamard_lp_8x8_dual_avx2(src_ptr, src_stride,
+ t_coeff + (idx * 64 * 2));
+ }
+
+ for (int idx = 0; idx < 64; idx += 16) {
+ const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+ const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
+ const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
+ const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));
+
+ __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
+ __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
+ __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
+ __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);
+
+ b0 = _mm256_srai_epi16(b0, 1);
+ b1 = _mm256_srai_epi16(b1, 1);
+ b2 = _mm256_srai_epi16(b2, 1);
+ b3 = _mm256_srai_epi16(b3, 1);
+ _mm256_storeu_si256((__m256i *)coeff, _mm256_add_epi16(b0, b2));
+ _mm256_storeu_si256((__m256i *)(coeff + 64), _mm256_add_epi16(b1, b3));
+ _mm256_storeu_si256((__m256i *)(coeff + 128), _mm256_sub_epi16(b0, b2));
+ _mm256_storeu_si256((__m256i *)(coeff + 192), _mm256_sub_epi16(b1, b3));
+ coeff += 16;
+ t_coeff += 16;
+ }
+}
+
+void aom_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ // For high bitdepths, it is unnecessary to store_tran_low
+ // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
+ // next stage. Output to an intermediate buffer first, then store_tran_low()
+ // in the final stage.
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
+ int16_t *t_coeff = temp_coeff;
+ int idx;
+ __m256i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo,
+ b3_lo;
+ __m256i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi,
+ b3_hi;
+ __m256i b0, b1, b2, b3;
+ const __m256i zero = _mm256_setzero_si256();
+ for (idx = 0; idx < 4; ++idx) {
+ // src_diff: 9 bit, dynamic range [-255, 255]
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+ hadamard_16x16_avx2(src_ptr, src_stride,
+ (tran_low_t *)(t_coeff + idx * 256), 0);
+ }
+
+ for (idx = 0; idx < 256; idx += 16) {
+ const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+ const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
+ const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
+ const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));
+
+ // Sign extend 16 bit to 32 bit.
+ sign_extend_16bit_to_32bit_avx2(coeff0, zero, &coeff0_lo, &coeff0_hi);
+ sign_extend_16bit_to_32bit_avx2(coeff1, zero, &coeff1_lo, &coeff1_hi);
+ sign_extend_16bit_to_32bit_avx2(coeff2, zero, &coeff2_lo, &coeff2_hi);
+ sign_extend_16bit_to_32bit_avx2(coeff3, zero, &coeff3_lo, &coeff3_hi);
+
+ b0_lo = _mm256_add_epi32(coeff0_lo, coeff1_lo);
+ b0_hi = _mm256_add_epi32(coeff0_hi, coeff1_hi);
+
+ b1_lo = _mm256_sub_epi32(coeff0_lo, coeff1_lo);
+ b1_hi = _mm256_sub_epi32(coeff0_hi, coeff1_hi);
+
+ b2_lo = _mm256_add_epi32(coeff2_lo, coeff3_lo);
+ b2_hi = _mm256_add_epi32(coeff2_hi, coeff3_hi);
+
+ b3_lo = _mm256_sub_epi32(coeff2_lo, coeff3_lo);
+ b3_hi = _mm256_sub_epi32(coeff2_hi, coeff3_hi);
+
+ b0_lo = _mm256_srai_epi32(b0_lo, 2);
+ b1_lo = _mm256_srai_epi32(b1_lo, 2);
+ b2_lo = _mm256_srai_epi32(b2_lo, 2);
+ b3_lo = _mm256_srai_epi32(b3_lo, 2);
+
+ b0_hi = _mm256_srai_epi32(b0_hi, 2);
+ b1_hi = _mm256_srai_epi32(b1_hi, 2);
+ b2_hi = _mm256_srai_epi32(b2_hi, 2);
+ b3_hi = _mm256_srai_epi32(b3_hi, 2);
+
+ b0 = _mm256_packs_epi32(b0_lo, b0_hi);
+ b1 = _mm256_packs_epi32(b1_lo, b1_hi);
+ b2 = _mm256_packs_epi32(b2_lo, b2_hi);
+ b3 = _mm256_packs_epi32(b3_lo, b3_hi);
+
+ store_tran_low(_mm256_add_epi16(b0, b2), coeff);
+ store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256);
+ store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 512);
+ store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 768);
+
+ coeff += 16;
+ t_coeff += 16;
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static void highbd_hadamard_col8_avx2(__m256i *in, int iter) {
+ __m256i a0 = in[0];
+ __m256i a1 = in[1];
+ __m256i a2 = in[2];
+ __m256i a3 = in[3];
+ __m256i a4 = in[4];
+ __m256i a5 = in[5];
+ __m256i a6 = in[6];
+ __m256i a7 = in[7];
+
+ __m256i b0 = _mm256_add_epi32(a0, a1);
+ __m256i b1 = _mm256_sub_epi32(a0, a1);
+ __m256i b2 = _mm256_add_epi32(a2, a3);
+ __m256i b3 = _mm256_sub_epi32(a2, a3);
+ __m256i b4 = _mm256_add_epi32(a4, a5);
+ __m256i b5 = _mm256_sub_epi32(a4, a5);
+ __m256i b6 = _mm256_add_epi32(a6, a7);
+ __m256i b7 = _mm256_sub_epi32(a6, a7);
+
+ a0 = _mm256_add_epi32(b0, b2);
+ a1 = _mm256_add_epi32(b1, b3);
+ a2 = _mm256_sub_epi32(b0, b2);
+ a3 = _mm256_sub_epi32(b1, b3);
+ a4 = _mm256_add_epi32(b4, b6);
+ a5 = _mm256_add_epi32(b5, b7);
+ a6 = _mm256_sub_epi32(b4, b6);
+ a7 = _mm256_sub_epi32(b5, b7);
+
+ if (iter == 0) {
+ b0 = _mm256_add_epi32(a0, a4);
+ b7 = _mm256_add_epi32(a1, a5);
+ b3 = _mm256_add_epi32(a2, a6);
+ b4 = _mm256_add_epi32(a3, a7);
+ b2 = _mm256_sub_epi32(a0, a4);
+ b6 = _mm256_sub_epi32(a1, a5);
+ b1 = _mm256_sub_epi32(a2, a6);
+ b5 = _mm256_sub_epi32(a3, a7);
+
+ a0 = _mm256_unpacklo_epi32(b0, b1);
+ a1 = _mm256_unpacklo_epi32(b2, b3);
+ a2 = _mm256_unpackhi_epi32(b0, b1);
+ a3 = _mm256_unpackhi_epi32(b2, b3);
+ a4 = _mm256_unpacklo_epi32(b4, b5);
+ a5 = _mm256_unpacklo_epi32(b6, b7);
+ a6 = _mm256_unpackhi_epi32(b4, b5);
+ a7 = _mm256_unpackhi_epi32(b6, b7);
+
+ b0 = _mm256_unpacklo_epi64(a0, a1);
+ b1 = _mm256_unpacklo_epi64(a4, a5);
+ b2 = _mm256_unpackhi_epi64(a0, a1);
+ b3 = _mm256_unpackhi_epi64(a4, a5);
+ b4 = _mm256_unpacklo_epi64(a2, a3);
+ b5 = _mm256_unpacklo_epi64(a6, a7);
+ b6 = _mm256_unpackhi_epi64(a2, a3);
+ b7 = _mm256_unpackhi_epi64(a6, a7);
+
+ in[0] = _mm256_permute2x128_si256(b0, b1, 0x20);
+ in[1] = _mm256_permute2x128_si256(b0, b1, 0x31);
+ in[2] = _mm256_permute2x128_si256(b2, b3, 0x20);
+ in[3] = _mm256_permute2x128_si256(b2, b3, 0x31);
+ in[4] = _mm256_permute2x128_si256(b4, b5, 0x20);
+ in[5] = _mm256_permute2x128_si256(b4, b5, 0x31);
+ in[6] = _mm256_permute2x128_si256(b6, b7, 0x20);
+ in[7] = _mm256_permute2x128_si256(b6, b7, 0x31);
+ } else {
+ in[0] = _mm256_add_epi32(a0, a4);
+ in[7] = _mm256_add_epi32(a1, a5);
+ in[3] = _mm256_add_epi32(a2, a6);
+ in[4] = _mm256_add_epi32(a3, a7);
+ in[2] = _mm256_sub_epi32(a0, a4);
+ in[6] = _mm256_sub_epi32(a1, a5);
+ in[1] = _mm256_sub_epi32(a2, a6);
+ in[5] = _mm256_sub_epi32(a3, a7);
+ }
+}
+
+void aom_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ __m128i src16[8];
+ __m256i src32[8];
+
+ src16[0] = _mm_loadu_si128((const __m128i *)src_diff);
+ src16[1] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[2] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[3] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[7] = _mm_loadu_si128((const __m128i *)(src_diff + src_stride));
+
+ src32[0] = _mm256_cvtepi16_epi32(src16[0]);
+ src32[1] = _mm256_cvtepi16_epi32(src16[1]);
+ src32[2] = _mm256_cvtepi16_epi32(src16[2]);
+ src32[3] = _mm256_cvtepi16_epi32(src16[3]);
+ src32[4] = _mm256_cvtepi16_epi32(src16[4]);
+ src32[5] = _mm256_cvtepi16_epi32(src16[5]);
+ src32[6] = _mm256_cvtepi16_epi32(src16[6]);
+ src32[7] = _mm256_cvtepi16_epi32(src16[7]);
+
+ highbd_hadamard_col8_avx2(src32, 0);
+ highbd_hadamard_col8_avx2(src32, 1);
+
+ _mm256_storeu_si256((__m256i *)coeff, src32[0]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[1]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[2]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[3]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[4]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[5]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[6]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[7]);
+}
+
+void aom_highbd_hadamard_16x16_avx2(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff) {
+ int idx;
+ tran_low_t *t_coeff = coeff;
+ for (idx = 0; idx < 4; ++idx) {
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+ aom_highbd_hadamard_8x8_avx2(src_ptr, src_stride, t_coeff + idx * 64);
+ }
+
+ for (idx = 0; idx < 64; idx += 8) {
+ __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+ __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
+ __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
+ __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));
+
+ __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
+ __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
+ __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
+ __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);
+
+ b0 = _mm256_srai_epi32(b0, 1);
+ b1 = _mm256_srai_epi32(b1, 1);
+ b2 = _mm256_srai_epi32(b2, 1);
+ b3 = _mm256_srai_epi32(b3, 1);
+
+ coeff0 = _mm256_add_epi32(b0, b2);
+ coeff1 = _mm256_add_epi32(b1, b3);
+ coeff2 = _mm256_sub_epi32(b0, b2);
+ coeff3 = _mm256_sub_epi32(b1, b3);
+
+ _mm256_storeu_si256((__m256i *)coeff, coeff0);
+ _mm256_storeu_si256((__m256i *)(coeff + 64), coeff1);
+ _mm256_storeu_si256((__m256i *)(coeff + 128), coeff2);
+ _mm256_storeu_si256((__m256i *)(coeff + 192), coeff3);
+
+ coeff += 8;
+ t_coeff += 8;
+ }
+}
+
+void aom_highbd_hadamard_32x32_avx2(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff) {
+ int idx;
+ tran_low_t *t_coeff = coeff;
+ for (idx = 0; idx < 4; ++idx) {
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+ aom_highbd_hadamard_16x16_avx2(src_ptr, src_stride, t_coeff + idx * 256);
+ }
+
+ for (idx = 0; idx < 256; idx += 8) {
+ __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+ __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
+ __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
+ __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));
+
+ __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
+ __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
+ __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
+ __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);
+
+ b0 = _mm256_srai_epi32(b0, 2);
+ b1 = _mm256_srai_epi32(b1, 2);
+ b2 = _mm256_srai_epi32(b2, 2);
+ b3 = _mm256_srai_epi32(b3, 2);
+
+ coeff0 = _mm256_add_epi32(b0, b2);
+ coeff1 = _mm256_add_epi32(b1, b3);
+ coeff2 = _mm256_sub_epi32(b0, b2);
+ coeff3 = _mm256_sub_epi32(b1, b3);
+
+ _mm256_storeu_si256((__m256i *)coeff, coeff0);
+ _mm256_storeu_si256((__m256i *)(coeff + 256), coeff1);
+ _mm256_storeu_si256((__m256i *)(coeff + 512), coeff2);
+ _mm256_storeu_si256((__m256i *)(coeff + 768), coeff3);
+
+ coeff += 8;
+ t_coeff += 8;
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+int aom_satd_avx2(const tran_low_t *coeff, int length) {
+ __m256i accum = _mm256_setzero_si256();
+ int i;
+
+ for (i = 0; i < length; i += 8, coeff += 8) {
+ const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff);
+ const __m256i abs = _mm256_abs_epi32(src_line);
+ accum = _mm256_add_epi32(accum, abs);
+ }
+
+ { // 32 bit horizontal add
+ const __m256i a = _mm256_srli_si256(accum, 8);
+ const __m256i b = _mm256_add_epi32(accum, a);
+ const __m256i c = _mm256_srli_epi64(b, 32);
+ const __m256i d = _mm256_add_epi32(b, c);
+ const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d),
+ _mm256_extractf128_si256(d, 1));
+ return _mm_cvtsi128_si32(accum_128);
+ }
+}
+
+int aom_satd_lp_avx2(const int16_t *coeff, int length) {
+ const __m256i one = _mm256_set1_epi16(1);
+ __m256i accum = _mm256_setzero_si256();
+
+ for (int i = 0; i < length; i += 16) {
+ const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff);
+ const __m256i abs = _mm256_abs_epi16(src_line);
+ const __m256i sum = _mm256_madd_epi16(abs, one);
+ accum = _mm256_add_epi32(accum, sum);
+ coeff += 16;
+ }
+
+ { // 32 bit horizontal add
+ const __m256i a = _mm256_srli_si256(accum, 8);
+ const __m256i b = _mm256_add_epi32(accum, a);
+ const __m256i c = _mm256_srli_epi64(b, 32);
+ const __m256i d = _mm256_add_epi32(b, c);
+ const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d),
+ _mm256_extractf128_si256(d, 1));
+ return _mm_cvtsi128_si32(accum_128);
+ }
+}
+
+static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) {
+ __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo)));
+ a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1);
+ return a;
+}
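+// Loads 16 bytes from `lo` into the low 128-bit lane and 16 bytes from `hi`
+// into the high lane of the returned vector.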
+
+void aom_avg_8x8_quad_avx2(const uint8_t *s, int p, int x16_idx, int y16_idx,
+ int *avg) {
+ const uint8_t *s_y0 = s + y16_idx * p + x16_idx;
+ const uint8_t *s_y1 = s_y0 + 8 * p;
+ __m256i sum0, sum1, s0, s1, s2, s3, u0;
+ u0 = _mm256_setzero_si256();
+ s0 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1, s_y0), u0);
+ s1 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + p, s_y0 + p), u0);
+ s2 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 2 * p, s_y0 + 2 * p), u0);
+ s3 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 3 * p, s_y0 + 3 * p), u0);
+ sum0 = _mm256_add_epi16(s0, s1);
+ sum1 = _mm256_add_epi16(s2, s3);
+ s0 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 4 * p, s_y0 + 4 * p), u0);
+ s1 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 5 * p, s_y0 + 5 * p), u0);
+ s2 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 6 * p, s_y0 + 6 * p), u0);
+ s3 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 7 * p, s_y0 + 7 * p), u0);
+ sum0 = _mm256_add_epi16(sum0, _mm256_add_epi16(s0, s1));
+ sum1 = _mm256_add_epi16(sum1, _mm256_add_epi16(s2, s3));
+ sum0 = _mm256_add_epi16(sum0, sum1);
+
+  // Rounded average of each 8x8 block: (sum + 32) >> 6
+ __m256i rounding = _mm256_set1_epi32(32);
+ sum0 = _mm256_add_epi32(sum0, rounding);
+ sum0 = _mm256_srli_epi32(sum0, 6);
+ __m128i lo = _mm256_castsi256_si128(sum0);
+ __m128i hi = _mm256_extracti128_si256(sum0, 1);
+ avg[0] = _mm_cvtsi128_si32(lo);
+ avg[1] = _mm_extract_epi32(lo, 2);
+ avg[2] = _mm_cvtsi128_si32(hi);
+ avg[3] = _mm_extract_epi32(hi, 2);
+}
+
+void aom_int_pro_row_avx2(int16_t *hbuf, const uint8_t *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor) {
+  // The SIMD implementation assumes the width is a multiple of 16 and the
+  // height a multiple of 2. Other widths or heights would need additional
+  // SIMD support.
+ assert(width % 16 == 0 && height % 2 == 0);
+
+ if (width % 32 == 0) {
+ const __m256i zero = _mm256_setzero_si256();
+ for (int wd = 0; wd < width; wd += 32) {
+ const uint8_t *ref_tmp = ref + wd;
+ int16_t *hbuf_tmp = hbuf + wd;
+ __m256i s0 = zero;
+ __m256i s1 = zero;
+ int idx = 0;
+ do {
+ __m256i src_line = _mm256_loadu_si256((const __m256i *)ref_tmp);
+ __m256i t0 = _mm256_unpacklo_epi8(src_line, zero);
+ __m256i t1 = _mm256_unpackhi_epi8(src_line, zero);
+ s0 = _mm256_add_epi16(s0, t0);
+ s1 = _mm256_add_epi16(s1, t1);
+ ref_tmp += ref_stride;
+
+ src_line = _mm256_loadu_si256((const __m256i *)ref_tmp);
+ t0 = _mm256_unpacklo_epi8(src_line, zero);
+ t1 = _mm256_unpackhi_epi8(src_line, zero);
+ s0 = _mm256_add_epi16(s0, t0);
+ s1 = _mm256_add_epi16(s1, t1);
+ ref_tmp += ref_stride;
+ idx += 2;
+ } while (idx < height);
+ s0 = _mm256_srai_epi16(s0, norm_factor);
+ s1 = _mm256_srai_epi16(s1, norm_factor);
+ _mm_storeu_si128((__m128i *)(hbuf_tmp), _mm256_castsi256_si128(s0));
+ _mm_storeu_si128((__m128i *)(hbuf_tmp + 8), _mm256_castsi256_si128(s1));
+ _mm_storeu_si128((__m128i *)(hbuf_tmp + 16),
+ _mm256_extractf128_si256(s0, 1));
+ _mm_storeu_si128((__m128i *)(hbuf_tmp + 24),
+ _mm256_extractf128_si256(s1, 1));
+ }
+ } else if (width % 16 == 0) {
+ aom_int_pro_row_sse2(hbuf, ref, ref_stride, width, height, norm_factor);
+ }
+}
+
+static INLINE void load_from_src_buf(const uint8_t *ref1, __m256i *src,
+ const int stride) {
+ src[0] = _mm256_loadu_si256((const __m256i *)ref1);
+ src[1] = _mm256_loadu_si256((const __m256i *)(ref1 + stride));
+ src[2] = _mm256_loadu_si256((const __m256i *)(ref1 + (2 * stride)));
+ src[3] = _mm256_loadu_si256((const __m256i *)(ref1 + (3 * stride)));
+}
+
+#define CALC_TOT_SAD_AND_STORE \
+ /* r00 r10 x x r01 r11 x x | r02 r12 x x r03 r13 x x */ \
+ const __m256i r01 = _mm256_add_epi16(_mm256_slli_si256(r1, 2), r0); \
+ /* r00 r10 r20 x r01 r11 r21 x | r02 r12 r22 x r03 r13 r23 x */ \
+ const __m256i r012 = _mm256_add_epi16(_mm256_slli_si256(r2, 4), r01); \
+ /* r00 r10 r20 r30 r01 r11 r21 r31 | r02 r12 r22 r32 r03 r13 r23 r33 */ \
+ const __m256i result0 = _mm256_add_epi16(_mm256_slli_si256(r3, 6), r012); \
+ \
+ const __m128i results0 = _mm_add_epi16( \
+ _mm256_castsi256_si128(result0), _mm256_extractf128_si256(result0, 1)); \
+ const __m128i results1 = \
+ _mm_add_epi16(results0, _mm_srli_si128(results0, 8)); \
+ _mm_storel_epi64((__m128i *)vbuf, _mm_srli_epi16(results1, norm_factor));
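+// CALC_TOT_SAD_AND_STORE: the byte shifts above slide each row's SAD into its
+// own 16-bit slot, so every 64-bit lane holds the partial sums for rows 0..3
+// of one column group; the two 128-bit halves and the two quadwords are then
+// reduced together and the four normalized row sums are stored to vbuf.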
+
+static INLINE void aom_int_pro_col_16wd_avx2(int16_t *vbuf, const uint8_t *ref,
+ const int ref_stride,
+ const int height,
+ int norm_factor) {
+ const __m256i zero = _mm256_setzero_si256();
+ int ht = 0;
+  // After the SAD operation, the data sits in the lower 16 bits of each 64-bit
+  // lane and the upper 16 bits are zero. Eight rows are processed at a time to
+  // make efficient use of those upper 16 bits.
+ do {
+ __m256i src_00 =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(ref)));
+ src_00 = _mm256_inserti128_si256(
+ src_00, _mm_loadu_si128((const __m128i *)(ref + ref_stride * 4)), 1);
+ __m256i src_01 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(ref + ref_stride * 1)));
+ src_01 = _mm256_inserti128_si256(
+ src_01, _mm_loadu_si128((const __m128i *)(ref + ref_stride * 5)), 1);
+ __m256i src_10 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(ref + ref_stride * 2)));
+ src_10 = _mm256_inserti128_si256(
+ src_10, _mm_loadu_si128((const __m128i *)(ref + ref_stride * 6)), 1);
+ __m256i src_11 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(ref + ref_stride * 3)));
+ src_11 = _mm256_inserti128_si256(
+ src_11, _mm_loadu_si128((const __m128i *)(ref + ref_stride * 7)), 1);
+
+ // s00 x x x s01 x x x | s40 x x x s41 x x x
+ const __m256i s0 = _mm256_sad_epu8(src_00, zero);
+ // s10 x x x s11 x x x | s50 x x x s51 x x x
+ const __m256i s1 = _mm256_sad_epu8(src_01, zero);
+ // s20 x x x s21 x x x | s60 x x x s61 x x x
+ const __m256i s2 = _mm256_sad_epu8(src_10, zero);
+ // s30 x x x s31 x x x | s70 x x x s71 x x x
+ const __m256i s3 = _mm256_sad_epu8(src_11, zero);
+
+ // s00 s10 x x x x x x | s40 s50 x x x x x x
+ const __m256i s0_lo = _mm256_unpacklo_epi16(s0, s1);
+ // s01 s11 x x x x x x | s41 s51 x x x x x x
+ const __m256i s0_hi = _mm256_unpackhi_epi16(s0, s1);
+ // s20 s30 x x x x x x | s60 s70 x x x x x x
+ const __m256i s1_lo = _mm256_unpacklo_epi16(s2, s3);
+ // s21 s31 x x x x x x | s61 s71 x x x x x x
+ const __m256i s1_hi = _mm256_unpackhi_epi16(s2, s3);
+
+ // s0 s1 x x x x x x | s4 s5 x x x x x x
+ const __m256i s0_add = _mm256_add_epi16(s0_lo, s0_hi);
+ // s2 s3 x x x x x x | s6 s7 x x x x x x
+ const __m256i s1_add = _mm256_add_epi16(s1_lo, s1_hi);
+
+    // s0 s1 s2 s3 s4 s5 s6 s7
+ const __m128i results = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_unpacklo_epi32(s0_add, s1_add), 0x08));
+ _mm_storeu_si128((__m128i *)vbuf, _mm_srli_epi16(results, norm_factor));
+ vbuf += 8;
+ ref += (ref_stride << 3);
+ ht += 8;
+ } while (ht < height);
+}
+
+void aom_int_pro_col_avx2(int16_t *vbuf, const uint8_t *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor) {
+ assert(width % 16 == 0);
+ if (width == 128) {
+ const __m256i zero = _mm256_setzero_si256();
+ for (int ht = 0; ht < height; ht += 4) {
+ __m256i src[16];
+ // Load source data.
+ load_from_src_buf(ref, &src[0], ref_stride);
+ load_from_src_buf(ref + 32, &src[4], ref_stride);
+ load_from_src_buf(ref + 64, &src[8], ref_stride);
+ load_from_src_buf(ref + 96, &src[12], ref_stride);
+
+ // Row0 output: r00 x x x r01 x x x | r02 x x x r03 x x x
+ const __m256i s0 = _mm256_add_epi16(_mm256_sad_epu8(src[0], zero),
+ _mm256_sad_epu8(src[4], zero));
+ const __m256i s1 = _mm256_add_epi16(_mm256_sad_epu8(src[8], zero),
+ _mm256_sad_epu8(src[12], zero));
+ const __m256i r0 = _mm256_add_epi16(s0, s1);
+ // Row1 output: r10 x x x r11 x x x | r12 x x x r13 x x x
+ const __m256i s2 = _mm256_add_epi16(_mm256_sad_epu8(src[1], zero),
+ _mm256_sad_epu8(src[5], zero));
+ const __m256i s3 = _mm256_add_epi16(_mm256_sad_epu8(src[9], zero),
+ _mm256_sad_epu8(src[13], zero));
+ const __m256i r1 = _mm256_add_epi16(s2, s3);
+ // Row2 output: r20 x x x r21 x x x | r22 x x x r23 x x x
+ const __m256i s4 = _mm256_add_epi16(_mm256_sad_epu8(src[2], zero),
+ _mm256_sad_epu8(src[6], zero));
+ const __m256i s5 = _mm256_add_epi16(_mm256_sad_epu8(src[10], zero),
+ _mm256_sad_epu8(src[14], zero));
+ const __m256i r2 = _mm256_add_epi16(s4, s5);
+ // Row3 output: r30 x x x r31 x x x | r32 x x x r33 x x x
+ const __m256i s6 = _mm256_add_epi16(_mm256_sad_epu8(src[3], zero),
+ _mm256_sad_epu8(src[7], zero));
+ const __m256i s7 = _mm256_add_epi16(_mm256_sad_epu8(src[11], zero),
+ _mm256_sad_epu8(src[15], zero));
+ const __m256i r3 = _mm256_add_epi16(s6, s7);
+
+ CALC_TOT_SAD_AND_STORE
+ vbuf += 4;
+ ref += ref_stride << 2;
+ }
+ } else if (width == 64) {
+ const __m256i zero = _mm256_setzero_si256();
+ for (int ht = 0; ht < height; ht += 4) {
+ __m256i src[8];
+ // Load source data.
+ load_from_src_buf(ref, &src[0], ref_stride);
+ load_from_src_buf(ref + 32, &src[4], ref_stride);
+
+ // Row0 output: r00 x x x r01 x x x | r02 x x x r03 x x x
+ const __m256i s0 = _mm256_sad_epu8(src[0], zero);
+ const __m256i s1 = _mm256_sad_epu8(src[4], zero);
+ const __m256i r0 = _mm256_add_epi16(s0, s1);
+ // Row1 output: r10 x x x r11 x x x | r12 x x x r13 x x x
+ const __m256i s2 = _mm256_sad_epu8(src[1], zero);
+ const __m256i s3 = _mm256_sad_epu8(src[5], zero);
+ const __m256i r1 = _mm256_add_epi16(s2, s3);
+ // Row2 output: r20 x x x r21 x x x | r22 x x x r23 x x x
+ const __m256i s4 = _mm256_sad_epu8(src[2], zero);
+ const __m256i s5 = _mm256_sad_epu8(src[6], zero);
+ const __m256i r2 = _mm256_add_epi16(s4, s5);
+ // Row3 output: r30 x x x r31 x x x | r32 x x x r33 x x x
+ const __m256i s6 = _mm256_sad_epu8(src[3], zero);
+ const __m256i s7 = _mm256_sad_epu8(src[7], zero);
+ const __m256i r3 = _mm256_add_epi16(s6, s7);
+
+ CALC_TOT_SAD_AND_STORE
+ vbuf += 4;
+ ref += ref_stride << 2;
+ }
+ } else if (width == 32) {
+ assert(height % 2 == 0);
+ const __m256i zero = _mm256_setzero_si256();
+ for (int ht = 0; ht < height; ht += 4) {
+ __m256i src[4];
+ // Load source data.
+ load_from_src_buf(ref, &src[0], ref_stride);
+
+ // s00 x x x s01 x x x s02 x x x s03 x x x
+ const __m256i r0 = _mm256_sad_epu8(src[0], zero);
+ // s10 x x x s11 x x x s12 x x x s13 x x x
+ const __m256i r1 = _mm256_sad_epu8(src[1], zero);
+ // s20 x x x s21 x x x s22 x x x s23 x x x
+ const __m256i r2 = _mm256_sad_epu8(src[2], zero);
+ // s30 x x x s31 x x x s32 x x x s33 x x x
+ const __m256i r3 = _mm256_sad_epu8(src[3], zero);
+
+ CALC_TOT_SAD_AND_STORE
+ vbuf += 4;
+ ref += ref_stride << 2;
+ }
+ } else if (width == 16) {
+ aom_int_pro_col_16wd_avx2(vbuf, ref, ref_stride, height, norm_factor);
+ }
+}
+
+static inline void calc_vector_mean_sse_64wd(const int16_t *ref,
+ const int16_t *src, __m256i *mean,
+ __m256i *sse) {
+ const __m256i src_line0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i src_line1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+ const __m256i src_line2 = _mm256_loadu_si256((const __m256i *)(src + 32));
+ const __m256i src_line3 = _mm256_loadu_si256((const __m256i *)(src + 48));
+ const __m256i ref_line0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i ref_line1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+ const __m256i ref_line2 = _mm256_loadu_si256((const __m256i *)(ref + 32));
+ const __m256i ref_line3 = _mm256_loadu_si256((const __m256i *)(ref + 48));
+
+ const __m256i diff0 = _mm256_sub_epi16(ref_line0, src_line0);
+ const __m256i diff1 = _mm256_sub_epi16(ref_line1, src_line1);
+ const __m256i diff2 = _mm256_sub_epi16(ref_line2, src_line2);
+ const __m256i diff3 = _mm256_sub_epi16(ref_line3, src_line3);
+ const __m256i diff_sqr0 = _mm256_madd_epi16(diff0, diff0);
+ const __m256i diff_sqr1 = _mm256_madd_epi16(diff1, diff1);
+ const __m256i diff_sqr2 = _mm256_madd_epi16(diff2, diff2);
+ const __m256i diff_sqr3 = _mm256_madd_epi16(diff3, diff3);
+
+ *mean = _mm256_add_epi16(*mean, _mm256_add_epi16(diff0, diff1));
+ *mean = _mm256_add_epi16(*mean, diff2);
+ *mean = _mm256_add_epi16(*mean, diff3);
+ *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(diff_sqr0, diff_sqr1));
+ *sse = _mm256_add_epi32(*sse, diff_sqr2);
+ *sse = _mm256_add_epi32(*sse, diff_sqr3);
+}
+
+#define CALC_VAR_FROM_MEAN_SSE(mean, sse) \
+ { \
+ mean = _mm256_madd_epi16(mean, _mm256_set1_epi16(1)); \
+ mean = _mm256_hadd_epi32(mean, sse); \
+ mean = _mm256_add_epi32(mean, _mm256_bsrli_epi128(mean, 4)); \
+ const __m128i result = _mm_add_epi32(_mm256_castsi256_si128(mean), \
+ _mm256_extractf128_si256(mean, 1)); \
+ /*(mean * mean): dynamic range 31 bits.*/ \
+ const int mean_int = _mm_extract_epi32(result, 0); \
+ const int sse_int = _mm_extract_epi32(result, 2); \
+ const unsigned int mean_abs = abs(mean_int); \
+ var = sse_int - ((mean_abs * mean_abs) >> (bwl + 2)); \
+ }
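+// CALC_VAR_FROM_MEAN_SSE: the vector holds (4 << bwl) samples, so the shift by
+// (bwl + 2) divides the squared mean by the sample count; the macro therefore
+// computes var = sum(d * d) - (sum(d))^2 / n, with n = 4 << bwl and the
+// per-sample difference d = ref - src.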
+
+// ref: [0 - 510]
+// src: [0 - 510]
+// bwl: {2, 3, 4, 5}
+int aom_vector_var_avx2(const int16_t *ref, const int16_t *src, int bwl) {
+ const int width = 4 << bwl;
+ assert(width % 16 == 0 && width <= 128);
+ int var = 0;
+
+  // Instead of looping over the width in steps of 16, the loop is unrolled to
+  // avoid some addition operations.
+ if (width == 128) {
+ __m256i mean = _mm256_setzero_si256();
+ __m256i sse = _mm256_setzero_si256();
+
+ calc_vector_mean_sse_64wd(src, ref, &mean, &sse);
+ calc_vector_mean_sse_64wd(src + 64, ref + 64, &mean, &sse);
+ CALC_VAR_FROM_MEAN_SSE(mean, sse)
+ } else if (width == 64) {
+ __m256i mean = _mm256_setzero_si256();
+ __m256i sse = _mm256_setzero_si256();
+
+ calc_vector_mean_sse_64wd(src, ref, &mean, &sse);
+ CALC_VAR_FROM_MEAN_SSE(mean, sse)
+ } else if (width == 32) {
+ const __m256i src_line0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i ref_line0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i src_line1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+ const __m256i ref_line1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+
+ const __m256i diff0 = _mm256_sub_epi16(ref_line0, src_line0);
+ const __m256i diff1 = _mm256_sub_epi16(ref_line1, src_line1);
+ const __m256i diff_sqr0 = _mm256_madd_epi16(diff0, diff0);
+ const __m256i diff_sqr1 = _mm256_madd_epi16(diff1, diff1);
+ const __m256i sse = _mm256_add_epi32(diff_sqr0, diff_sqr1);
+ __m256i mean = _mm256_add_epi16(diff0, diff1);
+
+ CALC_VAR_FROM_MEAN_SSE(mean, sse)
+ } else if (width == 16) {
+ const __m256i src_line = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i ref_line = _mm256_loadu_si256((const __m256i *)ref);
+ __m256i mean = _mm256_sub_epi16(ref_line, src_line);
+ const __m256i sse = _mm256_madd_epi16(mean, mean);
+
+ CALC_VAR_FROM_MEAN_SSE(mean, sse)
+ }
+ return var;
+}
diff --git a/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c b/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c
new file mode 100644
index 0000000000..9ab9143eee
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c
@@ -0,0 +1,700 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/bitdepth_conversion_sse2.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_ports/mem.h"
+
+static INLINE void sign_extend_16bit_to_32bit_sse2(__m128i in, __m128i zero,
+ __m128i *out_lo,
+ __m128i *out_hi) {
+ const __m128i sign_bits = _mm_cmplt_epi16(in, zero);
+ *out_lo = _mm_unpacklo_epi16(in, sign_bits);
+ *out_hi = _mm_unpackhi_epi16(in, sign_bits);
+}
+
+static INLINE __m128i invert_sign_32_sse2(__m128i a, __m128i sign) {
+ a = _mm_xor_si128(a, sign);
+ return _mm_sub_epi32(a, sign);
+}
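+// With `sign` = x >> 31 (all ones for negative x, zero otherwise), the
+// xor/subtract pair above is the branchless absolute value used by
+// aom_satd_sse2() below.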
+
+void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
+ int *min, int *max) {
+ __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
+ u0 = _mm_setzero_si128();
+ // Row 0
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff0 = _mm_max_epi16(diff, negdiff);
+ // Row 1
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
+ minabsdiff = _mm_min_epi16(absdiff0, absdiff);
+ // Row 2
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 3
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 4
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 5
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 6
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 7
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+
+ maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
+ maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
+ maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
+ *max = _mm_extract_epi16(maxabsdiff, 0);
+
+ minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
+ minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
+ minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
+ *min = _mm_extract_epi16(minabsdiff, 0);
+}
+
+unsigned int aom_avg_8x8_sse2(const uint8_t *s, int p) {
+ __m128i sum0, sum1, s0, s1, s2, s3, u0;
+ unsigned int avg = 0;
+ u0 = _mm_setzero_si128();
+ s0 = loadh_epi64((const __m128i *)(s + p),
+ _mm_loadl_epi64((const __m128i *)(s)));
+ s1 = loadh_epi64((const __m128i *)(s + 3 * p),
+ _mm_loadl_epi64((const __m128i *)(s + 2 * p)));
+ s2 = loadh_epi64((const __m128i *)(s + 5 * p),
+ _mm_loadl_epi64((const __m128i *)(s + 4 * p)));
+ s3 = loadh_epi64((const __m128i *)(s + 7 * p),
+ _mm_loadl_epi64((const __m128i *)(s + 6 * p)));
+ s0 = _mm_sad_epu8(s0, u0);
+ s1 = _mm_sad_epu8(s1, u0);
+ s2 = _mm_sad_epu8(s2, u0);
+ s3 = _mm_sad_epu8(s3, u0);
+
+ sum0 = _mm_add_epi16(s0, s1);
+ sum1 = _mm_add_epi16(s2, s3);
+ sum0 = _mm_add_epi16(sum0, sum1);
+ sum0 = _mm_add_epi16(sum0, _mm_srli_si128(sum0, 8));
+ avg = _mm_cvtsi128_si32(sum0);
+ return (avg + 32) >> 6;
+}
+
+void calc_avg_8x8_dual_sse2(const uint8_t *s, int p, int *avg) {
+ __m128i sum0, sum1, s0, s1, s2, s3, u0;
+ u0 = _mm_setzero_si128();
+ s0 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s)), u0);
+ s1 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + p)), u0);
+ s2 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 2 * p)), u0);
+ s3 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 3 * p)), u0);
+ sum0 = _mm_add_epi16(s0, s1);
+ sum1 = _mm_add_epi16(s2, s3);
+ s0 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 4 * p)), u0);
+ s1 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 5 * p)), u0);
+ s2 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 6 * p)), u0);
+ s3 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 7 * p)), u0);
+ sum0 = _mm_add_epi16(sum0, _mm_add_epi16(s0, s1));
+ sum1 = _mm_add_epi16(sum1, _mm_add_epi16(s2, s3));
+ sum0 = _mm_add_epi16(sum0, sum1);
+
+  // Rounded average of each 8x8 block: (sum + 32) >> 6
+ __m128i rounding = _mm_set1_epi32(32);
+ sum0 = _mm_add_epi32(sum0, rounding);
+ sum0 = _mm_srli_epi32(sum0, 6);
+ avg[0] = _mm_cvtsi128_si32(sum0);
+ avg[1] = _mm_extract_epi16(sum0, 4);
+}
+
+void aom_avg_8x8_quad_sse2(const uint8_t *s, int p, int x16_idx, int y16_idx,
+ int *avg) {
+ const uint8_t *s_ptr = s + y16_idx * p + x16_idx;
+ for (int k = 0; k < 2; k++) {
+ calc_avg_8x8_dual_sse2(s_ptr, p, avg + k * 2);
+ s_ptr += 8 * p;
+ }
+}
+
+unsigned int aom_avg_4x4_sse2(const uint8_t *s, int p) {
+ __m128i s0, s1, u0;
+ unsigned int avg = 0;
+ u0 = _mm_setzero_si128();
+ s0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)(s)),
+ _mm_cvtsi32_si128(*(const int *)(s + p)));
+ s1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)(s + p * 2)),
+ _mm_cvtsi32_si128(*(const int *)(s + p * 3)));
+ s0 = _mm_sad_epu8(s0, u0);
+ s1 = _mm_sad_epu8(s1, u0);
+ s0 = _mm_add_epi16(s0, s1);
+ avg = _mm_cvtsi128_si32(s0);
+ return (avg + 8) >> 4;
+}
+
+static INLINE void hadamard_col4_sse2(__m128i *in, int iter) {
+ const __m128i a0 = in[0];
+ const __m128i a1 = in[1];
+ const __m128i a2 = in[2];
+ const __m128i a3 = in[3];
+ const __m128i b0 = _mm_srai_epi16(_mm_add_epi16(a0, a1), 1);
+ const __m128i b1 = _mm_srai_epi16(_mm_sub_epi16(a0, a1), 1);
+ const __m128i b2 = _mm_srai_epi16(_mm_add_epi16(a2, a3), 1);
+ const __m128i b3 = _mm_srai_epi16(_mm_sub_epi16(a2, a3), 1);
+ in[0] = _mm_add_epi16(b0, b2);
+ in[1] = _mm_add_epi16(b1, b3);
+ in[2] = _mm_sub_epi16(b0, b2);
+ in[3] = _mm_sub_epi16(b1, b3);
+
+ if (iter == 0) {
+ const __m128i ba = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i dc = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i dcba_lo = _mm_unpacklo_epi32(ba, dc);
+ const __m128i dcba_hi = _mm_unpackhi_epi32(ba, dc);
+ in[0] = dcba_lo;
+ in[1] = _mm_srli_si128(dcba_lo, 8);
+ in[2] = dcba_hi;
+ in[3] = _mm_srli_si128(dcba_hi, 8);
+ }
+}
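+// Note: the unpack sequence in the iter == 0 branch transposes the 4x4 block,
+// so the second call applies the butterflies along the other dimension.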
+
+void aom_hadamard_4x4_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ __m128i src[4];
+ src[0] = _mm_loadl_epi64((const __m128i *)src_diff);
+ src[1] = _mm_loadl_epi64((const __m128i *)(src_diff += src_stride));
+ src[2] = _mm_loadl_epi64((const __m128i *)(src_diff += src_stride));
+ src[3] = _mm_loadl_epi64((const __m128i *)(src_diff + src_stride));
+
+ hadamard_col4_sse2(src, 0);
+ hadamard_col4_sse2(src, 1);
+
+ store_tran_low(_mm_unpacklo_epi64(src[0], src[1]), coeff);
+ coeff += 8;
+ store_tran_low(_mm_unpacklo_epi64(src[2], src[3]), coeff);
+}
+
+static INLINE void hadamard_col8_sse2(__m128i *in, int iter) {
+ __m128i a0 = in[0];
+ __m128i a1 = in[1];
+ __m128i a2 = in[2];
+ __m128i a3 = in[3];
+ __m128i a4 = in[4];
+ __m128i a5 = in[5];
+ __m128i a6 = in[6];
+ __m128i a7 = in[7];
+
+ __m128i b0 = _mm_add_epi16(a0, a1);
+ __m128i b1 = _mm_sub_epi16(a0, a1);
+ __m128i b2 = _mm_add_epi16(a2, a3);
+ __m128i b3 = _mm_sub_epi16(a2, a3);
+ __m128i b4 = _mm_add_epi16(a4, a5);
+ __m128i b5 = _mm_sub_epi16(a4, a5);
+ __m128i b6 = _mm_add_epi16(a6, a7);
+ __m128i b7 = _mm_sub_epi16(a6, a7);
+
+ a0 = _mm_add_epi16(b0, b2);
+ a1 = _mm_add_epi16(b1, b3);
+ a2 = _mm_sub_epi16(b0, b2);
+ a3 = _mm_sub_epi16(b1, b3);
+ a4 = _mm_add_epi16(b4, b6);
+ a5 = _mm_add_epi16(b5, b7);
+ a6 = _mm_sub_epi16(b4, b6);
+ a7 = _mm_sub_epi16(b5, b7);
+
+ if (iter == 0) {
+ b0 = _mm_add_epi16(a0, a4);
+ b7 = _mm_add_epi16(a1, a5);
+ b3 = _mm_add_epi16(a2, a6);
+ b4 = _mm_add_epi16(a3, a7);
+ b2 = _mm_sub_epi16(a0, a4);
+ b6 = _mm_sub_epi16(a1, a5);
+ b1 = _mm_sub_epi16(a2, a6);
+ b5 = _mm_sub_epi16(a3, a7);
+
+ a0 = _mm_unpacklo_epi16(b0, b1);
+ a1 = _mm_unpacklo_epi16(b2, b3);
+ a2 = _mm_unpackhi_epi16(b0, b1);
+ a3 = _mm_unpackhi_epi16(b2, b3);
+ a4 = _mm_unpacklo_epi16(b4, b5);
+ a5 = _mm_unpacklo_epi16(b6, b7);
+ a6 = _mm_unpackhi_epi16(b4, b5);
+ a7 = _mm_unpackhi_epi16(b6, b7);
+
+ b0 = _mm_unpacklo_epi32(a0, a1);
+ b1 = _mm_unpacklo_epi32(a4, a5);
+ b2 = _mm_unpackhi_epi32(a0, a1);
+ b3 = _mm_unpackhi_epi32(a4, a5);
+ b4 = _mm_unpacklo_epi32(a2, a3);
+ b5 = _mm_unpacklo_epi32(a6, a7);
+ b6 = _mm_unpackhi_epi32(a2, a3);
+ b7 = _mm_unpackhi_epi32(a6, a7);
+
+ in[0] = _mm_unpacklo_epi64(b0, b1);
+ in[1] = _mm_unpackhi_epi64(b0, b1);
+ in[2] = _mm_unpacklo_epi64(b2, b3);
+ in[3] = _mm_unpackhi_epi64(b2, b3);
+ in[4] = _mm_unpacklo_epi64(b4, b5);
+ in[5] = _mm_unpackhi_epi64(b4, b5);
+ in[6] = _mm_unpacklo_epi64(b6, b7);
+ in[7] = _mm_unpackhi_epi64(b6, b7);
+ } else {
+ in[0] = _mm_add_epi16(a0, a4);
+ in[7] = _mm_add_epi16(a1, a5);
+ in[3] = _mm_add_epi16(a2, a6);
+ in[4] = _mm_add_epi16(a3, a7);
+ in[2] = _mm_sub_epi16(a0, a4);
+ in[6] = _mm_sub_epi16(a1, a5);
+ in[1] = _mm_sub_epi16(a2, a6);
+ in[5] = _mm_sub_epi16(a3, a7);
+ }
+}
+
+static INLINE void hadamard_8x8_sse2(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff,
+ int is_final) {
+ __m128i src[8];
+ src[0] = _mm_load_si128((const __m128i *)src_diff);
+ src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[7] = _mm_load_si128((const __m128i *)(src_diff + src_stride));
+
+ hadamard_col8_sse2(src, 0);
+ hadamard_col8_sse2(src, 1);
+
+ if (is_final) {
+ store_tran_low(src[0], coeff);
+ coeff += 8;
+ store_tran_low(src[1], coeff);
+ coeff += 8;
+ store_tran_low(src[2], coeff);
+ coeff += 8;
+ store_tran_low(src[3], coeff);
+ coeff += 8;
+ store_tran_low(src[4], coeff);
+ coeff += 8;
+ store_tran_low(src[5], coeff);
+ coeff += 8;
+ store_tran_low(src[6], coeff);
+ coeff += 8;
+ store_tran_low(src[7], coeff);
+ } else {
+ int16_t *coeff16 = (int16_t *)coeff;
+ _mm_store_si128((__m128i *)coeff16, src[0]);
+ coeff16 += 8;
+ _mm_store_si128((__m128i *)coeff16, src[1]);
+ coeff16 += 8;
+ _mm_store_si128((__m128i *)coeff16, src[2]);
+ coeff16 += 8;
+ _mm_store_si128((__m128i *)coeff16, src[3]);
+ coeff16 += 8;
+ _mm_store_si128((__m128i *)coeff16, src[4]);
+ coeff16 += 8;
+ _mm_store_si128((__m128i *)coeff16, src[5]);
+ coeff16 += 8;
+ _mm_store_si128((__m128i *)coeff16, src[6]);
+ coeff16 += 8;
+ _mm_store_si128((__m128i *)coeff16, src[7]);
+ }
+}
+
+void aom_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ hadamard_8x8_sse2(src_diff, src_stride, coeff, 1);
+}
+
+static INLINE void hadamard_lp_8x8_sse2(const int16_t *src_diff,
+ ptrdiff_t src_stride, int16_t *coeff) {
+ __m128i src[8];
+ src[0] = _mm_load_si128((const __m128i *)src_diff);
+ src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[7] = _mm_load_si128((const __m128i *)(src_diff + src_stride));
+
+ hadamard_col8_sse2(src, 0);
+ hadamard_col8_sse2(src, 1);
+
+ _mm_store_si128((__m128i *)coeff, src[0]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[1]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[2]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[3]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[4]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[5]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[6]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[7]);
+}
+
+void aom_hadamard_lp_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+ int16_t *coeff) {
+ hadamard_lp_8x8_sse2(src_diff, src_stride, coeff);
+}
+
+void aom_hadamard_lp_8x8_dual_sse2(const int16_t *src_diff,
+ ptrdiff_t src_stride, int16_t *coeff) {
+ for (int i = 0; i < 2; i++) {
+ hadamard_lp_8x8_sse2(src_diff + (i * 8), src_stride, coeff + (i * 64));
+ }
+}
+
+void aom_hadamard_lp_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+ int16_t *coeff) {
+ for (int idx = 0; idx < 4; ++idx) {
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+ hadamard_lp_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
+ }
+
+ int16_t *t_coeff = coeff;
+ for (int idx = 0; idx < 64; idx += 8) {
+ __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
+ __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64));
+ __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128));
+ __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192));
+
+ __m128i b0 = _mm_add_epi16(coeff0, coeff1);
+ __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
+ __m128i b2 = _mm_add_epi16(coeff2, coeff3);
+ __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
+
+ b0 = _mm_srai_epi16(b0, 1);
+ b1 = _mm_srai_epi16(b1, 1);
+ b2 = _mm_srai_epi16(b2, 1);
+ b3 = _mm_srai_epi16(b3, 1);
+
+ coeff0 = _mm_add_epi16(b0, b2);
+ coeff1 = _mm_add_epi16(b1, b3);
+ coeff2 = _mm_sub_epi16(b0, b2);
+ coeff3 = _mm_sub_epi16(b1, b3);
+
+ _mm_store_si128((__m128i *)t_coeff, coeff0);
+ _mm_store_si128((__m128i *)(t_coeff + 64), coeff1);
+ _mm_store_si128((__m128i *)(t_coeff + 128), coeff2);
+ _mm_store_si128((__m128i *)(t_coeff + 192), coeff3);
+
+ t_coeff += 8;
+ }
+}
+
+static INLINE void hadamard_16x16_sse2(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff,
+ int is_final) {
+ // For high bitdepths, it is unnecessary to store_tran_low
+ // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
+ // next stage. Output to an intermediate buffer first, then store_tran_low()
+ // in the final stage.
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
+ int16_t *t_coeff = temp_coeff;
+ int16_t *coeff16 = (int16_t *)coeff;
+ int idx;
+ for (idx = 0; idx < 4; ++idx) {
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+ hadamard_8x8_sse2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 64),
+ 0);
+ }
+
+ for (idx = 0; idx < 64; idx += 8) {
+ __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
+ __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64));
+ __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128));
+ __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192));
+
+ __m128i b0 = _mm_add_epi16(coeff0, coeff1);
+ __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
+ __m128i b2 = _mm_add_epi16(coeff2, coeff3);
+ __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
+
+ b0 = _mm_srai_epi16(b0, 1);
+ b1 = _mm_srai_epi16(b1, 1);
+ b2 = _mm_srai_epi16(b2, 1);
+ b3 = _mm_srai_epi16(b3, 1);
+
+ coeff0 = _mm_add_epi16(b0, b2);
+ coeff1 = _mm_add_epi16(b1, b3);
+ coeff2 = _mm_sub_epi16(b0, b2);
+ coeff3 = _mm_sub_epi16(b1, b3);
+
+ if (is_final) {
+ store_tran_low_offset_4(coeff0, coeff);
+ store_tran_low_offset_4(coeff1, coeff + 64);
+ store_tran_low_offset_4(coeff2, coeff + 128);
+ store_tran_low_offset_4(coeff3, coeff + 192);
+ coeff += 4;
+ } else {
+ _mm_store_si128((__m128i *)coeff16, coeff0);
+ _mm_store_si128((__m128i *)(coeff16 + 64), coeff1);
+ _mm_store_si128((__m128i *)(coeff16 + 128), coeff2);
+ _mm_store_si128((__m128i *)(coeff16 + 192), coeff3);
+ coeff16 += 8;
+ }
+
+ t_coeff += 8;
+    // Additionally advance the pointer by 0 or 8 on alternate iterations
+    // (instead of a fixed 8) to keep the output layout consistent with
+    // store_tran_low_offset_4().
+ coeff += (((idx >> 3) & 1) << 3);
+ }
+}
+
+void aom_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ hadamard_16x16_sse2(src_diff, src_stride, coeff, 1);
+}
+
+void aom_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ // For high bitdepths, it is unnecessary to store_tran_low
+ // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
+ // next stage. Output to an intermediate buffer first, then store_tran_low()
+ // in the final stage.
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
+ int16_t *t_coeff = temp_coeff;
+ int idx;
+ __m128i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo,
+ b3_lo;
+ __m128i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi,
+ b3_hi;
+ __m128i b0, b1, b2, b3;
+ const __m128i zero = _mm_setzero_si128();
+ for (idx = 0; idx < 4; ++idx) {
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+ hadamard_16x16_sse2(src_ptr, src_stride,
+ (tran_low_t *)(t_coeff + idx * 256), 0);
+ }
+
+ for (idx = 0; idx < 256; idx += 8) {
+ __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
+ __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 256));
+ __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512));
+ __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768));
+
+ // Sign extend 16 bit to 32 bit.
+ sign_extend_16bit_to_32bit_sse2(coeff0, zero, &coeff0_lo, &coeff0_hi);
+ sign_extend_16bit_to_32bit_sse2(coeff1, zero, &coeff1_lo, &coeff1_hi);
+ sign_extend_16bit_to_32bit_sse2(coeff2, zero, &coeff2_lo, &coeff2_hi);
+ sign_extend_16bit_to_32bit_sse2(coeff3, zero, &coeff3_lo, &coeff3_hi);
+
+ b0_lo = _mm_add_epi32(coeff0_lo, coeff1_lo);
+ b0_hi = _mm_add_epi32(coeff0_hi, coeff1_hi);
+
+ b1_lo = _mm_sub_epi32(coeff0_lo, coeff1_lo);
+ b1_hi = _mm_sub_epi32(coeff0_hi, coeff1_hi);
+
+ b2_lo = _mm_add_epi32(coeff2_lo, coeff3_lo);
+ b2_hi = _mm_add_epi32(coeff2_hi, coeff3_hi);
+
+ b3_lo = _mm_sub_epi32(coeff2_lo, coeff3_lo);
+ b3_hi = _mm_sub_epi32(coeff2_hi, coeff3_hi);
+
+ b0_lo = _mm_srai_epi32(b0_lo, 2);
+ b1_lo = _mm_srai_epi32(b1_lo, 2);
+ b2_lo = _mm_srai_epi32(b2_lo, 2);
+ b3_lo = _mm_srai_epi32(b3_lo, 2);
+
+ b0_hi = _mm_srai_epi32(b0_hi, 2);
+ b1_hi = _mm_srai_epi32(b1_hi, 2);
+ b2_hi = _mm_srai_epi32(b2_hi, 2);
+ b3_hi = _mm_srai_epi32(b3_hi, 2);
+
+ b0 = _mm_packs_epi32(b0_lo, b0_hi);
+ b1 = _mm_packs_epi32(b1_lo, b1_hi);
+ b2 = _mm_packs_epi32(b2_lo, b2_hi);
+ b3 = _mm_packs_epi32(b3_lo, b3_hi);
+
+ coeff0 = _mm_add_epi16(b0, b2);
+ coeff1 = _mm_add_epi16(b1, b3);
+ store_tran_low_offset_4(coeff0, coeff);
+ store_tran_low_offset_4(coeff1, coeff + 256);
+
+ coeff2 = _mm_sub_epi16(b0, b2);
+ coeff3 = _mm_sub_epi16(b1, b3);
+ store_tran_low_offset_4(coeff2, coeff + 512);
+ store_tran_low_offset_4(coeff3, coeff + 768);
+
+ // Advance the pointer by 4 and 12 on alternate iterations (rather than a
+ // fixed 8) to stay consistent with the output layout of
+ // store_tran_low_offset_4().
+ coeff += (4 + (((idx >> 3) & 1) << 3));
+ t_coeff += 8;
+ }
+}
+
+int aom_satd_sse2(const tran_low_t *coeff, int length) {
+ int i;
+ const __m128i zero = _mm_setzero_si128();
+ __m128i accum = zero;
+
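+ // Per 32-bit lane, coeff_sign is 0 or -1 (arithmetic shift of the sign bit)
+ // and invert_sign_32_sse2() uses it to form the absolute value, which is
+ // accumulated into 'accum'.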
+ for (i = 0; i < length; i += 4) {
+ const __m128i src_line = _mm_load_si128((const __m128i *)coeff);
+ const __m128i coeff_sign = _mm_srai_epi32(src_line, 31);
+ const __m128i abs_coeff = invert_sign_32_sse2(src_line, coeff_sign);
+ accum = _mm_add_epi32(accum, abs_coeff);
+ coeff += 4;
+ }
+
+ { // cascading summation of accum
+ __m128i hi = _mm_srli_si128(accum, 8);
+ accum = _mm_add_epi32(accum, hi);
+ hi = _mm_srli_epi64(accum, 32);
+ accum = _mm_add_epi32(accum, hi);
+ }
+
+ return _mm_cvtsi128_si32(accum);
+}
+
+int aom_satd_lp_sse2(const int16_t *coeff, int length) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i accum = zero;
+
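+ // abs() of each 16-bit lane is taken as max(x, -x); _mm_madd_epi16 with 1
+ // then widens to 32 bits while summing adjacent pairs, so the accumulation
+ // stays in 32-bit precision.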
+ for (int i = 0; i < length; i += 16) {
+ const __m128i src_line0 = _mm_loadu_si128((const __m128i *)coeff);
+ const __m128i src_line1 = _mm_loadu_si128((const __m128i *)(coeff + 8));
+ const __m128i inv0 = _mm_sub_epi16(zero, src_line0);
+ const __m128i inv1 = _mm_sub_epi16(zero, src_line1);
+ const __m128i abs0 = _mm_max_epi16(src_line0, inv0); // abs(src_line)
+ const __m128i abs1 = _mm_max_epi16(src_line1, inv1); // abs(src_line)
+ const __m128i sum0 = _mm_madd_epi16(abs0, one);
+ const __m128i sum1 = _mm_madd_epi16(abs1, one);
+ accum = _mm_add_epi32(accum, sum0);
+ accum = _mm_add_epi32(accum, sum1);
+ coeff += 16;
+ }
+
+ { // cascading summation of accum
+ __m128i hi = _mm_srli_si128(accum, 8);
+ accum = _mm_add_epi32(accum, hi);
+ hi = _mm_srli_epi64(accum, 32);
+ accum = _mm_add_epi32(accum, hi);
+ }
+
+ return _mm_cvtsi128_si32(accum);
+}
+
+void aom_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor) {
+ // The SIMD implementation assumes the width is a multiple of 16 and the
+ // height a multiple of 2. Other widths or heights would need additional SIMD
+ // support.
+ assert(width % 16 == 0 && height % 2 == 0);
+ __m128i zero = _mm_setzero_si128();
+
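+ // For each group of 16 columns, accumulate 16-bit column sums down the rows
+ // (two rows per loop iteration), then scale the projection by >> norm_factor
+ // before storing to hbuf.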
+ for (int wd = 0; wd < width; wd += 16) {
+ const uint8_t *ref_tmp = ref + wd;
+ int16_t *hbuf_tmp = hbuf + wd;
+ __m128i s0 = zero;
+ __m128i s1 = zero;
+ int idx = 0;
+ do {
+ __m128i src_line = _mm_loadu_si128((const __m128i *)ref_tmp);
+ __m128i t0 = _mm_unpacklo_epi8(src_line, zero);
+ __m128i t1 = _mm_unpackhi_epi8(src_line, zero);
+ s0 = _mm_add_epi16(s0, t0);
+ s1 = _mm_add_epi16(s1, t1);
+ ref_tmp += ref_stride;
+
+ src_line = _mm_loadu_si128((const __m128i *)ref_tmp);
+ t0 = _mm_unpacklo_epi8(src_line, zero);
+ t1 = _mm_unpackhi_epi8(src_line, zero);
+ s0 = _mm_add_epi16(s0, t0);
+ s1 = _mm_add_epi16(s1, t1);
+ ref_tmp += ref_stride;
+ idx += 2;
+ } while (idx < height);
+
+ s0 = _mm_srai_epi16(s0, norm_factor);
+ s1 = _mm_srai_epi16(s1, norm_factor);
+ _mm_storeu_si128((__m128i *)(hbuf_tmp), s0);
+ _mm_storeu_si128((__m128i *)(hbuf_tmp + 8), s1);
+ }
+}
+
+void aom_int_pro_col_sse2(int16_t *vbuf, const uint8_t *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor) {
+ // The SIMD implementation assumes the width is a multiple of 16.
+ assert(width % 16 == 0);
+
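+ // For each row, _mm_sad_epu8 against zero sums groups of 8 bytes into two
+ // 64-bit partial sums, which are then combined and scaled by >> norm_factor
+ // to give the row's projection value.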
+ for (int ht = 0; ht < height; ht++) {
+ const uint8_t *ref_tmp = ref + (ht * ref_stride);
+ __m128i zero = _mm_setzero_si128();
+ __m128i s0 = zero;
+ __m128i s1, src_line;
+ for (int i = 0; i < width; i += 16) {
+ src_line = _mm_loadu_si128((const __m128i *)ref_tmp);
+ s1 = _mm_sad_epu8(src_line, zero);
+ s0 = _mm_add_epi16(s0, s1);
+ ref_tmp += 16;
+ }
+
+ s1 = _mm_srli_si128(s0, 8);
+ s0 = _mm_add_epi16(s0, s1);
+ vbuf[ht] = _mm_cvtsi128_si32(s0) >> norm_factor;
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/avg_intrin_sse4.c b/third_party/aom/aom_dsp/x86/avg_intrin_sse4.c
new file mode 100644
index 0000000000..b83b43122a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/avg_intrin_sse4.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+// ref: [0 - 510]
+// src: [0 - 510]
+// bwl: {2, 3, 4, 5}
+int aom_vector_var_sse4_1(const int16_t *ref, const int16_t *src, int bwl) {
+ const int width = 4 << bwl;
+ assert(width % 16 == 0);
+
+ const __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1);
+ __m128i mean = _mm_setzero_si128();
+ __m128i sse = _mm_setzero_si128();
+
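+ // Accumulate the per-lane sums of (ref - src) in 'mean' and of the squared
+ // differences in 'sse'. The final variance is sse - mean^2 / width, where
+ // width = 4 << bwl, i.e. mean^2 >> (bwl + 2).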
+ for (int i = 0; i < width; i += 16) {
+ const __m128i src_line = _mm_loadu_si128((const __m128i *)src);
+ const __m128i ref_line = _mm_loadu_si128((const __m128i *)ref);
+ const __m128i src_line2 = _mm_loadu_si128((const __m128i *)(src + 8));
+ const __m128i ref_line2 = _mm_loadu_si128((const __m128i *)(ref + 8));
+ __m128i diff = _mm_sub_epi16(ref_line, src_line);
+ const __m128i diff2 = _mm_sub_epi16(ref_line2, src_line2);
+ __m128i diff_sqr = _mm_madd_epi16(diff, diff);
+ const __m128i diff_sqr2 = _mm_madd_epi16(diff2, diff2);
+
+ diff = _mm_add_epi16(diff, diff2);
+ diff_sqr = _mm_add_epi32(diff_sqr, diff_sqr2);
+ sse = _mm_add_epi32(sse, diff_sqr);
+ mean = _mm_add_epi16(mean, diff);
+
+ src += 16;
+ ref += 16;
+ }
+
+ // m0 m1 m2 m3
+ mean = _mm_madd_epi16(mean, k_one_epi16);
+ // m0+m1 m2+m3 s0+s1 s2+s3
+ __m128i result = _mm_hadd_epi32(mean, sse);
+ // m0+m1+m2+m3 s0+s1+s2+s3 x x
+ result = _mm_add_epi32(result, _mm_bsrli_si128(result, 4));
+
+ // (mean * mean): dynamic range 31 bits.
+ const int mean_int = _mm_extract_epi32(result, 0);
+ const int sse_int = _mm_extract_epi32(result, 2);
+ const unsigned int mean_abs = abs(mean_int);
+ const int var = sse_int - ((mean_abs * mean_abs) >> (bwl + 2));
+ return var;
+}
diff --git a/third_party/aom/aom_dsp/x86/bitdepth_conversion_avx2.h b/third_party/aom/aom_dsp/x86/bitdepth_conversion_avx2.h
new file mode 100644
index 0000000000..85896e2768
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/bitdepth_conversion_avx2.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
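+// Load 16 32-bit coefficients and pack them down to 16 bits with saturation.
+// _mm256_packs_epi32 works per 128-bit lane, so the register ordering differs
+// from memory order; store_tran_low() below applies the matching inverse.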
+static INLINE __m256i load_tran_low(const tran_low_t *a) {
+ const __m256i a_low = _mm256_loadu_si256((const __m256i *)a);
+ const __m256i a_high = _mm256_loadu_si256((const __m256i *)(a + 8));
+ return _mm256_packs_epi32(a_low, a_high);
+}
+
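+// Store 16 values to a 32-bit tran_low_t buffer: mulhi/mullo by 1 produce the
+// sign-extension and low halves, and the per-lane unpacks interleave them back
+// into 32-bit values in memory order.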
+static INLINE void store_tran_low(__m256i a, tran_low_t *b) {
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i a_hi = _mm256_mulhi_epi16(a, one);
+ const __m256i a_lo = _mm256_mullo_epi16(a, one);
+ const __m256i a_1 = _mm256_unpacklo_epi16(a_lo, a_hi);
+ const __m256i a_2 = _mm256_unpackhi_epi16(a_lo, a_hi);
+ _mm256_storeu_si256((__m256i *)b, a_1);
+ _mm256_storeu_si256((__m256i *)(b + 8), a_2);
+}
diff --git a/third_party/aom/aom_dsp/x86/bitdepth_conversion_sse2.h b/third_party/aom/aom_dsp/x86/bitdepth_conversion_sse2.h
new file mode 100644
index 0000000000..ff77760b6f
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/bitdepth_conversion_sse2.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <xmmintrin.h>
+
+#include "config/aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+// Load 8 16 bit values. If the source is 32 bits then pack down with
+// saturation.
+static INLINE __m128i load_tran_low(const tran_low_t *a) {
+ const __m128i a_low = _mm_load_si128((const __m128i *)a);
+ return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4));
+}
+
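+// Sign extend each 16-bit lane of 'a' to 32 bits: mulhi(a, 1) gives the high
+// (sign) halves, mullo(a, 1) the low halves, and the unpacks interleave them
+// into two vectors of four 32-bit values.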
+static INLINE void unpack_trans(__m128i a, __m128i *a_1, __m128i *a_2) {
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i a_hi = _mm_mulhi_epi16(a, one);
+ const __m128i a_lo = _mm_mullo_epi16(a, one);
+ *a_1 = _mm_unpacklo_epi16(a_lo, a_hi);
+ *a_2 = _mm_unpackhi_epi16(a_lo, a_hi);
+}
+
+// Store 8 16 bit values. If the destination is 32 bits then sign extend the
+// values by multiplying by 1.
+static INLINE void store_tran_low(__m128i a, tran_low_t *b) {
+ __m128i a_1, a_2;
+ unpack_trans(a, &a_1, &a_2);
+ _mm_store_si128((__m128i *)(b), a_1);
+ _mm_store_si128((__m128i *)(b + 4), a_2);
+}
+// Identical to store_tran_low(), except that the second result is stored at
+// an offset of 8 (instead of 4) so that the output layout matches that of the
+// AVX2 implementation.
+static INLINE void store_tran_low_offset_4(__m128i a, tran_low_t *b) {
+ __m128i a_1, a_2;
+ unpack_trans(a, &a_1, &a_2);
+ _mm_store_si128((__m128i *)(b), a_1);
+ _mm_store_si128((__m128i *)(b + 8), a_2);
+}
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c
new file mode 100644
index 0000000000..e0289abe12
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom/aom_integer.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+// To start out, just dispatch to the function using the 2D mask and
+// pass mask stride as 0. This can be improved upon if necessary.
+
+void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ aom_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, 0, w, h, 0, 0);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_blend_a64_hmask_sse4_1(
+ uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
+ uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h, int bd) {
+ aom_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, src0_8, src0_stride,
+ src1_8, src1_stride, mask, 0, w, h, 0, 0,
+ bd);
+}
+#endif
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c
new file mode 100644
index 0000000000..dfbab324d0
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c
@@ -0,0 +1,1374 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h> // SSE4.1
+#include <immintrin.h> // AVX2
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_dsp/x86/blend_sse4.h"
+#include "aom_dsp/x86/blend_mask_sse4.h"
+
+#include "config/aom_dsp_rtcd.h"
+
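+// Blend one row of 16 pixels that are still in the 16-bit compound (d16)
+// domain: per pixel, m0 * src0 + (64 - m0) * src1 is computed with madd on the
+// interleaved inputs, the round offset is subtracted, and the result is
+// shifted and packed back to 8 bits with unsigned saturation.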
+static INLINE void blend_a64_d16_mask_w16_avx2(
+ uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+ const __m256i *m0, const __m256i *v_round_offset, const __m256i *v_maxval,
+ int shift) {
+ const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
+ const __m256i s0_0 = yy_loadu_256(src0);
+ const __m256i s1_0 = yy_loadu_256(src1);
+ __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
+ _mm256_unpacklo_epi16(*m0, max_minus_m0));
+ __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
+ _mm256_unpackhi_epi16(*m0, max_minus_m0));
+ res0_lo =
+ _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
+ res0_hi =
+ _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
+ const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
+ __m256i res = _mm256_packus_epi16(res0, res0);
+ res = _mm256_permute4x64_epi64(res, 0xd8);
+ _mm_storeu_si128((__m128i *)(dst), _mm256_castsi256_si128(res));
+}
+
+static INLINE void blend_a64_d16_mask_w32_avx2(
+ uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+ const __m256i *m0, const __m256i *m1, const __m256i *v_round_offset,
+ const __m256i *v_maxval, int shift) {
+ const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
+ const __m256i max_minus_m1 = _mm256_sub_epi16(*v_maxval, *m1);
+ const __m256i s0_0 = yy_loadu_256(src0);
+ const __m256i s0_1 = yy_loadu_256(src0 + 16);
+ const __m256i s1_0 = yy_loadu_256(src1);
+ const __m256i s1_1 = yy_loadu_256(src1 + 16);
+ __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
+ _mm256_unpacklo_epi16(*m0, max_minus_m0));
+ __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
+ _mm256_unpackhi_epi16(*m0, max_minus_m0));
+ __m256i res1_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_1, s1_1),
+ _mm256_unpacklo_epi16(*m1, max_minus_m1));
+ __m256i res1_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_1, s1_1),
+ _mm256_unpackhi_epi16(*m1, max_minus_m1));
+ res0_lo =
+ _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
+ res0_hi =
+ _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
+ res1_lo =
+ _mm256_srai_epi32(_mm256_sub_epi32(res1_lo, *v_round_offset), shift);
+ res1_hi =
+ _mm256_srai_epi32(_mm256_sub_epi32(res1_hi, *v_round_offset), shift);
+ const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
+ const __m256i res1 = _mm256_packs_epi32(res1_lo, res1_hi);
+ __m256i res = _mm256_packus_epi16(res0, res1);
+ res = _mm256_permute4x64_epi64(res, 0xd8);
+ _mm256_storeu_si256((__m256i *)(dst), res);
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ for (int i = 0; i < h; ++i) {
+ const __m128i m = xx_loadu_128(mask);
+ const __m256i m0 = _mm256_cvtepu8_epi16(m);
+
+ blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 32) {
+ const __m256i m = yy_loadu_256(mask + j);
+ const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m));
+ const __m256i m1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m, 1));
+
+ blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i one_b = _mm256_set1_epi8(1);
+ const __m256i two_w = _mm256_set1_epi16(2);
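+ // subw == 1 && subh == 1: the mask is stored at twice the resolution in both
+ // directions, so add the two mask rows bytewise (saturating), sum adjacent
+ // columns with maddubs, and round each 2x2 total with (x + 2) >> 2.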
+ for (int i = 0; i < h; ++i) {
+ const __m256i m_i00 = yy_loadu_256(mask);
+ const __m256i m_i10 = yy_loadu_256(mask + mask_stride);
+
+ const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10);
+ const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b);
+ const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2);
+
+ blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i one_b = _mm256_set1_epi8(1);
+ const __m256i two_w = _mm256_set1_epi16(2);
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 32) {
+ const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
+ const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32);
+ const __m256i m_i10 = yy_loadu_256(mask + mask_stride + 2 * j);
+ const __m256i m_i11 = yy_loadu_256(mask + mask_stride + 2 * j + 32);
+
+ const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10);
+ const __m256i m1_ac = _mm256_adds_epu8(m_i01, m_i11);
+ const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b);
+ const __m256i m1_acbd = _mm256_maddubs_epi16(m1_ac, one_b);
+ const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2);
+ const __m256i m1 = _mm256_srli_epi16(_mm256_add_epi16(m1_acbd, two_w), 2);
+
+ blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i one_b = _mm256_set1_epi8(1);
+ const __m256i zeros = _mm256_setzero_si256();
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
+ const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b);
+ const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros);
+
+ blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i one_b = _mm256_set1_epi8(1);
+ const __m256i zeros = _mm256_setzero_si256();
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 32) {
+ const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
+ const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32);
+ const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b);
+ const __m256i m1_ac = _mm256_maddubs_epi16(m_i01, one_b);
+ const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros);
+ const __m256i m1 = _mm256_avg_epu16(m1_ac, zeros);
+
+ blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i zeros = _mm_setzero_si128();
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const __m128i m_i00 = xx_loadu_128(mask + j);
+ const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j);
+
+ const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros);
+ const __m256i m0 = _mm256_cvtepu8_epi16(m_ac);
+
+ blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i zeros = _mm256_setzero_si256();
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 32) {
+ const __m256i m_i00 = yy_loadu_256(mask + j);
+ const __m256i m_i10 = yy_loadu_256(mask + mask_stride + j);
+
+ const __m256i m_ac =
+ _mm256_avg_epu8(_mm256_adds_epu8(m_i00, m_i10), zeros);
+ const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m_ac));
+ const __m256i m1 =
+ _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m_ac, 1));
+
+ blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+void aom_lowbd_blend_a64_d16_mask_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+
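+ // The 16-bit compound (d16) inputs carry a DC offset added during the
+ // convolution stage; round_offset (pre-scaled by AOM_BLEND_A64_ROUND_BITS)
+ // removes that offset and folds in the rounding term for the final right
+ // shift performed in the blend kernels.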
+ const int round_offset =
+ ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
+ (1 << (round_bits - 1)))
+ << AOM_BLEND_A64_ROUND_BITS;
+
+ const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
+ assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 4);
+ assert(w >= 4);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+ const __m128i v_round_offset = _mm_set1_epi32(round_offset);
+ const __m256i y_round_offset = _mm256_set1_epi32(round_offset);
+
+ if (subw == 0 && subh == 0) {
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 16:
+ lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &y_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &y_round_offset, shift);
+ break;
+ }
+ } else if (subw == 1 && subh == 1) {
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 16:
+ lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &y_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &y_round_offset, shift);
+ break;
+ }
+ } else if (subw == 1 && subh == 0) {
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 16:
+ lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &y_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &y_round_offset, shift);
+ break;
+ }
+ } else {
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 16:
+ lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &y_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &y_round_offset, shift);
+ break;
+ }
+ }
+}
+
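+// 8-bit blend of 16 pixels: interleaving source and mask bytes lets
+// _mm256_maddubs_epi16 compute m0 * s0 + m1 * s1 per pixel in 16 bits, which
+// is then rounded, shifted right by 'bits' and packed back to 8 bits.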
+static INLINE __m256i blend_16_u8_avx2(const uint8_t *src0, const uint8_t *src1,
+ const __m256i *v_m0_b,
+ const __m256i *v_m1_b,
+ const int32_t bits) {
+ const __m256i v_s0_b = _mm256_castsi128_si256(xx_loadu_128(src0));
+ const __m256i v_s1_b = _mm256_castsi128_si256(xx_loadu_128(src1));
+ const __m256i v_s0_s_b = _mm256_permute4x64_epi64(v_s0_b, 0xd8);
+ const __m256i v_s1_s_b = _mm256_permute4x64_epi64(v_s1_b, 0xd8);
+
+ const __m256i v_p0_w =
+ _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_s_b, v_s1_s_b),
+ _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
+
+ const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits);
+ const __m256i v_res_b = _mm256_packus_epi16(v_res0_w, v_res0_w);
+ const __m256i v_res = _mm256_permute4x64_epi64(v_res_b, 0xd8);
+ return v_res;
+}
+
+static INLINE __m256i blend_32_u8_avx2(const uint8_t *src0, const uint8_t *src1,
+ const __m256i *v_m0_b,
+ const __m256i *v_m1_b,
+ const int32_t bits) {
+ const __m256i v_s0_b = yy_loadu_256(src0);
+ const __m256i v_s1_b = yy_loadu_256(src1);
+
+ const __m256i v_p0_w =
+ _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_b, v_s1_b),
+ _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
+ const __m256i v_p1_w =
+ _mm256_maddubs_epi16(_mm256_unpackhi_epi8(v_s0_b, v_s1_b),
+ _mm256_unpackhi_epi8(*v_m0_b, *v_m1_b));
+
+ const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits);
+ const __m256i v_res1_w = yy_roundn_epu16(v_p1_w, bits);
+ const __m256i v_res = _mm256_packus_epi16(v_res0_w, v_res1_w);
+ return v_res;
+}
+
+static INLINE void blend_a64_mask_sx_sy_w16_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h) {
+ const __m256i v_zmask_b = _mm256_set1_epi16(0xFF);
+ const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
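+ // Downsample the 2x-resolution mask: add the two rows bytewise (mask values
+ // are at most 64, so the byte sums cannot overflow), split even and odd bytes
+ // into 16-bit lanes, add them, and round-shift the 2x2 sums by 2.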
+ do {
+ const __m256i v_ral_b = yy_loadu_256(mask);
+ const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride);
+ const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b);
+ const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b);
+ const __m256i v_rvsbl_w =
+ _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b);
+ const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w);
+
+ const __m256i v_m0_w = yy_roundn_epu16(v_rsl_w, 2);
+ const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, v_m0_w);
+ const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m256i y_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b,
+ AOM_BLEND_A64_ROUND_BITS);
+
+ xx_storeu_128(dst, _mm256_castsi256_si128(y_res_b));
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static INLINE void blend_a64_mask_sx_sy_w32n_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i v_zmask_b = _mm256_set1_epi16(0xFF);
+ do {
+ int c;
+ for (c = 0; c < w; c += 32) {
+ const __m256i v_ral_b = yy_loadu_256(mask + 2 * c);
+ const __m256i v_rah_b = yy_loadu_256(mask + 2 * c + 32);
+ const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride + 2 * c);
+ const __m256i v_rbh_b = yy_loadu_256(mask + mask_stride + 2 * c + 32);
+ const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b);
+ const __m256i v_rvsh_b = _mm256_add_epi8(v_rah_b, v_rbh_b);
+ const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b);
+ const __m256i v_rvsah_w = _mm256_and_si256(v_rvsh_b, v_zmask_b);
+ const __m256i v_rvsbl_w =
+ _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b);
+ const __m256i v_rvsbh_w =
+ _mm256_and_si256(_mm256_srli_si256(v_rvsh_b, 1), v_zmask_b);
+ const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w);
+ const __m256i v_rsh_w = _mm256_add_epi16(v_rvsah_w, v_rvsbh_w);
+
+ const __m256i v_m0l_w = yy_roundn_epu16(v_rsl_w, 2);
+ const __m256i v_m0h_w = yy_roundn_epu16(v_rsh_w, 2);
+ const __m256i v_m0_b =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(v_m0l_w, v_m0h_w), 0xd8);
+ const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m256i v_res_b = blend_32_u8_avx2(
+ src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
+
+ yy_storeu_256(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static INLINE void blend_a64_mask_sx_sy_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ switch (w) {
+ case 4:
+ do {
+ const __m128i v_ra_b = xx_loadl_64(mask);
+ const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
+ const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
+ const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
+ const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+ break;
+ case 8:
+ do {
+ const __m128i v_ra_b = xx_loadu_128(mask);
+ const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
+ const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
+ const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
+ const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+ break;
+ case 16:
+ blend_a64_mask_sx_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h);
+ break;
+ default:
+ blend_a64_mask_sx_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ break;
+ }
+}
+
+static INLINE void blend_a64_mask_sx_w16_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h) {
+ const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i v_zmask_b = _mm256_set1_epi16(0xff);
+ do {
+ const __m256i v_rl_b = yy_loadu_256(mask);
+ const __m256i v_al_b =
+ _mm256_avg_epu8(v_rl_b, _mm256_srli_si256(v_rl_b, 1));
+
+ const __m256i v_m0_w = _mm256_and_si256(v_al_b, v_zmask_b);
+ const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, _mm256_setzero_si256());
+ const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m256i v_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b,
+ AOM_BLEND_A64_ROUND_BITS);
+
+ xx_storeu_128(dst, _mm256_castsi256_si128(v_res_b));
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static INLINE void blend_a64_mask_sx_w32n_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m256i v_shuffle_b = yy_loadu_256(g_blend_a64_mask_shuffle);
+ const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ do {
+ int c;
+ for (c = 0; c < w; c += 32) {
+ const __m256i v_r0_b = yy_loadu_256(mask + 2 * c);
+ const __m256i v_r1_b = yy_loadu_256(mask + 2 * c + 32);
+ const __m256i v_r0_s_b = _mm256_shuffle_epi8(v_r0_b, v_shuffle_b);
+ const __m256i v_r1_s_b = _mm256_shuffle_epi8(v_r1_b, v_shuffle_b);
+ const __m256i v_al_b =
+ _mm256_avg_epu8(v_r0_s_b, _mm256_srli_si256(v_r0_s_b, 8));
+ const __m256i v_ah_b =
+ _mm256_avg_epu8(v_r1_s_b, _mm256_srli_si256(v_r1_s_b, 8));
+
+ const __m256i v_m0_b =
+ _mm256_permute4x64_epi64(_mm256_unpacklo_epi64(v_al_b, v_ah_b), 0xd8);
+ const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m256i v_res_b = blend_32_u8_avx2(
+ src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
+
+ yy_storeu_256(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static INLINE void blend_a64_mask_sx_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ switch (w) {
+ case 4:
+ do {
+ const __m128i v_r_b = xx_loadl_64(mask);
+ const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
+ const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+ break;
+ case 8:
+ do {
+ const __m128i v_r_b = xx_loadu_128(mask);
+ const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
+ const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+ break;
+ case 16:
+ blend_a64_mask_sx_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h);
+ break;
+ default:
+ blend_a64_mask_sx_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ break;
+ }
+}
+
+static INLINE void blend_a64_mask_sy_w16_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h) {
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ do {
+ const __m128i v_ra_b = xx_loadu_128(mask);
+ const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
+ const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+ const __m128i v_m1_b = _mm_sub_epi16(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storeu_128(dst, v_res_b);
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static INLINE void blend_a64_mask_sy_w32n_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ do {
+ int c;
+ for (c = 0; c < w; c += 32) {
+ const __m256i v_ra_b = yy_loadu_256(mask + c);
+ const __m256i v_rb_b = yy_loadu_256(mask + c + mask_stride);
+ const __m256i v_m0_b = _mm256_avg_epu8(v_ra_b, v_rb_b);
+ const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+ const __m256i v_res_b = blend_32_u8_avx2(
+ src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
+
+ yy_storeu_256(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static INLINE void blend_a64_mask_sy_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ switch (w) {
+ case 4:
+ do {
+ const __m128i v_ra_b = xx_loadl_32(mask);
+ const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
+ const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+ break;
+ case 8:
+ do {
+ const __m128i v_ra_b = xx_loadl_64(mask);
+ const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
+ const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+ break;
+ case 16:
+ blend_a64_mask_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h);
+ break;
+ default:
+ blend_a64_mask_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ }
+}
+
+static INLINE void blend_a64_mask_w32n_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ do {
+ int c;
+ for (c = 0; c < w; c += 32) {
+ const __m256i v_m0_b = yy_loadu_256(mask + c);
+ const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m256i v_res_b = blend_32_u8_avx2(
+ src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
+
+ yy_storeu_256(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static INLINE void blend_a64_mask_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ switch (w) {
+ case 4:
+ do {
+ const __m128i v_m0_b = xx_loadl_32(mask);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+ break;
+ case 8:
+ do {
+ const __m128i v_m0_b = xx_loadl_64(mask);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+ break;
+ case 16:
+ do {
+ const __m128i v_m0_b = xx_loadu_128(mask);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storeu_128(dst, v_res_b);
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+ break;
+ default:
+ blend_a64_mask_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ }
+}
+
+void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w,
+ int h, int subw, int subh) {
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
+ aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+ mask, mask_stride, w, h, subw, subh);
+ } else {
+ if (subw & subh) {
+ blend_a64_mask_sx_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ } else if (subw) {
+ blend_a64_mask_sx_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ } else if (subh) {
+ blend_a64_mask_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ } else {
+ blend_a64_mask_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+ mask, mask_stride, w, h);
+ }
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// aom_highbd_blend_a64_d16_mask_avx2()
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void highbd_blend_a64_d16_mask_w4_avx2(
+ uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0,
+ const __m256i *round_offset, int shift, const __m256i *clip_low,
+ const __m256i *clip_high, const __m256i *mask_max) {
+ // Load 4x u16 pixels from each of 4 rows from each source
+ const __m256i s0 = _mm256_set_epi64x(*(int64_t *)(src0 + 3 * src0_stride),
+ *(int64_t *)(src0 + 2 * src0_stride),
+ *(int64_t *)(src0 + 1 * src0_stride),
+ *(int64_t *)(src0 + 0 * src0_stride));
+ const __m256i s1 = _mm256_set_epi64x(*(int64_t *)(src1 + 3 * src1_stride),
+ *(int64_t *)(src1 + 2 * src1_stride),
+ *(int64_t *)(src1 + 1 * src1_stride),
+ *(int64_t *)(src1 + 0 * src1_stride));
+ // Generate the inverse mask
+ const __m256i mask1 = _mm256_sub_epi16(*mask_max, *mask0);
+
+ // Multiply each mask by the respective source
+ const __m256i mul0_highs = _mm256_mulhi_epu16(*mask0, s0);
+ const __m256i mul0_lows = _mm256_mullo_epi16(*mask0, s0);
+ const __m256i mul0h = _mm256_unpackhi_epi16(mul0_lows, mul0_highs);
+ const __m256i mul0l = _mm256_unpacklo_epi16(mul0_lows, mul0_highs);
+ // Note that the AVX2 unpacks order 64-bit words as [3 1] [2 0] to stay
+ // within lanes. The later packs apply the same reordering, which cancels it
+ // out with no need for a permute; the intermediate values being reordered
+ // makes no difference.
+
+ const __m256i mul1_highs = _mm256_mulhi_epu16(mask1, s1);
+ const __m256i mul1_lows = _mm256_mullo_epi16(mask1, s1);
+ const __m256i mul1h = _mm256_unpackhi_epi16(mul1_lows, mul1_highs);
+ const __m256i mul1l = _mm256_unpacklo_epi16(mul1_lows, mul1_highs);
+
+ const __m256i sumh = _mm256_add_epi32(mul0h, mul1h);
+ const __m256i suml = _mm256_add_epi32(mul0l, mul1l);
+
+ const __m256i roundh =
+ _mm256_srai_epi32(_mm256_sub_epi32(sumh, *round_offset), shift);
+ const __m256i roundl =
+ _mm256_srai_epi32(_mm256_sub_epi32(suml, *round_offset), shift);
+
+ const __m256i pack = _mm256_packs_epi32(roundl, roundh);
+ const __m256i clip =
+ _mm256_min_epi16(_mm256_max_epi16(pack, *clip_low), *clip_high);
+
+ // _mm256_extract_epi64 is not available on 32-bit x86 builds, so extract via
+ // the 128-bit halves instead:
+ const __m128i cliph = _mm256_extracti128_si256(clip, 1);
+ xx_storel_64(dst + 3 * dst_stride, _mm_srli_si128(cliph, 8));
+ xx_storel_64(dst + 2 * dst_stride, cliph);
+ const __m128i clipl = _mm256_castsi256_si128(clip);
+ xx_storel_64(dst + 1 * dst_stride, _mm_srli_si128(clipl, 8));
+ xx_storel_64(dst + 0 * dst_stride, clipl);
+}
+
+static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w4_avx2(
+ uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m256i *round_offset, int shift, const __m256i *clip_low,
+ const __m256i *clip_high, const __m256i *mask_max) {
+ do {
+ // Load 4x u8 pixels from each of 4 rows of the mask, pad each to u16
+ const __m128i mask08 = _mm_set_epi32(*(int32_t *)(mask + 3 * mask_stride),
+ *(int32_t *)(mask + 2 * mask_stride),
+ *(int32_t *)(mask + 1 * mask_stride),
+ *(int32_t *)(mask + 0 * mask_stride));
+ const __m256i mask0 = _mm256_cvtepu8_epi16(mask08);
+
+ highbd_blend_a64_d16_mask_w4_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, &mask0, round_offset, shift,
+ clip_low, clip_high, mask_max);
+
+ dst += dst_stride * 4;
+ src0 += src0_stride * 4;
+ src1 += src1_stride * 4;
+ mask += mask_stride * 4;
+ } while (h -= 4);
+}
+
+static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w4_avx2(
+ uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m256i *round_offset, int shift, const __m256i *clip_low,
+ const __m256i *clip_high, const __m256i *mask_max) {
+ const __m256i one_b = _mm256_set1_epi8(1);
+ const __m256i two_w = _mm256_set1_epi16(2);
+ do {
+ // Load 8 pixels from each of 8 rows of the mask, (saturating) add the rows
+ // together, then use madd to add adjacent pixels. Finally, divide each value
+ // by 4 (with rounding).
+ const __m256i m0246 =
+ _mm256_set_epi64x(*(int64_t *)(mask + 6 * mask_stride),
+ *(int64_t *)(mask + 4 * mask_stride),
+ *(int64_t *)(mask + 2 * mask_stride),
+ *(int64_t *)(mask + 0 * mask_stride));
+ const __m256i m1357 =
+ _mm256_set_epi64x(*(int64_t *)(mask + 7 * mask_stride),
+ *(int64_t *)(mask + 5 * mask_stride),
+ *(int64_t *)(mask + 3 * mask_stride),
+ *(int64_t *)(mask + 1 * mask_stride));
+ const __m256i addrows = _mm256_adds_epu8(m0246, m1357);
+ const __m256i adjacent = _mm256_maddubs_epi16(addrows, one_b);
+ const __m256i mask0 =
+ _mm256_srli_epi16(_mm256_add_epi16(adjacent, two_w), 2);
+
+ highbd_blend_a64_d16_mask_w4_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, &mask0, round_offset, shift,
+ clip_low, clip_high, mask_max);
+
+ dst += dst_stride * 4;
+ src0 += src0_stride * 4;
+ src1 += src1_stride * 4;
+ mask += mask_stride * 8;
+ } while (h -= 4);
+}
+
+static INLINE void highbd_blend_a64_d16_mask_w8_avx2(
+ uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0a,
+ const __m256i *mask0b, const __m256i *round_offset, int shift,
+ const __m256i *clip_low, const __m256i *clip_high,
+ const __m256i *mask_max) {
+ // Load 8x u16 pixels from each of 4 rows from each source
+ const __m256i s0a =
+ yy_loadu2_128(src0 + 0 * src0_stride, src0 + 1 * src0_stride);
+ const __m256i s0b =
+ yy_loadu2_128(src0 + 2 * src0_stride, src0 + 3 * src0_stride);
+ const __m256i s1a =
+ yy_loadu2_128(src1 + 0 * src1_stride, src1 + 1 * src1_stride);
+ const __m256i s1b =
+ yy_loadu2_128(src1 + 2 * src1_stride, src1 + 3 * src1_stride);
+
+ // Generate inverse masks
+ const __m256i mask1a = _mm256_sub_epi16(*mask_max, *mask0a);
+ const __m256i mask1b = _mm256_sub_epi16(*mask_max, *mask0b);
+
+ // Multiply sources by respective masks
+ const __m256i mul0a_highs = _mm256_mulhi_epu16(*mask0a, s0a);
+ const __m256i mul0a_lows = _mm256_mullo_epi16(*mask0a, s0a);
+ const __m256i mul0ah = _mm256_unpackhi_epi16(mul0a_lows, mul0a_highs);
+ const __m256i mul0al = _mm256_unpacklo_epi16(mul0a_lows, mul0a_highs);
+ // Note that the AVX2 unpacks order 64-bit words as [3 1] [2 0] to stay
+ // within lanes. The later packs apply the same reordering, which cancels it
+ // out with no need for a permute; the intermediate values being reordered
+ // makes no difference.
+
+ const __m256i mul1a_highs = _mm256_mulhi_epu16(mask1a, s1a);
+ const __m256i mul1a_lows = _mm256_mullo_epi16(mask1a, s1a);
+ const __m256i mul1ah = _mm256_unpackhi_epi16(mul1a_lows, mul1a_highs);
+ const __m256i mul1al = _mm256_unpacklo_epi16(mul1a_lows, mul1a_highs);
+
+ const __m256i sumah = _mm256_add_epi32(mul0ah, mul1ah);
+ const __m256i sumal = _mm256_add_epi32(mul0al, mul1al);
+
+ const __m256i mul0b_highs = _mm256_mulhi_epu16(*mask0b, s0b);
+ const __m256i mul0b_lows = _mm256_mullo_epi16(*mask0b, s0b);
+ const __m256i mul0bh = _mm256_unpackhi_epi16(mul0b_lows, mul0b_highs);
+ const __m256i mul0bl = _mm256_unpacklo_epi16(mul0b_lows, mul0b_highs);
+
+ const __m256i mul1b_highs = _mm256_mulhi_epu16(mask1b, s1b);
+ const __m256i mul1b_lows = _mm256_mullo_epi16(mask1b, s1b);
+ const __m256i mul1bh = _mm256_unpackhi_epi16(mul1b_lows, mul1b_highs);
+ const __m256i mul1bl = _mm256_unpacklo_epi16(mul1b_lows, mul1b_highs);
+
+ const __m256i sumbh = _mm256_add_epi32(mul0bh, mul1bh);
+ const __m256i sumbl = _mm256_add_epi32(mul0bl, mul1bl);
+
+ // Divide down each result, with rounding
+ const __m256i roundah =
+ _mm256_srai_epi32(_mm256_sub_epi32(sumah, *round_offset), shift);
+ const __m256i roundal =
+ _mm256_srai_epi32(_mm256_sub_epi32(sumal, *round_offset), shift);
+ const __m256i roundbh =
+ _mm256_srai_epi32(_mm256_sub_epi32(sumbh, *round_offset), shift);
+ const __m256i roundbl =
+ _mm256_srai_epi32(_mm256_sub_epi32(sumbl, *round_offset), shift);
+
+ // Pack each i32 down to an i16 with saturation, then clip to valid range
+ const __m256i packa = _mm256_packs_epi32(roundal, roundah);
+ const __m256i clipa =
+ _mm256_min_epi16(_mm256_max_epi16(packa, *clip_low), *clip_high);
+ const __m256i packb = _mm256_packs_epi32(roundbl, roundbh);
+ const __m256i clipb =
+ _mm256_min_epi16(_mm256_max_epi16(packb, *clip_low), *clip_high);
+
+ // Store 8x u16 pixels to each of 4 rows in the destination
+ yy_storeu2_128(dst + 0 * dst_stride, dst + 1 * dst_stride, clipa);
+ yy_storeu2_128(dst + 2 * dst_stride, dst + 3 * dst_stride, clipb);
+}
+
+static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w8_avx2(
+ uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask,
+ int mask_stride, int h, const __m256i *round_offset, int shift,
+ const __m256i *clip_low, const __m256i *clip_high,
+ const __m256i *mask_max) {
+ do {
+ // Load 8x u8 pixels from each of 4 rows in the mask
+ const __m128i mask0a8 =
+ _mm_set_epi64x(*(int64_t *)mask, *(int64_t *)(mask + mask_stride));
+ const __m128i mask0b8 =
+ _mm_set_epi64x(*(int64_t *)(mask + 2 * mask_stride),
+ *(int64_t *)(mask + 3 * mask_stride));
+ const __m256i mask0a = _mm256_cvtepu8_epi16(mask0a8);
+ const __m256i mask0b = _mm256_cvtepu8_epi16(mask0b8);
+
+ highbd_blend_a64_d16_mask_w8_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b,
+ round_offset, shift, clip_low, clip_high, mask_max);
+
+ dst += dst_stride * 4;
+ src0 += src0_stride * 4;
+ src1 += src1_stride * 4;
+ mask += mask_stride * 4;
+ } while (h -= 4);
+}
+
+static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w8_avx2(
+ uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask,
+ int mask_stride, int h, const __m256i *round_offset, int shift,
+ const __m256i *clip_low, const __m256i *clip_high,
+ const __m256i *mask_max) {
+ const __m256i one_b = _mm256_set1_epi8(1);
+ const __m256i two_w = _mm256_set1_epi16(2);
+ do {
+ // Load 16x u8 pixels from each of 8 rows in the mask, (saturating) add the
+ // rows together, then use madd to add adjacent pixels. Finally, divide each
+ // value by 4 (with rounding).
+ const __m256i m02 =
+ yy_loadu2_128(mask + 0 * mask_stride, mask + 2 * mask_stride);
+ const __m256i m13 =
+ yy_loadu2_128(mask + 1 * mask_stride, mask + 3 * mask_stride);
+ const __m256i m0123 =
+ _mm256_maddubs_epi16(_mm256_adds_epu8(m02, m13), one_b);
+ const __m256i mask_0a =
+ _mm256_srli_epi16(_mm256_add_epi16(m0123, two_w), 2);
+ const __m256i m46 =
+ yy_loadu2_128(mask + 4 * mask_stride, mask + 6 * mask_stride);
+ const __m256i m57 =
+ yy_loadu2_128(mask + 5 * mask_stride, mask + 7 * mask_stride);
+ const __m256i m4567 =
+ _mm256_maddubs_epi16(_mm256_adds_epu8(m46, m57), one_b);
+ const __m256i mask_0b =
+ _mm256_srli_epi16(_mm256_add_epi16(m4567, two_w), 2);
+
+ highbd_blend_a64_d16_mask_w8_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_0a,
+ &mask_0b, round_offset, shift, clip_low, clip_high, mask_max);
+
+ dst += dst_stride * 4;
+ src0 += src0_stride * 4;
+ src1 += src1_stride * 4;
+ mask += mask_stride * 8;
+ } while (h -= 4);
+}
+
+static INLINE void highbd_blend_a64_d16_mask_w16_avx2(
+ uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0a,
+ const __m256i *mask0b, const __m256i *round_offset, int shift,
+ const __m256i *clip_low, const __m256i *clip_high,
+ const __m256i *mask_max) {
+ // Load 16x pixels from each of 2 rows from each source
+ const __m256i s0a = yy_loadu_256(src0);
+ const __m256i s0b = yy_loadu_256(src0 + src0_stride);
+ const __m256i s1a = yy_loadu_256(src1);
+ const __m256i s1b = yy_loadu_256(src1 + src1_stride);
+
+ // Calculate inverse masks
+ const __m256i mask1a = _mm256_sub_epi16(*mask_max, *mask0a);
+ const __m256i mask1b = _mm256_sub_epi16(*mask_max, *mask0b);
+
+ // Multiply each source by appropriate mask
+ const __m256i mul0a_highs = _mm256_mulhi_epu16(*mask0a, s0a);
+ const __m256i mul0a_lows = _mm256_mullo_epi16(*mask0a, s0a);
+ const __m256i mul0ah = _mm256_unpackhi_epi16(mul0a_lows, mul0a_highs);
+ const __m256i mul0al = _mm256_unpacklo_epi16(mul0a_lows, mul0a_highs);
+  // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within
+  // lanes. Later, packs does the same again, which cancels this out with no
+  // need for a permute. The intermediate values being reordered makes no
+  // difference.
+
+ const __m256i mul1a_highs = _mm256_mulhi_epu16(mask1a, s1a);
+ const __m256i mul1a_lows = _mm256_mullo_epi16(mask1a, s1a);
+ const __m256i mul1ah = _mm256_unpackhi_epi16(mul1a_lows, mul1a_highs);
+ const __m256i mul1al = _mm256_unpacklo_epi16(mul1a_lows, mul1a_highs);
+
+ const __m256i mulah = _mm256_add_epi32(mul0ah, mul1ah);
+ const __m256i mulal = _mm256_add_epi32(mul0al, mul1al);
+
+ const __m256i mul0b_highs = _mm256_mulhi_epu16(*mask0b, s0b);
+ const __m256i mul0b_lows = _mm256_mullo_epi16(*mask0b, s0b);
+ const __m256i mul0bh = _mm256_unpackhi_epi16(mul0b_lows, mul0b_highs);
+ const __m256i mul0bl = _mm256_unpacklo_epi16(mul0b_lows, mul0b_highs);
+
+ const __m256i mul1b_highs = _mm256_mulhi_epu16(mask1b, s1b);
+ const __m256i mul1b_lows = _mm256_mullo_epi16(mask1b, s1b);
+ const __m256i mul1bh = _mm256_unpackhi_epi16(mul1b_lows, mul1b_highs);
+ const __m256i mul1bl = _mm256_unpacklo_epi16(mul1b_lows, mul1b_highs);
+
+ const __m256i mulbh = _mm256_add_epi32(mul0bh, mul1bh);
+ const __m256i mulbl = _mm256_add_epi32(mul0bl, mul1bl);
+
+ const __m256i resah =
+ _mm256_srai_epi32(_mm256_sub_epi32(mulah, *round_offset), shift);
+ const __m256i resal =
+ _mm256_srai_epi32(_mm256_sub_epi32(mulal, *round_offset), shift);
+ const __m256i resbh =
+ _mm256_srai_epi32(_mm256_sub_epi32(mulbh, *round_offset), shift);
+ const __m256i resbl =
+ _mm256_srai_epi32(_mm256_sub_epi32(mulbl, *round_offset), shift);
+
+ // Signed saturating pack from i32 to i16:
+ const __m256i packa = _mm256_packs_epi32(resal, resah);
+ const __m256i packb = _mm256_packs_epi32(resbl, resbh);
+
+ // Clip the values to the valid range
+ const __m256i clipa =
+ _mm256_min_epi16(_mm256_max_epi16(packa, *clip_low), *clip_high);
+ const __m256i clipb =
+ _mm256_min_epi16(_mm256_max_epi16(packb, *clip_low), *clip_high);
+
+ // Store 16 pixels
+ yy_storeu_256(dst, clipa);
+ yy_storeu_256(dst + dst_stride, clipb);
+}
+
+static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
+ uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask,
+ int mask_stride, int h, int w, const __m256i *round_offset, int shift,
+ const __m256i *clip_low, const __m256i *clip_high,
+ const __m256i *mask_max) {
+ for (int i = 0; i < h; i += 2) {
+ for (int j = 0; j < w; j += 16) {
+ // Load 16x u8 alpha-mask values from each of two rows and pad to u16
+ const __m128i masks_a8 = xx_loadu_128(mask + j);
+ const __m128i masks_b8 = xx_loadu_128(mask + mask_stride + j);
+ const __m256i mask0a = _mm256_cvtepu8_epi16(masks_a8);
+ const __m256i mask0b = _mm256_cvtepu8_epi16(masks_b8);
+
+ highbd_blend_a64_d16_mask_w16_avx2(
+ dst + j, dst_stride, src0 + j, src0_stride, src1 + j, src1_stride,
+ &mask0a, &mask0b, round_offset, shift, clip_low, clip_high, mask_max);
+ }
+ dst += dst_stride * 2;
+ src0 += src0_stride * 2;
+ src1 += src1_stride * 2;
+ mask += mask_stride * 2;
+ }
+}
+
+static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
+ uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask,
+ int mask_stride, int h, int w, const __m256i *round_offset, int shift,
+ const __m256i *clip_low, const __m256i *clip_high,
+ const __m256i *mask_max) {
+ const __m256i one_b = _mm256_set1_epi8(1);
+ const __m256i two_w = _mm256_set1_epi16(2);
+ for (int i = 0; i < h; i += 2) {
+ for (int j = 0; j < w; j += 16) {
+      // Load 32x u8 alpha-mask values from each of four rows, (saturating)
+      // add pairs of rows, then use madd to add adjacent values. Finally,
+      // divide down each result with rounding.
+ const __m256i m0 = yy_loadu_256(mask + 0 * mask_stride + 2 * j);
+ const __m256i m1 = yy_loadu_256(mask + 1 * mask_stride + 2 * j);
+ const __m256i m2 = yy_loadu_256(mask + 2 * mask_stride + 2 * j);
+ const __m256i m3 = yy_loadu_256(mask + 3 * mask_stride + 2 * j);
+
+ const __m256i m01_8 = _mm256_adds_epu8(m0, m1);
+ const __m256i m23_8 = _mm256_adds_epu8(m2, m3);
+
+ const __m256i m01 = _mm256_maddubs_epi16(m01_8, one_b);
+ const __m256i m23 = _mm256_maddubs_epi16(m23_8, one_b);
+
+ const __m256i mask0a = _mm256_srli_epi16(_mm256_add_epi16(m01, two_w), 2);
+ const __m256i mask0b = _mm256_srli_epi16(_mm256_add_epi16(m23, two_w), 2);
+
+ highbd_blend_a64_d16_mask_w16_avx2(
+ dst + j, dst_stride, src0 + j, src0_stride, src1 + j, src1_stride,
+ &mask0a, &mask0b, round_offset, shift, clip_low, clip_high, mask_max);
+ }
+ dst += dst_stride * 2;
+ src0 += src0_stride * 2;
+ src1 += src1_stride * 2;
+ mask += mask_stride * 4;
+ }
+}
+
+void aom_highbd_blend_a64_d16_mask_avx2(
+ uint8_t *dst8, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+ ConvolveParams *conv_params, const int bd) {
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
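+  // The compound convolution buffer carries an offset of
+  // (1 << (bd + round_bits)) + (1 << (bd + round_bits - 1)); round_offset
+  // removes that offset (scaled by the 64x blend factor) and folds in the
+  // rounding constant for the final right shift.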
+ const int32_t round_offset =
+ ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
+ (1 << (round_bits - 1)))
+ << AOM_BLEND_A64_ROUND_BITS;
+ const __m256i v_round_offset = _mm256_set1_epi32(round_offset);
+ const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
+
+ const __m256i clip_low = _mm256_setzero_si256();
+ const __m256i clip_high = _mm256_set1_epi16((1 << bd) - 1);
+ const __m256i mask_max = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 4);
+ assert(w >= 4);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ if (subw == 0 && subh == 0) {
+ switch (w) {
+ case 4:
+ highbd_blend_a64_d16_mask_subw0_subh0_w4_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
+ &mask_max);
+ break;
+ case 8:
+ highbd_blend_a64_d16_mask_subw0_subh0_w8_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
+ &mask_max);
+ break;
+ default: // >= 16
+ highbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high,
+ &mask_max);
+ break;
+ }
+
+ } else if (subw == 1 && subh == 1) {
+ switch (w) {
+ case 4:
+ highbd_blend_a64_d16_mask_subw1_subh1_w4_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
+ &mask_max);
+ break;
+ case 8:
+ highbd_blend_a64_d16_mask_subw1_subh1_w8_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
+ &mask_max);
+ break;
+ default: // >= 16
+ highbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high,
+ &mask_max);
+ break;
+ }
+ } else {
+    // Sub-sampling in only one axis doesn't seem to happen very often, so
+    // fall back to the vanilla C implementation instead of adding dedicated
+    // optimised code for these cases.
+ aom_highbd_blend_a64_d16_mask_c(dst8, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h, subw,
+ subh, conv_params, bd);
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
new file mode 100644
index 0000000000..58a7345ec2
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
@@ -0,0 +1,1560 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h> // SSE4.1
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/blend_sse4.h"
+#include "aom_dsp/x86/blend_mask_sse4.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+//////////////////////////////////////////////////////////////////////////////
+// No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int w, int h) {
+ (void)w;
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ do {
+ const __m128i v_m0_b = xx_loadl_32(mask);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int w, int h) {
+ (void)w;
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ do {
+ const __m128i v_m0_b = xx_loadl_64(mask);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_w16n_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_m0_b = xx_loadu_128(mask + c);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b =
+ blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static void blend_a64_mask_sx_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ do {
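+    // Deinterleave the 2x-wide mask row into the two columns of each
+    // horizontal pair, then average the halves so each pair collapses to a
+    // single (rounded) mask value.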
+ const __m128i v_r_b = xx_loadl_64(mask);
+ const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
+ const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_sx_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ do {
+ const __m128i v_r_b = xx_loadu_128(mask);
+ const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
+ const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_sx_w16n_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_r0_b = xx_loadu_128(mask + 2 * c);
+ const __m128i v_r1_b = xx_loadu_128(mask + 2 * c + 16);
+ const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r0_b, v_shuffle_b);
+ const __m128i v_r1_s_b = _mm_shuffle_epi8(v_r1_b, v_shuffle_b);
+ const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r1_s_b);
+ const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r1_s_b);
+ const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b =
+ blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static void blend_a64_mask_sy_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+
+ do {
+ const __m128i v_ra_b = xx_loadl_32(mask);
+ const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
+ const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_sy_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ do {
+ const __m128i v_ra_b = xx_loadl_64(mask);
+ const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
+ const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_sy_w16n_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ do {
+ int c;
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_ra_b = xx_loadu_128(mask + c);
+ const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride);
+ const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b =
+ blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal and Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static void blend_a64_mask_sx_sy_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ (void)w;
+
+ do {
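+    // Mask values are at most 64, so adding the two rows in 8 bits cannot
+    // overflow; widen, add the horizontal pairs and divide the 2x2 sum by 4
+    // with rounding.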
+ const __m128i v_ra_b = xx_loadl_64(mask);
+ const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
+ const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
+ const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
+ const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_sx_sy_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ (void)w;
+
+ do {
+ const __m128i v_ra_b = xx_loadu_128(mask);
+ const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
+
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
+ const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
+ const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
+ const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_sx_sy_w16n_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
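+  // 0x00FF in every 16-bit lane: ANDing with this zero-extends the
+  // even-indexed mask bytes to 16 bits.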
+ const __m128i v_zmask_b =
+ _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ do {
+ int c;
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_ral_b = xx_loadu_128(mask + 2 * c);
+ const __m128i v_rah_b = xx_loadu_128(mask + 2 * c + 16);
+ const __m128i v_rbl_b = xx_loadu_128(mask + mask_stride + 2 * c);
+ const __m128i v_rbh_b = xx_loadu_128(mask + mask_stride + 2 * c + 16);
+ const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b);
+ const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b);
+ const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b);
+ const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b);
+ const __m128i v_rvsbl_w =
+ _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1), v_zmask_b);
+ const __m128i v_rvsbh_w =
+ _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1), v_zmask_b);
+ const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w);
+ const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w);
+
+ const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2);
+ const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2);
+ const __m128i v_m0_b = _mm_packus_epi16(v_m0l_w, v_m0h_w);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b =
+ blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w,
+ int h, int subw, int subh) {
+ typedef void (*blend_fn)(
+ uint8_t * dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h);
+
+  // Dimensions are: width_index X subw X subh
+ static const blend_fn blend[3][2][2] = {
+ { // w % 16 == 0
+ { blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1 },
+ { blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1 } },
+ { // w == 4
+ { blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1 },
+ { blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1 } },
+ { // w == 8
+ { blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1 },
+ { blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1 } }
+ };
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
+ aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+ mask, mask_stride, w, h, subw, subh);
+ } else {
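+    // w is a power of two >= 4 here, so (w >> 2) & 3 maps w % 16 == 0 to
+    // index 0, w == 4 to index 1 and w == 8 to index 2.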
+ blend[(w >> 2) & 3][subw != 0][subh != 0](dst, dst_stride, src0,
+ src0_stride, src1, src1_stride,
+ mask, mask_stride, w, h);
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
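+// Each width helper below is instantiated for 10-bit and 12-bit sources via
+// a blend_unit_fn: with 10-bit pixels the mask * pixel products (at most
+// 64 * 1023) still fit in 16 bits, whereas 12-bit pixels need 32-bit
+// intermediates.
+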
+static INLINE void blend_a64_mask_bn_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ const __m128i v_m0_b = xx_loadl_32(mask);
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ xx_storel_64(dst, v_res_w);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+ blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, blend_4_b10);
+}
+
+static void blend_a64_mask_b12_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+ blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, blend_4_b12);
+}
+
+static INLINE void blend_a64_mask_bn_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h,
+ blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_m0_b = xx_loadl_64(mask + c);
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ xx_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ blend_8_b10);
+}
+
+static void blend_a64_mask_b12_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_a64_mask_bn_sx_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
+ const __m128i v_zmask_b =
+ _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ const __m128i v_r_b = xx_loadl_64(mask);
+ const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
+
+ const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ xx_storel_64(dst, v_res_w);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_sx_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+ blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b10);
+}
+
+static void blend_a64_mask_b12_sx_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+ blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b12);
+}
+
+static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h,
+ blend_unit_fn blend) {
+ const __m128i v_zmask_b =
+ _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_r_b = xx_loadu_128(mask + 2 * c);
+ const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
+
+ const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ xx_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_sx_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ blend_8_b10);
+}
+
+static void blend_a64_mask_b12_sx_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_a64_mask_bn_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ const __m128i v_ra_b = xx_loadl_32(mask);
+ const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
+ const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ xx_storel_64(dst, v_res_w);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+ blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b10);
+}
+
+static void blend_a64_mask_b12_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+ blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b12);
+}
+
+static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h,
+ blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_ra_b = xx_loadl_64(mask + c);
+ const __m128i v_rb_b = xx_loadl_64(mask + c + mask_stride);
+ const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ xx_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ blend_8_b10);
+}
+
+static void blend_a64_mask_b12_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal and Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
+ const __m128i v_zmask_b =
+ _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ const __m128i v_ra_b = xx_loadl_64(mask);
+ const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
+ const __m128i v_rvsb_w =
+ _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
+ const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ xx_storel_64(dst, v_res_w);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_sx_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+ blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b10);
+}
+
+static void blend_a64_mask_b12_sx_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+ blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b12);
+}
+
+static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h,
+ blend_unit_fn blend) {
+ const __m128i v_zmask_b =
+ _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_ra_b = xx_loadu_128(mask + 2 * c);
+ const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride);
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
+ const __m128i v_rvsb_w =
+ _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
+ const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ xx_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_sx_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ blend_8_b10);
+}
+
+static void blend_a64_mask_b12_sx_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8,
+ uint32_t src0_stride,
+ const uint8_t *src1_8,
+ uint32_t src1_stride, const uint8_t *mask,
+ uint32_t mask_stride, int w, int h,
+ int subw, int subh, int bd) {
+ typedef void (*blend_fn)(
+ uint16_t * dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h);
+
+ // Dimensions are: bd_index X width_index X subw X subh
+ static const blend_fn blend[2][2][2][2] = {
+ { // bd == 8 or 10
+ { // w % 8 == 0
+ { blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1 },
+ { blend_a64_mask_b10_sx_w8n_sse4_1,
+ blend_a64_mask_b10_sx_sy_w8n_sse4_1 } },
+ { // w == 4
+ { blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1 },
+ { blend_a64_mask_b10_sx_w4_sse4_1,
+ blend_a64_mask_b10_sx_sy_w4_sse4_1 } } },
+ { // bd == 12
+ { // w % 8 == 0
+ { blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1 },
+ { blend_a64_mask_b12_sx_w8n_sse4_1,
+ blend_a64_mask_b12_sx_sy_w8n_sse4_1 } },
+ { // w == 4
+ { blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1 },
+ { blend_a64_mask_b12_sx_w4_sse4_1,
+ blend_a64_mask_b12_sx_sy_w4_sse4_1 } } }
+ };
+
+ assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
+ assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+ if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
+ aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+ src1_stride, mask, mask_stride, w, h, subw,
+ subh, bd);
+ } else {
+ uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
+ const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
+
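+    // (w >> 2) & 1 is 1 only for w == 4; every other power-of-two width
+    // selects the w % 8 == 0 kernels.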
+ blend[bd == 12][(w >> 2) & 1][subw != 0][subh != 0](
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, w, h);
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+static INLINE void blend_a64_d16_mask_w16_sse41(
+ uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+ const __m128i *m0, const __m128i *m1, const __m128i *v_round_offset,
+ const __m128i *v_maxval, int shift) {
+ const __m128i max_minus_m0 = _mm_sub_epi16(*v_maxval, *m0);
+ const __m128i max_minus_m1 = _mm_sub_epi16(*v_maxval, *m1);
+ const __m128i s0_0 = xx_loadu_128(src0);
+ const __m128i s0_1 = xx_loadu_128(src0 + 8);
+ const __m128i s1_0 = xx_loadu_128(src1);
+ const __m128i s1_1 = xx_loadu_128(src1 + 8);
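+  // Interleave (s0, s1) with (m0, 64 - m0) so that madd yields
+  // m0 * s0 + (64 - m0) * s1 for each pixel as a 32-bit sum.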
+ __m128i res0_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_0, s1_0),
+ _mm_unpacklo_epi16(*m0, max_minus_m0));
+ __m128i res0_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_0, s1_0),
+ _mm_unpackhi_epi16(*m0, max_minus_m0));
+ __m128i res1_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_1, s1_1),
+ _mm_unpacklo_epi16(*m1, max_minus_m1));
+ __m128i res1_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_1, s1_1),
+ _mm_unpackhi_epi16(*m1, max_minus_m1));
+ res0_lo = _mm_srai_epi32(_mm_sub_epi32(res0_lo, *v_round_offset), shift);
+ res0_hi = _mm_srai_epi32(_mm_sub_epi32(res0_hi, *v_round_offset), shift);
+ res1_lo = _mm_srai_epi32(_mm_sub_epi32(res1_lo, *v_round_offset), shift);
+ res1_hi = _mm_srai_epi32(_mm_sub_epi32(res1_hi, *v_round_offset), shift);
+ const __m128i res0 = _mm_packs_epi32(res0_lo, res0_hi);
+ const __m128i res1 = _mm_packs_epi32(res1_lo, res1_hi);
+ const __m128i res = _mm_packus_epi16(res0, res1);
+
+ _mm_storeu_si128((__m128i *)(dst), res);
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const __m128i m = xx_loadu_128(mask + j);
+ const __m128i m0 = _mm_cvtepu8_epi16(m);
+ const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m, 8));
+
+ blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i two_w = _mm_set1_epi16(2);
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const __m128i m_i00 = xx_loadu_128(mask + 2 * j);
+ const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
+ const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j);
+ const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16);
+
+ const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10);
+ const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11);
+ const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b);
+ const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b);
+ const __m128i m0 = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2);
+ const __m128i m1 = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2);
+
+ blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i zeros = _mm_setzero_si128();
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const __m128i m_i00 = xx_loadu_128(mask + 2 * j);
+ const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
+ const __m128i m0_ac = _mm_maddubs_epi16(m_i00, one_b);
+ const __m128i m1_ac = _mm_maddubs_epi16(m_i01, one_b);
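+      // Averaging against zero halves each pair sum, with rounding.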
+ const __m128i m0 = _mm_avg_epu16(m0_ac, zeros);
+ const __m128i m1 = _mm_avg_epu16(m1_ac, zeros);
+
+ blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i zeros = _mm_setzero_si128();
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const __m128i m_i00 = xx_loadu_128(mask + j);
+ const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j);
+
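+      // The saturating add cannot saturate (mask values are <= 64);
+      // averaging the sum against zero halves it, with rounding.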
+ const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros);
+ const __m128i m0 = _mm_cvtepu8_epi16(m_ac);
+ const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m_ac, 8));
+
+ blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+void aom_lowbd_blend_a64_d16_mask_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+
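+  // round_offset removes the compound-prediction offset carried in the
+  // CONV_BUF_TYPE inputs (scaled by the 64x blend factor) and folds in the
+  // rounding constant for the final shift.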
+ const int round_offset =
+ ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
+ (1 << (round_bits - 1)))
+ << AOM_BLEND_A64_ROUND_BITS;
+
+ const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
+ assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 4);
+ assert(w >= 4);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ const __m128i v_round_offset = _mm_set1_epi32(round_offset);
+
+ if (subw == 0 && subh == 0) {
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift);
+ break;
+ }
+
+ } else if (subw == 1 && subh == 1) {
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift);
+ break;
+ }
+ } else if (subw == 1 && subh == 0) {
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift);
+ break;
+ }
+ } else {
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift);
+ break;
+ }
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// aom_highbd_blend_a64_d16_mask_sse4_1()
+//////////////////////////////////////////////////////////////////////////////
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void highbd_blend_a64_d16_mask_w4_sse4_1(
+ uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride, const __m128i *mask0a,
+ const __m128i *mask0b, const __m128i *round_offset, int shift,
+ const __m128i *clip_low, const __m128i *clip_high,
+ const __m128i *mask_max) {
+ // Load 4 pixels from each of 4 rows from each source
+ const __m128i s0a =
+ _mm_set_epi64x(*(int64_t *)src0, *(int64_t *)(src0 + src0_stride));
+ const __m128i s0b = _mm_set_epi64x(*(int64_t *)(src0 + 2 * src0_stride),
+ *(int64_t *)(src0 + 3 * src0_stride));
+ const __m128i s1a =
+ _mm_set_epi64x(*(int64_t *)(src1), *(int64_t *)(src1 + src1_stride));
+ const __m128i s1b = _mm_set_epi64x(*(int64_t *)(src1 + 2 * src1_stride),
+ *(int64_t *)(src1 + 3 * src1_stride));
+
+ // Generate the inverse masks
+ const __m128i mask1a = _mm_sub_epi16(*mask_max, *mask0a);
+ const __m128i mask1b = _mm_sub_epi16(*mask_max, *mask0b);
+
+ // Multiply each mask by the respective source
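+  // (the mullo/mulhi halves are interleaved below to form the full 32-bit
+  // products)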
+ const __m128i mul0a_highs = _mm_mulhi_epu16(*mask0a, s0a);
+ const __m128i mul0a_lows = _mm_mullo_epi16(*mask0a, s0a);
+ const __m128i mul0ah = _mm_unpackhi_epi16(mul0a_lows, mul0a_highs);
+ const __m128i mul0al = _mm_unpacklo_epi16(mul0a_lows, mul0a_highs);
+ const __m128i mul1a_highs = _mm_mulhi_epu16(mask1a, s1a);
+ const __m128i mul1a_lows = _mm_mullo_epi16(mask1a, s1a);
+ const __m128i mul1ah = _mm_unpackhi_epi16(mul1a_lows, mul1a_highs);
+ const __m128i mul1al = _mm_unpacklo_epi16(mul1a_lows, mul1a_highs);
+
+ const __m128i mul0b_highs = _mm_mulhi_epu16(*mask0b, s0b);
+ const __m128i mul0b_lows = _mm_mullo_epi16(*mask0b, s0b);
+ const __m128i mul0bh = _mm_unpackhi_epi16(mul0b_lows, mul0b_highs);
+ const __m128i mul0bl = _mm_unpacklo_epi16(mul0b_lows, mul0b_highs);
+ const __m128i mul1b_highs = _mm_mulhi_epu16(mask1b, s1b);
+ const __m128i mul1b_lows = _mm_mullo_epi16(mask1b, s1b);
+ const __m128i mul1bh = _mm_unpackhi_epi16(mul1b_lows, mul1b_highs);
+ const __m128i mul1bl = _mm_unpacklo_epi16(mul1b_lows, mul1b_highs);
+
+ const __m128i sumah = _mm_add_epi32(mul0ah, mul1ah);
+ const __m128i sumal = _mm_add_epi32(mul0al, mul1al);
+ const __m128i sumbh = _mm_add_epi32(mul0bh, mul1bh);
+ const __m128i sumbl = _mm_add_epi32(mul0bl, mul1bl);
+
+ const __m128i roundah =
+ _mm_srai_epi32(_mm_sub_epi32(sumah, *round_offset), shift);
+ const __m128i roundbh =
+ _mm_srai_epi32(_mm_sub_epi32(sumbh, *round_offset), shift);
+ const __m128i roundal =
+ _mm_srai_epi32(_mm_sub_epi32(sumal, *round_offset), shift);
+ const __m128i roundbl =
+ _mm_srai_epi32(_mm_sub_epi32(sumbl, *round_offset), shift);
+
+ const __m128i packa = _mm_packs_epi32(roundal, roundah);
+ const __m128i packb = _mm_packs_epi32(roundbl, roundbh);
+
+ const __m128i clipa =
+ _mm_min_epi16(_mm_max_epi16(packa, *clip_low), *clip_high);
+ const __m128i clipb =
+ _mm_min_epi16(_mm_max_epi16(packb, *clip_low), *clip_high);
+
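+  // The loads above placed row 0 in the upper half of each register, so the
+  // upper 64 bits go to the earlier destination rows.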
+ xx_storel_64(dst, _mm_srli_si128(clipa, 8));
+ xx_storel_64(dst + dst_stride, clipa);
+ xx_storel_64(dst + 2 * dst_stride, _mm_srli_si128(clipb, 8));
+ xx_storel_64(dst + 3 * dst_stride, clipb);
+}
+
+static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift, const __m128i *clip_low,
+ const __m128i *clip_high, const __m128i *mask_max) {
+ do {
+ const __m128i mask0a8 =
+ _mm_set_epi32(0, 0, *(int32_t *)mask, *(int32_t *)(mask + mask_stride));
+ const __m128i mask0b8 =
+ _mm_set_epi32(0, 0, *(int32_t *)(mask + 2 * mask_stride),
+ *(int32_t *)(mask + 3 * mask_stride));
+ const __m128i mask0a = _mm_cvtepu8_epi16(mask0a8);
+ const __m128i mask0b = _mm_cvtepu8_epi16(mask0b8);
+
+ highbd_blend_a64_d16_mask_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b,
+ round_offset, shift, clip_low, clip_high, mask_max);
+
+ dst += dst_stride * 4;
+ src0 += src0_stride * 4;
+ src1 += src1_stride * 4;
+ mask += mask_stride * 4;
+ } while (h -= 4);
+}
+
+static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift, const __m128i *clip_low,
+ const __m128i *clip_high, const __m128i *mask_max) {
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i two_w = _mm_set1_epi16(2);
+ do {
+    // Load 8 pixels from each of 8 rows of the mask, (saturating) add
+    // together rows, then use madd to add adjacent pixels. Finally, divide
+    // each value by 4 (with rounding).
+ const __m128i m02 = _mm_set_epi64x(*(int64_t *)(mask),
+ *(int64_t *)(mask + 2 * mask_stride));
+ const __m128i m13 = _mm_set_epi64x(*(int64_t *)(mask + mask_stride),
+ *(int64_t *)(mask + 3 * mask_stride));
+ const __m128i m0123 = _mm_maddubs_epi16(_mm_adds_epu8(m02, m13), one_b);
+ const __m128i mask_0a = _mm_srli_epi16(_mm_add_epi16(m0123, two_w), 2);
+ const __m128i m46 = _mm_set_epi64x(*(int64_t *)(mask + 4 * mask_stride),
+ *(int64_t *)(mask + 6 * mask_stride));
+ const __m128i m57 = _mm_set_epi64x(*(int64_t *)(mask + 5 * mask_stride),
+ *(int64_t *)(mask + 7 * mask_stride));
+ const __m128i m4567 = _mm_maddubs_epi16(_mm_adds_epu8(m46, m57), one_b);
+ const __m128i mask_0b = _mm_srli_epi16(_mm_add_epi16(m4567, two_w), 2);
+
+ highbd_blend_a64_d16_mask_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_0a,
+ &mask_0b, round_offset, shift, clip_low, clip_high, mask_max);
+
+ dst += dst_stride * 4;
+ src0 += src0_stride * 4;
+ src1 += src1_stride * 4;
+ mask += mask_stride * 8;
+ } while (h -= 4);
+}
+
+static INLINE void highbd_blend_a64_d16_mask_w8_sse4_1(
+ uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride, const __m128i *mask0a,
+ const __m128i *mask0b, const __m128i *round_offset, int shift,
+ const __m128i *clip_low, const __m128i *clip_high,
+ const __m128i *max_mask) {
+ // Load 8x pixels from each of 2 rows from each source
+ const __m128i s0a = xx_loadu_128(src0);
+ const __m128i s0b = xx_loadu_128(src0 + src0_stride);
+ const __m128i s1a = xx_loadu_128(src1);
+ const __m128i s1b = xx_loadu_128(src1 + src1_stride);
+
+ // Generate inverse masks
+ const __m128i mask1a = _mm_sub_epi16(*max_mask, *mask0a);
+ const __m128i mask1b = _mm_sub_epi16(*max_mask, *mask0b);
+
+ // Multiply sources by respective masks
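+  // (_mm_mulhi_epu16/_mm_mullo_epi16 give the high/low halves of each
+  // u16 * u16 product; interleaving them rebuilds the full 32-bit products.)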
+ const __m128i mul0a_highs = _mm_mulhi_epu16(*mask0a, s0a);
+ const __m128i mul0a_lows = _mm_mullo_epi16(*mask0a, s0a);
+ const __m128i mul0ah = _mm_unpackhi_epi16(mul0a_lows, mul0a_highs);
+ const __m128i mul0al = _mm_unpacklo_epi16(mul0a_lows, mul0a_highs);
+
+ const __m128i mul1a_highs = _mm_mulhi_epu16(mask1a, s1a);
+ const __m128i mul1a_lows = _mm_mullo_epi16(mask1a, s1a);
+ const __m128i mul1ah = _mm_unpackhi_epi16(mul1a_lows, mul1a_highs);
+ const __m128i mul1al = _mm_unpacklo_epi16(mul1a_lows, mul1a_highs);
+
+ const __m128i sumah = _mm_add_epi32(mul0ah, mul1ah);
+ const __m128i sumal = _mm_add_epi32(mul0al, mul1al);
+
+ const __m128i mul0b_highs = _mm_mulhi_epu16(*mask0b, s0b);
+ const __m128i mul0b_lows = _mm_mullo_epi16(*mask0b, s0b);
+ const __m128i mul0bh = _mm_unpackhi_epi16(mul0b_lows, mul0b_highs);
+ const __m128i mul0bl = _mm_unpacklo_epi16(mul0b_lows, mul0b_highs);
+
+ const __m128i mul1b_highs = _mm_mulhi_epu16(mask1b, s1b);
+ const __m128i mul1b_lows = _mm_mullo_epi16(mask1b, s1b);
+ const __m128i mul1bh = _mm_unpackhi_epi16(mul1b_lows, mul1b_highs);
+ const __m128i mul1bl = _mm_unpacklo_epi16(mul1b_lows, mul1b_highs);
+
+ const __m128i sumbh = _mm_add_epi32(mul0bh, mul1bh);
+ const __m128i sumbl = _mm_add_epi32(mul0bl, mul1bl);
+
+ const __m128i roundah =
+ _mm_srai_epi32(_mm_sub_epi32(sumah, *round_offset), shift);
+ const __m128i roundal =
+ _mm_srai_epi32(_mm_sub_epi32(sumal, *round_offset), shift);
+ const __m128i roundbh =
+ _mm_srai_epi32(_mm_sub_epi32(sumbh, *round_offset), shift);
+ const __m128i roundbl =
+ _mm_srai_epi32(_mm_sub_epi32(sumbl, *round_offset), shift);
+
+ const __m128i packa = _mm_packs_epi32(roundal, roundah);
+ const __m128i clipa =
+ _mm_min_epi16(_mm_max_epi16(packa, *clip_low), *clip_high);
+ const __m128i packb = _mm_packs_epi32(roundbl, roundbh);
+ const __m128i clipb =
+ _mm_min_epi16(_mm_max_epi16(packb, *clip_low), *clip_high);
+
+ xx_storeu_128(dst, clipa);
+ xx_storeu_128(dst + dst_stride, clipb);
+}
+
+static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift, const __m128i *clip_low,
+ const __m128i *clip_high, const __m128i *max_mask) {
+ do {
+ const __m128i mask0a = _mm_cvtepu8_epi16(xx_loadl_64(mask));
+ const __m128i mask0b = _mm_cvtepu8_epi16(xx_loadl_64(mask + mask_stride));
+ highbd_blend_a64_d16_mask_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b,
+ round_offset, shift, clip_low, clip_high, max_mask);
+
+ dst += dst_stride * 2;
+ src0 += src0_stride * 2;
+ src1 += src1_stride * 2;
+ mask += mask_stride * 2;
+ } while (h -= 2);
+}
+
+static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift, const __m128i *clip_low,
+ const __m128i *clip_high, const __m128i *max_mask) {
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i two_w = _mm_set1_epi16(2);
+ do {
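+    // 2x2 downsample the mask: saturating-add vertically adjacent rows,
+    // sum horizontally adjacent pairs with maddubs, then add 2 and shift
+    // right by 2 for a rounded average.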
+ const __m128i mask_thisrowa = xx_loadu_128(mask);
+ const __m128i mask_nextrowa = xx_loadu_128(mask + mask_stride);
+ const __m128i mask_thisrowb = xx_loadu_128(mask + 2 * mask_stride);
+ const __m128i mask_nextrowb = xx_loadu_128(mask + 3 * mask_stride);
+ const __m128i mask_bothrowsa = _mm_adds_epu8(mask_thisrowa, mask_nextrowa);
+ const __m128i mask_bothrowsb = _mm_adds_epu8(mask_thisrowb, mask_nextrowb);
+ const __m128i mask_16a = _mm_maddubs_epi16(mask_bothrowsa, one_b);
+ const __m128i mask_16b = _mm_maddubs_epi16(mask_bothrowsb, one_b);
+ const __m128i mask_sa = _mm_srli_epi16(_mm_add_epi16(mask_16a, two_w), 2);
+ const __m128i mask_sb = _mm_srli_epi16(_mm_add_epi16(mask_16b, two_w), 2);
+
+ highbd_blend_a64_d16_mask_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_sa,
+ &mask_sb, round_offset, shift, clip_low, clip_high, max_mask);
+
+ dst += dst_stride * 2;
+ src0 += src0_stride * 2;
+ src1 += src1_stride * 2;
+ mask += mask_stride * 4;
+ } while (h -= 2);
+}
+
+static INLINE void highbd_blend_a64_d16_mask_w16_sse4_1(
+ uint16_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+ const __m128i *round_offset, int shift, const __m128i *mask0l,
+ const __m128i *mask0h, const __m128i *clip_low, const __m128i *clip_high,
+ const __m128i *mask_max) {
+ // Load 16x u16 pixels for this row from each src
+ const __m128i s0l = xx_loadu_128(src0);
+ const __m128i s0h = xx_loadu_128(src0 + 8);
+ const __m128i s1l = xx_loadu_128(src1);
+ const __m128i s1h = xx_loadu_128(src1 + 8);
+
+ // Calculate inverse masks
+ const __m128i mask1h = _mm_sub_epi16(*mask_max, *mask0h);
+ const __m128i mask1l = _mm_sub_epi16(*mask_max, *mask0l);
+
+ const __m128i mul0_highs = _mm_mulhi_epu16(*mask0h, s0h);
+ const __m128i mul0_lows = _mm_mullo_epi16(*mask0h, s0h);
+ const __m128i mul0h = _mm_unpackhi_epi16(mul0_lows, mul0_highs);
+ const __m128i mul0l = _mm_unpacklo_epi16(mul0_lows, mul0_highs);
+
+ const __m128i mul1_highs = _mm_mulhi_epu16(mask1h, s1h);
+ const __m128i mul1_lows = _mm_mullo_epi16(mask1h, s1h);
+ const __m128i mul1h = _mm_unpackhi_epi16(mul1_lows, mul1_highs);
+ const __m128i mul1l = _mm_unpacklo_epi16(mul1_lows, mul1_highs);
+
+ const __m128i mulhh = _mm_add_epi32(mul0h, mul1h);
+ const __m128i mulhl = _mm_add_epi32(mul0l, mul1l);
+
+ const __m128i mul2_highs = _mm_mulhi_epu16(*mask0l, s0l);
+ const __m128i mul2_lows = _mm_mullo_epi16(*mask0l, s0l);
+ const __m128i mul2h = _mm_unpackhi_epi16(mul2_lows, mul2_highs);
+ const __m128i mul2l = _mm_unpacklo_epi16(mul2_lows, mul2_highs);
+
+ const __m128i mul3_highs = _mm_mulhi_epu16(mask1l, s1l);
+ const __m128i mul3_lows = _mm_mullo_epi16(mask1l, s1l);
+ const __m128i mul3h = _mm_unpackhi_epi16(mul3_lows, mul3_highs);
+ const __m128i mul3l = _mm_unpacklo_epi16(mul3_lows, mul3_highs);
+
+ const __m128i mullh = _mm_add_epi32(mul2h, mul3h);
+ const __m128i mulll = _mm_add_epi32(mul2l, mul3l);
+
+ const __m128i reshh =
+ _mm_srai_epi32(_mm_sub_epi32(mulhh, *round_offset), shift);
+ const __m128i reshl =
+ _mm_srai_epi32(_mm_sub_epi32(mulhl, *round_offset), shift);
+ const __m128i reslh =
+ _mm_srai_epi32(_mm_sub_epi32(mullh, *round_offset), shift);
+ const __m128i resll =
+ _mm_srai_epi32(_mm_sub_epi32(mulll, *round_offset), shift);
+
+ // Signed saturating pack from i32 to i16:
+ const __m128i packh = _mm_packs_epi32(reshl, reshh);
+ const __m128i packl = _mm_packs_epi32(resll, reslh);
+
+ // Clip the values to the valid range
+ const __m128i cliph =
+ _mm_min_epi16(_mm_max_epi16(packh, *clip_low), *clip_high);
+ const __m128i clipl =
+ _mm_min_epi16(_mm_max_epi16(packl, *clip_low), *clip_high);
+
+ // Store 16 pixels
+ xx_storeu_128(dst, clipl);
+ xx_storeu_128(dst + 8, cliph);
+}
+
+static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m128i *round_offset, int shift, const __m128i *clip_low,
+ const __m128i *clip_high, const __m128i *mask_max) {
+ for (int i = 0; i < h; i++) {
+ for (int j = 0; j < w; j += 16) {
+ // Load 16x u8 alpha-mask values and pad to u16
+ const __m128i masks_u8 = xx_loadu_128(mask + j);
+ const __m128i mask0l = _mm_cvtepu8_epi16(masks_u8);
+ const __m128i mask0h = _mm_cvtepu8_epi16(_mm_srli_si128(masks_u8, 8));
+
+ highbd_blend_a64_d16_mask_w16_sse4_1(
+ dst + j, src0 + j, src1 + j, round_offset, shift, &mask0l, &mask0h,
+ clip_low, clip_high, mask_max);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ }
+}
+
+static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m128i *round_offset, int shift, const __m128i *clip_low,
+ const __m128i *clip_high, const __m128i *mask_max) {
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i two_w = _mm_set1_epi16(2);
+ for (int i = 0; i < h; i++) {
+ for (int j = 0; j < w; j += 16) {
+ const __m128i m_i00 = xx_loadu_128(mask + 2 * j);
+ const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
+ const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j);
+ const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16);
+
+ const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10);
+ const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11);
+ const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b);
+ const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b);
+ const __m128i mask_l = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2);
+ const __m128i mask_h = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2);
+
+ highbd_blend_a64_d16_mask_w16_sse4_1(
+ dst + j, src0 + j, src1 + j, round_offset, shift, &mask_l, &mask_h,
+ clip_low, clip_high, mask_max);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride * 2;
+ }
+}
+
+void aom_highbd_blend_a64_d16_mask_sse4_1(
+ uint8_t *dst8, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+ ConvolveParams *conv_params, const int bd) {
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
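+  // The d16 (CONV_BUF_TYPE) inputs carry the compound-prediction offset.
+  // Per pixel the kernels compute
+  //   clip((m * src0 + (64 - m) * src1 - round_offset) >> shift),
+  // where round_offset (pre-scaled by << AOM_BLEND_A64_ROUND_BITS) cancels
+  // that offset and its negative low-order term adds the rounding constant
+  // for the combined shift.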
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int32_t round_offset =
+ ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
+ (1 << (round_bits - 1)))
+ << AOM_BLEND_A64_ROUND_BITS;
+ const __m128i v_round_offset = _mm_set1_epi32(round_offset);
+ const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
+
+ const __m128i clip_low = _mm_setzero_si128();
+ const __m128i clip_high = _mm_set1_epi16((1 << bd) - 1);
+ const __m128i mask_max = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 4);
+ assert(w >= 4);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ if (subw == 0 && subh == 0) {
+ switch (w) {
+ case 4:
+ highbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
+ &mask_max);
+ break;
+ case 8:
+ highbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
+ &mask_max);
+ break;
+ default: // >=16
+ highbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high,
+ &mask_max);
+ break;
+ }
+
+ } else if (subw == 1 && subh == 1) {
+ switch (w) {
+ case 4:
+ highbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
+ &mask_max);
+ break;
+ case 8:
+ highbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
+ &mask_max);
+ break;
+ default: // >=16
+ highbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high,
+ &mask_max);
+ break;
+ }
+ } else {
+    // Sub-sampling in only one axis is rare in practice, so fall back to the
+    // plain C implementation rather than carrying optimised code for those
+    // cases.
+ aom_highbd_blend_a64_d16_mask_c(dst8, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h, subw,
+ subh, conv_params, bd);
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
new file mode 100644
index 0000000000..75fb1c5a94
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h> // SSE4.1
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/blend_sse4.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+//////////////////////////////////////////////////////////////////////////////
+// Implementation - No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ (void)w;
+
+ do {
+ const __m128i v_m0_w = _mm_set1_epi16(*mask);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_4(src0, src1, &v_m0_w, &v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 1;
+ } while (--h);
+}
+
+static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ (void)w;
+
+ do {
+ const __m128i v_m0_w = _mm_set1_epi16(*mask);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_8(src0, src1, &v_m0_w, &v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 1;
+ } while (--h);
+}
+
+static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0,
+ uint32_t src0_stride,
+ const uint8_t *src1,
+ uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ const __m128i v_m0_w = _mm_set1_epi16(*mask);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_resl_w = blend_8(src0 + c, src1 + c, &v_m0_w, &v_m1_w);
+ const __m128i v_resh_w =
+ blend_8(src0 + c + 8, src1 + c + 8, &v_m0_w, &v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+
+ xx_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 1;
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ typedef void (*blend_fn)(uint8_t * dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h);
+
+ // Dimension: width_index
+ static const blend_fn blend[9] = {
+ blend_a64_vmask_w16n_sse4_1, // w % 16 == 0
+ aom_blend_a64_vmask_c, // w == 1
+ aom_blend_a64_vmask_c, // w == 2
+ NULL, // INVALID
+ blend_a64_vmask_w4_sse4_1, // w == 4
+ NULL, // INVALID
+ NULL, // INVALID
+ NULL, // INVALID
+ blend_a64_vmask_w8_sse4_1, // w == 8
+ };
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
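+  // w is a power of two, so (w & 0xf) is zero for every width >= 16 (w16n
+  // kernel) and otherwise indexes the entry for w itself.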
+ blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, w,
+ h);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// Implementation - No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_a64_vmask_bn_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ const __m128i v_m0_w = _mm_set1_epi16(*mask);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ xx_storel_64(dst, v_res_w);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 1;
+ } while (--h);
+}
+
+static void blend_a64_vmask_b10_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
+ const uint16_t *src0,
+ uint32_t src0_stride,
+ const uint16_t *src1,
+ uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ (void)w;
+ blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, h, blend_4_b10);
+}
+
+static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
+ const uint16_t *src0,
+ uint32_t src0_stride,
+ const uint16_t *src1,
+ uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ (void)w;
+ blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, h, blend_4_b12);
+}
+
+static INLINE void blend_a64_vmask_bn_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h, blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ const __m128i v_m0_w = _mm_set1_epi16(*mask);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ xx_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 1;
+ } while (--h);
+}
+
+static void blend_a64_vmask_b10_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
+ const uint16_t *src0,
+ uint32_t src0_stride,
+ const uint16_t *src1,
+ uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, w, h, blend_8_b10);
+}
+
+static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
+ const uint16_t *src0,
+ uint32_t src0_stride,
+ const uint16_t *src1,
+ uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, w, h, blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+void aom_highbd_blend_a64_vmask_sse4_1(
+ uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
+ uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h, int bd) {
+ typedef void (*blend_fn)(uint16_t * dst, uint32_t dst_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h);
+
+ // Dimensions are: bd_index X width_index
+ static const blend_fn blend[2][2] = {
+ {
+ // bd == 8 or 10
+ blend_a64_vmask_b10_w8n_sse4_1, // w % 8 == 0
+ blend_a64_vmask_b10_w4_sse4_1, // w == 4
+ },
+ {
+ // bd == 12
+ blend_a64_vmask_b12_w8n_sse4_1, // w % 8 == 0
+ blend_a64_vmask_b12_w4_sse4_1, // w == 4
+ }
+ };
+
+ assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
+ assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
+ aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+ src1_stride, mask, w, h, bd);
+ } else {
+ uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
+ const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
+
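+    // (w >> 2) & 1 is 1 only for w == 4; every larger power-of-two width is
+    // a multiple of 8 and uses the w8n kernels. Bit depths 8 and 10 share a
+    // row because mask * pixel (<= 64 * 1023) still fits in 16 bits; 12-bit
+    // input needs the 32-bit madd path.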
+ blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, w, h);
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/blend_mask_sse4.h b/third_party/aom/aom_dsp/x86/blend_mask_sse4.h
new file mode 100644
index 0000000000..c071fdcfc4
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_mask_sse4.h
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_
+#define AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_
+#include <smmintrin.h> // SSE4.1
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void blend_a64_d16_mask_w4_sse41(
+ uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+ const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval,
+ int shift) {
+ const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);
+ const __m128i s0 = xx_loadl_64(src0);
+ const __m128i s1 = xx_loadl_64(src1);
+ const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1);
+ const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m);
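+  // madd on the interleaved pairs gives m * src0 + (64 - m) * src1 per
+  // pixel as a 32-bit sum.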
+ const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m);
+ const __m128i res_c = _mm_sub_epi32(res_a, *v_round_offset);
+ const __m128i res_d = _mm_srai_epi32(res_c, shift);
+ const __m128i res_e = _mm_packs_epi32(res_d, res_d);
+ const __m128i res = _mm_packus_epi16(res_e, res_e);
+
+ xx_storel_32(dst, res);
+}
+
+static INLINE void blend_a64_d16_mask_w8_sse41(
+ uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+ const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval,
+ int shift) {
+ const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);
+ const __m128i s0 = xx_loadu_128(src0);
+ const __m128i s1 = xx_loadu_128(src1);
+ __m128i res_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0, s1),
+ _mm_unpacklo_epi16(*m, max_minus_m));
+ __m128i res_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0, s1),
+ _mm_unpackhi_epi16(*m, max_minus_m));
+ res_lo = _mm_srai_epi32(_mm_sub_epi32(res_lo, *v_round_offset), shift);
+ res_hi = _mm_srai_epi32(_mm_sub_epi32(res_hi, *v_round_offset), shift);
+ const __m128i res_e = _mm_packs_epi32(res_lo, res_hi);
+ const __m128i res = _mm_packus_epi16(res_e, res_e);
+
+ _mm_storel_epi64((__m128i *)(dst), res);
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ for (int i = 0; i < h; ++i) {
+ const __m128i m0 = xx_loadl_32(mask);
+ const __m128i m = _mm_cvtepu8_epi16(m0);
+
+ blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ for (int i = 0; i < h; ++i) {
+ const __m128i m0 = xx_loadl_64(mask);
+ const __m128i m = _mm_cvtepu8_epi16(m0);
+ blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i two_w = _mm_set1_epi16(2);
+ for (int i = 0; i < h; ++i) {
+ const __m128i m_i0 = xx_loadl_64(mask);
+ const __m128i m_i1 = xx_loadl_64(mask + mask_stride);
+ const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
+ const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b);
+ const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);
+ const __m128i m = _mm_srli_epi16(m_acbd_2, 2);
+
+ blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i two_w = _mm_set1_epi16(2);
+ for (int i = 0; i < h; ++i) {
+ const __m128i m_i0 = xx_loadu_128(mask);
+ const __m128i m_i1 = xx_loadu_128(mask + mask_stride);
+ const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
+ const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b);
+ const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);
+ const __m128i m = _mm_srli_epi16(m_acbd_2, 2);
+
+ blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i zeros = _mm_setzero_si128();
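+  // subw == 1, subh == 0: the mask is only twice as wide, so maddubs with
+  // ones sums horizontally adjacent mask pairs and the average against zero
+  // halves the result with rounding.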
+ for (int i = 0; i < h; ++i) {
+ const __m128i m_i0 = xx_loadl_64(mask);
+ const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
+ const __m128i m = _mm_avg_epu16(m_ac, zeros);
+
+ blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i zeros = _mm_setzero_si128();
+ for (int i = 0; i < h; ++i) {
+ const __m128i m_i0 = xx_loadu_128(mask);
+ const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
+ const __m128i m = _mm_avg_epu16(m_ac, zeros);
+
+ blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i zeros = _mm_setzero_si128();
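+  // subw == 0, subh == 1: the mask is only twice as tall, so vertically
+  // adjacent rows are (saturating) added, halved with rounding via avg_epu8,
+  // then widened to 16 bits.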
+ for (int i = 0; i < h; ++i) {
+ const __m128i m_i0 = xx_loadl_64(mask);
+ const __m128i m_i1 = xx_loadl_64(mask + mask_stride);
+ const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
+ const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros));
+
+ blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i zeros = _mm_setzero_si128();
+ for (int i = 0; i < h; ++i) {
+ const __m128i m_i0 = xx_loadl_64(mask);
+ const __m128i m_i1 = xx_loadl_64(mask + mask_stride);
+ const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
+ const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros));
+
+ blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+#endif // AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_
diff --git a/third_party/aom/aom_dsp/x86/blend_sse4.h b/third_party/aom/aom_dsp/x86/blend_sse4.h
new file mode 100644
index 0000000000..8d9b325101
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_sse4.h
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_BLEND_SSE4_H_
+#define AOM_AOM_DSP_X86_BLEND_SSE4_H_
+
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/synonyms.h"
+static const uint8_t g_blend_a64_mask_shuffle[32] = {
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+};
+
+//////////////////////////////////////////////////////////////////////////////
+// Common kernels
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1,
+ const __m128i *v_m0_w, const __m128i *v_m1_w) {
+ const __m128i v_s0_b = xx_loadl_32(src0);
+ const __m128i v_s1_b = xx_loadl_32(src1);
+ const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
+ const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
+
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
+ const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+ const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
+
+ return v_res_w;
+}
+
+static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
+ const __m128i *v_m0_w, const __m128i *v_m1_w) {
+ const __m128i v_s0_b = xx_loadl_64(src0);
+ const __m128i v_s1_b = xx_loadl_64(src1);
+ const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
+ const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
+
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
+
+ const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+ const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
+
+ return v_res_w;
+}
+
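+// In the *_u8 kernels below, maddubs on interleaved source and mask bytes
+// forms m * s0 + (64 - m) * s1 directly in 16 bits; mulhrs with the caller's
+// *rounding constant (assumed to be 1 << (15 - AOM_BLEND_A64_ROUND_BITS))
+// then performs the final rounded shift.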
+static INLINE __m128i blend_4_u8(const uint8_t *src0, const uint8_t *src1,
+ const __m128i *v_m0_b, const __m128i *v_m1_b,
+ const __m128i *rounding) {
+ const __m128i v_s0_b = xx_loadl_32(src0);
+ const __m128i v_s1_b = xx_loadl_32(src1);
+
+ const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
+ _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
+
+ const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
+ const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
+ return v_res;
+}
+
+static INLINE __m128i blend_8_u8(const uint8_t *src0, const uint8_t *src1,
+ const __m128i *v_m0_b, const __m128i *v_m1_b,
+ const __m128i *rounding) {
+ const __m128i v_s0_b = xx_loadl_64(src0);
+ const __m128i v_s1_b = xx_loadl_64(src1);
+
+ const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
+ _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
+
+ const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
+ const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
+ return v_res;
+}
+
+static INLINE __m128i blend_16_u8(const uint8_t *src0, const uint8_t *src1,
+ const __m128i *v_m0_b, const __m128i *v_m1_b,
+ const __m128i *rounding) {
+ const __m128i v_s0_b = xx_loadu_128(src0);
+ const __m128i v_s1_b = xx_loadu_128(src1);
+
+ const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
+ _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
+ const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b),
+ _mm_unpackhi_epi8(*v_m0_b, *v_m1_b));
+
+ const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
+ const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding);
+ const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w);
+ return v_res;
+}
+
+typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w);
+
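+// For bit depths up to 10, mask * pixel fits in 16 bits, so the _b10
+// variants can use plain 16-bit multiplies; the _b12 variants below instead
+// use madd with 32-bit intermediates.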
+static INLINE __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_w = xx_loadl_64(src0);
+ const __m128i v_s1_w = xx_loadl_64(src1);
+
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+ const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+ const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
+
+ return v_res_w;
+}
+
+static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_w = xx_loadu_128(src0);
+ const __m128i v_s1_w = xx_loadu_128(src1);
+
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+ const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+ const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
+
+ return v_res_w;
+}
+
+static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_w = xx_loadl_64(src0);
+ const __m128i v_s1_w = xx_loadl_64(src1);
+
+ // Interleave
+ const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
+ const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
+
+ // Multiply-Add
+ const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
+
+ // Scale
+ const __m128i v_ssum_d =
+ _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1);
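+  // The remaining >>1 of AOM_BLEND_A64_ROUND_BITS is applied by
+  // xx_round_epu16 below, which (as an average against zero) also rounds.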
+
+ // Pack
+ const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
+
+ // Round
+ const __m128i v_res_w = xx_round_epu16(v_pssum_d);
+
+ return v_res_w;
+}
+
+static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_w = xx_loadu_128(src0);
+ const __m128i v_s1_w = xx_loadu_128(src1);
+
+ // Interleave
+ const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
+ const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
+ const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
+ const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
+
+ // Multiply-Add
+ const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
+ const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
+
+ // Scale
+ const __m128i v_ssuml_d =
+ _mm_srli_epi32(v_suml_d, AOM_BLEND_A64_ROUND_BITS - 1);
+ const __m128i v_ssumh_d =
+ _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1);
+
+ // Pack
+ const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
+
+ // Round
+ const __m128i v_res_w = xx_round_epu16(v_pssum_d);
+
+ return v_res_w;
+}
+
+#endif // AOM_AOM_DSP_X86_BLEND_SSE4_H_
diff --git a/third_party/aom/aom_dsp/x86/blk_sse_sum_avx2.c b/third_party/aom/aom_dsp/x86/blk_sse_sum_avx2.c
new file mode 100644
index 0000000000..fdf7de3f4c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blk_sse_sum_avx2.c
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void accumulate_sse_sum(__m256i regx_sum, __m256i regx2_sum,
+ int *x_sum, int64_t *x2_sum) {
+ __m256i sum_buffer, sse_buffer;
+ __m128i out_buffer;
+
+  // Accumulate all lanes of each register into its lowest lane.
+ sum_buffer = _mm256_permute2f128_si256(regx_sum, regx_sum, 1);
+ regx_sum = _mm256_add_epi32(sum_buffer, regx_sum);
+ regx_sum = _mm256_add_epi32(regx_sum, _mm256_srli_si256(regx_sum, 8));
+ regx_sum = _mm256_add_epi32(regx_sum, _mm256_srli_si256(regx_sum, 4));
+
+ sse_buffer = _mm256_permute2f128_si256(regx2_sum, regx2_sum, 1);
+ regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum);
+ regx2_sum = _mm256_add_epi64(regx2_sum, _mm256_srli_si256(regx2_sum, 8));
+
+ out_buffer = _mm256_castsi256_si128(regx_sum);
+ *x_sum += _mm_cvtsi128_si32(out_buffer);
+ out_buffer = _mm256_castsi256_si128(regx2_sum);
+#if AOM_ARCH_X86_64
+ *x2_sum += _mm_cvtsi128_si64(out_buffer);
+#else
+ {
+ int64_t tmp;
+ _mm_storel_epi64((__m128i *)&tmp, out_buffer);
+ *x2_sum += tmp;
+ }
+#endif
+}
+
+static INLINE void sse_sum_wd4_avx2(const int16_t *data, int stride, int bh,
+ int *x_sum, int64_t *x2_sum) {
+ __m128i row1, row2, row3;
+ __m256i regx_sum, regx2_sum, load_pixels, sum_buffer, sse_buffer,
+ temp_buffer1, temp_buffer2, row_sum_buffer, row_sse_buffer;
+ const int16_t *data_tmp = data;
+ __m256i one = _mm256_set1_epi16(1);
+ regx_sum = _mm256_setzero_si256();
+ regx2_sum = regx_sum;
+ sum_buffer = _mm256_setzero_si256();
+ sse_buffer = sum_buffer;
+
+ for (int j = 0; j < (bh >> 2); ++j) {
+ // Load 4 rows at a time.
+ row1 = _mm_loadl_epi64((__m128i const *)(data_tmp));
+ row2 = _mm_loadl_epi64((__m128i const *)(data_tmp + stride));
+ row1 = _mm_unpacklo_epi64(row1, row2);
+ row2 = _mm_loadl_epi64((__m128i const *)(data_tmp + 2 * stride));
+ row3 = _mm_loadl_epi64((__m128i const *)(data_tmp + 3 * stride));
+ row2 = _mm_unpacklo_epi64(row2, row3);
+ load_pixels =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(row1), row2, 1);
+
+ row_sum_buffer = _mm256_madd_epi16(load_pixels, one);
+ row_sse_buffer = _mm256_madd_epi16(load_pixels, load_pixels);
+ sum_buffer = _mm256_add_epi32(row_sum_buffer, sum_buffer);
+ sse_buffer = _mm256_add_epi32(row_sse_buffer, sse_buffer);
+ data_tmp += 4 * stride;
+ }
+
+ // To prevent 32-bit variable overflow, unpack the elements to 64-bit.
+ temp_buffer1 = _mm256_unpacklo_epi32(sse_buffer, _mm256_setzero_si256());
+ temp_buffer2 = _mm256_unpackhi_epi32(sse_buffer, _mm256_setzero_si256());
+ sse_buffer = _mm256_add_epi64(temp_buffer1, temp_buffer2);
+ regx_sum = _mm256_add_epi32(sum_buffer, regx_sum);
+ regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum);
+
+ accumulate_sse_sum(regx_sum, regx2_sum, x_sum, x2_sum);
+}
+
+static INLINE void sse_sum_wd8_avx2(const int16_t *data, int stride, int bh,
+ int *x_sum, int64_t *x2_sum) {
+ __m128i load_128bit, load_next_128bit;
+ __m256i regx_sum, regx2_sum, load_pixels, sum_buffer, sse_buffer,
+ temp_buffer1, temp_buffer2, row_sum_buffer, row_sse_buffer;
+ const int16_t *data_tmp = data;
+ __m256i one = _mm256_set1_epi16(1);
+ regx_sum = _mm256_setzero_si256();
+ regx2_sum = regx_sum;
+ sum_buffer = _mm256_setzero_si256();
+ sse_buffer = sum_buffer;
+
+ for (int j = 0; j < (bh >> 1); ++j) {
+ // Load 2 rows at a time.
+ load_128bit = _mm_loadu_si128((__m128i const *)(data_tmp));
+ load_next_128bit = _mm_loadu_si128((__m128i const *)(data_tmp + stride));
+ load_pixels = _mm256_insertf128_si256(_mm256_castsi128_si256(load_128bit),
+ load_next_128bit, 1);
+
+ row_sum_buffer = _mm256_madd_epi16(load_pixels, one);
+ row_sse_buffer = _mm256_madd_epi16(load_pixels, load_pixels);
+ sum_buffer = _mm256_add_epi32(row_sum_buffer, sum_buffer);
+ sse_buffer = _mm256_add_epi32(row_sse_buffer, sse_buffer);
+ data_tmp += 2 * stride;
+ }
+
+ temp_buffer1 = _mm256_unpacklo_epi32(sse_buffer, _mm256_setzero_si256());
+ temp_buffer2 = _mm256_unpackhi_epi32(sse_buffer, _mm256_setzero_si256());
+ sse_buffer = _mm256_add_epi64(temp_buffer1, temp_buffer2);
+ regx_sum = _mm256_add_epi32(sum_buffer, regx_sum);
+ regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum);
+
+ accumulate_sse_sum(regx_sum, regx2_sum, x_sum, x2_sum);
+}
+
+static INLINE void sse_sum_wd16_avx2(const int16_t *data, int stride, int bh,
+ int *x_sum, int64_t *x2_sum,
+ int loop_count) {
+ __m256i regx_sum, regx2_sum, load_pixels, sum_buffer, sse_buffer,
+ temp_buffer1, temp_buffer2, row_sum_buffer, row_sse_buffer;
+ const int16_t *data_tmp = data;
+ __m256i one = _mm256_set1_epi16(1);
+ regx_sum = _mm256_setzero_si256();
+ regx2_sum = regx_sum;
+ sum_buffer = _mm256_setzero_si256();
+ sse_buffer = sum_buffer;
+
+ for (int i = 0; i < loop_count; ++i) {
+ data_tmp = data + 16 * i;
+ for (int j = 0; j < bh; ++j) {
+ load_pixels = _mm256_lddqu_si256((__m256i const *)(data_tmp));
+
+ row_sum_buffer = _mm256_madd_epi16(load_pixels, one);
+ row_sse_buffer = _mm256_madd_epi16(load_pixels, load_pixels);
+ sum_buffer = _mm256_add_epi32(row_sum_buffer, sum_buffer);
+ sse_buffer = _mm256_add_epi32(row_sse_buffer, sse_buffer);
+ data_tmp += stride;
+ }
+ }
+
+ temp_buffer1 = _mm256_unpacklo_epi32(sse_buffer, _mm256_setzero_si256());
+ temp_buffer2 = _mm256_unpackhi_epi32(sse_buffer, _mm256_setzero_si256());
+ sse_buffer = _mm256_add_epi64(temp_buffer1, temp_buffer2);
+ regx_sum = _mm256_add_epi32(sum_buffer, regx_sum);
+ regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum);
+
+ accumulate_sse_sum(regx_sum, regx2_sum, x_sum, x2_sum);
+}
+
+void aom_get_blk_sse_sum_avx2(const int16_t *data, int stride, int bw, int bh,
+ int *x_sum, int64_t *x2_sum) {
+ *x_sum = 0;
+ *x2_sum = 0;
+
+ if ((bh & 3) == 0) {
+ switch (bw) {
+ // For smaller block widths, compute multiple rows simultaneously.
+ case 4: sse_sum_wd4_avx2(data, stride, bh, x_sum, x2_sum); break;
+ case 8: sse_sum_wd8_avx2(data, stride, bh, x_sum, x2_sum); break;
+ case 16:
+ case 32:
+ sse_sum_wd16_avx2(data, stride, bh, x_sum, x2_sum, bw >> 4);
+ break;
+ case 64:
+        // The 32-bit accumulators would overflow if all 64 rows were summed
+        // in one pass, so compute 32 rows at a time.
+ if (bh <= 32) {
+ sse_sum_wd16_avx2(data, stride, bh, x_sum, x2_sum, bw >> 4);
+ } else {
+ sse_sum_wd16_avx2(data, stride, 32, x_sum, x2_sum, bw >> 4);
+ sse_sum_wd16_avx2(data + 32 * stride, stride, 32, x_sum, x2_sum,
+ bw >> 4);
+ }
+ break;
+
+ default: aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum);
+ }
+ } else {
+ aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum);
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/blk_sse_sum_sse2.c b/third_party/aom/aom_dsp/x86/blk_sse_sum_sse2.c
new file mode 100644
index 0000000000..bf89427872
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blk_sse_sum_sse2.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void sse_sum_wd4_sse2(const int16_t *data, int stride, int bh,
+ int *x_sum, int64_t *x2_sum) {
+ const int16_t *data_tmp = data;
+ __m128i temp_buffer1, temp_buffer2;
+ __m128i load_pixels_low, load_pixels_hi, sum_buffer, sse_buffer;
+ __m128i one = _mm_set1_epi16(1);
+ __m128i regx_sum = _mm_setzero_si128();
+ __m128i regx2_sum = regx_sum;
+
+ for (int j = 0; j < (bh >> 1); ++j) {
+ // Load 2 rows (8 pixels) at a time.
+ load_pixels_low = _mm_loadl_epi64((__m128i const *)(data_tmp));
+ load_pixels_hi = _mm_loadl_epi64((__m128i const *)(data_tmp + stride));
+ load_pixels_low = _mm_unpacklo_epi64(load_pixels_low, load_pixels_hi);
+ sum_buffer = _mm_madd_epi16(load_pixels_low, one);
+ sse_buffer = _mm_madd_epi16(load_pixels_low, load_pixels_low);
+ regx_sum = _mm_add_epi32(sum_buffer, regx_sum);
+ regx2_sum = _mm_add_epi32(sse_buffer, regx2_sum);
+ data_tmp += 2 * stride;
+ }
+
+ regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 8));
+ regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 4));
+ *x_sum = _mm_cvtsi128_si32(regx_sum);
+ temp_buffer1 = _mm_unpacklo_epi32(regx2_sum, _mm_setzero_si128());
+ temp_buffer2 = _mm_unpackhi_epi32(regx2_sum, _mm_setzero_si128());
+ regx2_sum = _mm_add_epi64(temp_buffer1, temp_buffer2);
+ regx2_sum = _mm_add_epi64(regx2_sum, _mm_srli_si128(regx2_sum, 8));
+#if AOM_ARCH_X86_64
+ *x2_sum += _mm_cvtsi128_si64(regx2_sum);
+#else
+ {
+ int64_t tmp;
+ _mm_storel_epi64((__m128i *)&tmp, regx2_sum);
+ *x2_sum += tmp;
+ }
+#endif
+}
+
+static INLINE void sse_sum_wd8_sse2(const int16_t *data, int stride, int bh,
+ int *x_sum, int64_t *x2_sum,
+ int loop_cycles) {
+ const int16_t *data_tmp;
+ __m128i temp_buffer1, temp_buffer2;
+ __m128i one = _mm_set1_epi16(1);
+ __m128i regx_sum = _mm_setzero_si128();
+ __m128i regx2_sum = regx_sum;
+ __m128i load_pixels, sum_buffer, sse_buffer;
+
+ for (int i = 0; i < loop_cycles; ++i) {
+ data_tmp = data + (8 * i);
+ for (int j = 0; j < bh; ++j) {
+ // Load 1 row (8-pixels) at a time.
+ load_pixels = _mm_loadu_si128((__m128i const *)(data_tmp));
+ sum_buffer = _mm_madd_epi16(load_pixels, one);
+ sse_buffer = _mm_madd_epi16(load_pixels, load_pixels);
+ regx_sum = _mm_add_epi32(sum_buffer, regx_sum);
+ regx2_sum = _mm_add_epi32(sse_buffer, regx2_sum);
+ data_tmp += stride;
+ }
+ }
+
+ regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 8));
+ regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 4));
+ *x_sum += _mm_cvtsi128_si32(regx_sum);
+ temp_buffer1 = _mm_unpacklo_epi32(regx2_sum, _mm_setzero_si128());
+ temp_buffer2 = _mm_unpackhi_epi32(regx2_sum, _mm_setzero_si128());
+ regx2_sum = _mm_add_epi64(temp_buffer1, temp_buffer2);
+ regx2_sum = _mm_add_epi64(regx2_sum, _mm_srli_si128(regx2_sum, 8));
+#if AOM_ARCH_X86_64
+ *x2_sum += _mm_cvtsi128_si64(regx2_sum);
+#else
+ {
+ int64_t tmp;
+ _mm_storel_epi64((__m128i *)&tmp, regx2_sum);
+ *x2_sum += tmp;
+ }
+#endif
+}
+
+// This function adds SSE2 support for the function 'aom_get_blk_sse_sum_c'.
+void aom_get_blk_sse_sum_sse2(const int16_t *data, int stride, int bw, int bh,
+ int *x_sum, int64_t *x2_sum) {
+ *x_sum = 0;
+ *x2_sum = 0;
+
+ if ((bh & 3) == 0) {
+ switch (bw) {
+ case 4: sse_sum_wd4_sse2(data, stride, bh, x_sum, x2_sum); break;
+ case 8:
+ case 16:
+ sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3);
+ break;
+      // For widths 32 and 64 the 32-bit accumulators may overflow, so
+      // process the block in limited-height passes.
+ case 32:
+ if (bh <= 32) {
+ sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3);
+ break;
+ } else {
+ sse_sum_wd8_sse2(data, stride, 32, x_sum, x2_sum, bw >> 3);
+ sse_sum_wd8_sse2(data + 32 * stride, stride, 32, x_sum, x2_sum,
+ bw >> 3);
+ break;
+ }
+
+ case 64:
+ if (bh <= 16) {
+ sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3);
+ break;
+ } else {
+ for (int i = 0; i < bh; i += 16)
+ sse_sum_wd8_sse2(data + i * stride, stride, 16, x_sum, x2_sum,
+ bw >> 3);
+ break;
+ }
+
+ default: aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum);
+ }
+ } else {
+ aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum);
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/common_avx2.h b/third_party/aom/aom_dsp/x86/common_avx2.h
new file mode 100644
index 0000000000..96fe4ebb67
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/common_avx2.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_COMMON_AVX2_H_
+#define AOM_AOM_DSP_X86_COMMON_AVX2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+// Note: in and out may alias; all of 'in' is read before 'out' is written.
+static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) {
+ __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
+ __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]);
+ __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]);
+ __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
+ __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
+ __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]);
+ __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]);
+ __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
+
+ __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]);
+ __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]);
+ __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]);
+ __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]);
+ __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]);
+ __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]);
+ __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]);
+ __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]);
+
+ // 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b
+ // 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f
+ // 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b
+ // 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f
+ // 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b
+ // 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f
+ // 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b
+ // 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f
+
+ // 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b
+ // 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f
+ // a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb
+ // a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf
+ // c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db
+ // c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df
+ // e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb
+ // e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff
+
+ __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2);
+ __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2);
+ __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3);
+ __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3);
+ __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6);
+ __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6);
+ __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7);
+ __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7);
+
+ __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a);
+ __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a);
+ __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b);
+ __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b);
+ __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e);
+ __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e);
+ __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f);
+ __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f);
+
+ // 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39
+ // 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b
+ // 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d
+ // 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f
+ // 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79
+ // 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b
+ // 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d
+ // 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f
+
+ // 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9
+ // 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb
+ // 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd
+  // 86 96 a6 b6 87 97 a7 b7 8e 9e ae be 8f 9f af bf
+ // c0 d0 e0 f0 c1 d1 e1 f1 c8 d8 e8 f8 c9 d9 e9 f9
+ // c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb
+  // c4 d4 e4 f4 c5 d5 e5 f5 cc dc ec fc cd dd ed fd
+ // c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff
+
+ tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
+ tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
+ tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
+ tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
+ tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
+ tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
+ tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
+ tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
+
+ tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c);
+ tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c);
+ tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d);
+ tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d);
+ tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e);
+ tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e);
+ tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f);
+ tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f);
+
+ // 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
+ // 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
+ // 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a
+ // 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b
+ // 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c
+ // 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d
+ // 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e
+ // 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f
+
+ // 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8
+ // 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9
+ // 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa
+ // 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb
+  // 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ec fc
+ // 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd
+  // 86 96 a6 b6 c6 d6 e6 f6 8e 9e ae be ce de ee fe
+ // 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff
+
+ out[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000
+ out[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001
+ out[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20);
+ out[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31);
+ out[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20);
+ out[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31);
+ out[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20);
+ out[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31);
+
+ out[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20);
+ out[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31);
+ out[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20);
+ out[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31);
+ out[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20);
+ out[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31);
+ out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20);
+ out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
+}
+#endif // AOM_AOM_DSP_X86_COMMON_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve.h b/third_party/aom/aom_dsp/x86/convolve.h
new file mode 100644
index 0000000000..4ca214f469
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve.h
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_H_
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+ uint8_t *output_ptr, ptrdiff_t out_pitch,
+ uint32_t output_height, const int16_t *filter);
+
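+// FUN_CONV_1D generates an aom_convolve8_<name>_<opt> wrapper around the
+// aom_filter_block1d{16,8,4}_* kernels named by the macro arguments. It
+// dispatches on the tap pattern of `filter`: 4-tap kernels when the outer
+// taps (0, 1, 6, 7) are zero and tap 2 or 5 is not, 8-tap kernels when any
+// of taps 0..2 is non-zero, and 2-tap (bilinear) kernels otherwise. The
+// width is processed in 16-, 8- and 4-pixel columns; any leftover columns
+// fall back to the C implementation.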
+#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+ void aom_convolve8_##name##_##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
+ const int16_t *filter_y, int y_step_q4, int w, int h) { \
+ (void)filter_x; \
+ (void)x_step_q4; \
+ (void)filter_y; \
+ (void)y_step_q4; \
+ assert((-128 <= filter[3]) && (filter[3] <= 127)); \
+ assert(step_q4 == 16); \
+ if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) && \
+ (filter[2] | filter[5])) { \
+ while (w >= 16) { \
+ aom_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ aom_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ aom_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } else if (filter[0] | filter[1] | filter[2]) { \
+ while (w >= 16) { \
+ aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } else { \
+ while (w >= 16) { \
+ aom_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ aom_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ aom_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } \
+ if (w) { \
+ aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x, \
+ x_step_q4, filter_y, y_step_q4, w, h); \
+ } \
+ }
+
+#if CONFIG_AV1_HIGHBITDEPTH
+typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
+ const ptrdiff_t src_pitch,
+ uint16_t *output_ptr,
+ ptrdiff_t out_pitch,
+ unsigned int output_height,
+ const int16_t *filter, int bd);
+
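+// HIGH_FUN_CONV_1D is the high bit-depth counterpart of FUN_CONV_1D: the
+// uint8_t pointers are converted with CONVERT_TO_SHORTPTR and the SIMD
+// kernels are only used when step_q4 == 16 and filter[3] != 128; anything
+// else, plus any leftover width, is handled by aom_highbd_convolve8_*_c.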
+#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+ void aom_highbd_convolve8_##name##_##opt( \
+ const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, \
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
+ const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ if (step_q4 == 16 && filter[3] != 128) { \
+ if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) && \
+ (filter[2] | filter[5])) { \
+ while (w >= 16) { \
+ aom_highbd_filter_block1d16_##dir##4_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ aom_highbd_filter_block1d8_##dir##4_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ aom_highbd_filter_block1d4_##dir##4_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } else if (filter[0] | filter[1] | filter[2]) { \
+ while (w >= 16) { \
+ aom_highbd_filter_block1d16_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ aom_highbd_filter_block1d8_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ aom_highbd_filter_block1d4_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } else { \
+ while (w >= 16) { \
+ aom_highbd_filter_block1d16_##dir##2_##avg##opt( \
+ src, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ aom_highbd_filter_block1d8_##dir##2_##avg##opt( \
+ src, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ aom_highbd_filter_block1d4_##dir##2_##avg##opt( \
+ src, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } \
+ } \
+ if (w) { \
+ aom_highbd_convolve8_##name##_c( \
+ CONVERT_TO_BYTEPTR(src), src_stride, CONVERT_TO_BYTEPTR(dst), \
+ dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \
+ } \
+ }
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#endif // AOM_AOM_DSP_X86_CONVOLVE_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_avx2.h b/third_party/aom/aom_dsp/x86/convolve_avx2.h
new file mode 100644
index 0000000000..f5a382ce4e
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve_avx2.h
@@ -0,0 +1,922 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
+
+#include <immintrin.h>
+
+#include "aom_ports/mem.h"
+
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+
+// pshufb masks for the 8-tap low bit-depth filters: four consecutive
+// 32-byte masks gathering the overlapping source byte pairs for taps
+// (0,1), (2,3), (4,5) and (6,7); equivalently, filt1..filt4 below
+// concatenated.
+DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = {
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1,
+ 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5,
+ 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6,
+ 7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
+ 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11,
+ 12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7,
+ 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = {
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2,
+ 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9,
+ 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = {
+ 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
+ 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt_center_global_avx2[32]) = {
+ 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255,
+ 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255
+};
+
+DECLARE_ALIGNED(32, static const uint8_t,
+ filt1_global_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,
+ 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3,
+ 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+DECLARE_ALIGNED(32, static const uint8_t,
+ filt2_global_avx2[32]) = { 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,
+ 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5,
+ 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 };
+
+DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
+ 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
+ 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
+
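+// The CONVOLVE_SR_* macros below implement the two passes of the AVX2
+// convolve kernels that expand them; they assume the expansion site defines
+// variables such as i, j, w, h, im_h, src_ptr/src_stride,
+// im_block/im_stride, dst/dst_stride, the coeffs_h/coeffs_v arrays, the
+// filt[] shuffle masks and the round/shift constants. The horizontal pass
+// filters two source rows per iteration (one row per 128-bit lane) into the
+// 16-bit intermediate block, with a tail step for the final row.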
+#define CONVOLVE_SR_HORIZONTAL_FILTER_4TAP \
+ for (i = 0; i < (im_h - 2); i += 2) { \
+ __m256i data = _mm256_castsi128_si256( \
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \
+ data = _mm256_inserti128_si256( \
+ data, \
+ _mm_loadu_si128( \
+ (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), \
+ 1); \
+ __m256i res = convolve_lowbd_x_4tap(data, coeffs_h + 1, filt); \
+ res = \
+ _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \
+ } \
+ __m256i data_1 = _mm256_castsi128_si256( \
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \
+ __m256i res = convolve_lowbd_x_4tap(data_1, coeffs_h + 1, filt); \
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+
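+// Vertical pass: keeps a sliding window of unpacked row pairs in s[],
+// filters two output rows per iteration, packs them to 8 bits and stores
+// 8, 4 or 2 pixels per row depending on the remaining width. The 6-, 8- and
+// 12-tap variants below follow the same structure with wider tap windows.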
+#define CONVOLVE_SR_VERTICAL_FILTER_4TAP \
+ __m256i s[6]; \
+ __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \
+ __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \
+ __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \
+ __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \
+ \
+ s[0] = _mm256_unpacklo_epi16(src_0, src_1); \
+ s[1] = _mm256_unpacklo_epi16(src_2, src_3); \
+ s[3] = _mm256_unpackhi_epi16(src_0, src_1); \
+ s[4] = _mm256_unpackhi_epi16(src_2, src_3); \
+ \
+ for (i = 0; i < h; i += 2) { \
+ const int16_t *data = &im_block[i * im_stride]; \
+ const __m256i s4 = _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); \
+ const __m256i s5 = _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); \
+ s[2] = _mm256_unpacklo_epi16(s4, s5); \
+ s[5] = _mm256_unpackhi_epi16(s4, s5); \
+ \
+ __m256i res_a = convolve_4tap(s, coeffs_v + 1); \
+ __m256i res_b = convolve_4tap(s + 3, coeffs_v + 1); \
+ \
+ res_a = \
+ _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \
+ res_b = \
+ _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \
+ const __m256i res_a_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_a, round_const_v), round_shift_v); \
+ const __m256i res_b_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_b, round_const_v), round_shift_v); \
+ const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \
+ const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b); \
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \
+ \
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \
+ if (w - j > 4) { \
+ _mm_storel_epi64(p_0, res_0); \
+ _mm_storel_epi64(p_1, res_1); \
+ } else if (w == 4) { \
+ xx_storel_32(p_0, res_0); \
+ xx_storel_32(p_1, res_1); \
+ } else { \
+ *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); \
+ *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); \
+ } \
+ \
+ s[0] = s[1]; \
+ s[1] = s[2]; \
+ s[3] = s[4]; \
+ s[4] = s[5]; \
+ }
+
+#define CONVOLVE_SR_HORIZONTAL_FILTER_6TAP \
+ for (i = 0; i < (im_h - 2); i += 2) { \
+ __m256i data = _mm256_castsi128_si256( \
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \
+ data = _mm256_inserti128_si256( \
+ data, \
+ _mm_loadu_si128( \
+ (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), \
+ 1); \
+ \
+ __m256i res = convolve_lowbd_x_6tap(data, coeffs_h, filt); \
+ res = \
+ _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \
+ } \
+ \
+ __m256i data_1 = _mm256_castsi128_si256( \
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \
+ \
+ __m256i res = convolve_lowbd_x_6tap(data_1, coeffs_h, filt); \
+ \
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+ \
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+
+#define CONVOLVE_SR_VERTICAL_FILTER_6TAP \
+ __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \
+ __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \
+ __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \
+ __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \
+ \
+ __m256i s[8]; \
+ s[0] = _mm256_unpacklo_epi16(src_0, src_1); \
+ s[1] = _mm256_unpacklo_epi16(src_2, src_3); \
+ \
+ s[3] = _mm256_unpackhi_epi16(src_0, src_1); \
+ s[4] = _mm256_unpackhi_epi16(src_2, src_3); \
+ \
+ for (i = 0; i < h; i += 2) { \
+ const int16_t *data = &im_block[i * im_stride]; \
+ \
+ const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); \
+ const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); \
+ \
+ s[2] = _mm256_unpacklo_epi16(s6, s7); \
+ s[5] = _mm256_unpackhi_epi16(s6, s7); \
+ \
+ __m256i res_a = convolve_6tap(s, coeffs_v); \
+ __m256i res_b = convolve_6tap(s + 3, coeffs_v); \
+ \
+ res_a = \
+ _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \
+ res_b = \
+ _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \
+ \
+ const __m256i res_a_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_a, round_const_v), round_shift_v); \
+ const __m256i res_b_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_b, round_const_v), round_shift_v); \
+ \
+ const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \
+ const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \
+ \
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b); \
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \
+ \
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \
+ if (w - j > 4) { \
+ _mm_storel_epi64(p_0, res_0); \
+ _mm_storel_epi64(p_1, res_1); \
+ } else if (w == 4) { \
+ xx_storel_32(p_0, res_0); \
+ xx_storel_32(p_1, res_1); \
+ } else { \
+ *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); \
+ *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); \
+ } \
+ \
+ s[0] = s[1]; \
+ s[1] = s[2]; \
+ \
+ s[3] = s[4]; \
+ s[4] = s[5]; \
+ }
+
+#define CONVOLVE_SR_HORIZONTAL_FILTER_8TAP \
+ for (i = 0; i < (im_h - 2); i += 2) { \
+ __m256i data = _mm256_castsi128_si256( \
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \
+ data = _mm256_inserti128_si256( \
+ data, \
+ _mm_loadu_si128( \
+ (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), \
+ 1); \
+ \
+ __m256i res = convolve_lowbd_x(data, coeffs_h, filt); \
+ res = \
+ _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \
+ } \
+ \
+ __m256i data_1 = _mm256_castsi128_si256( \
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \
+ \
+ __m256i res = convolve_lowbd_x(data_1, coeffs_h, filt); \
+ \
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+ \
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+
+#define CONVOLVE_SR_VERTICAL_FILTER_8TAP \
+ __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \
+ __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \
+ __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \
+ __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \
+ __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \
+ __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \
+ \
+ __m256i s[8]; \
+ s[0] = _mm256_unpacklo_epi16(src_0, src_1); \
+ s[1] = _mm256_unpacklo_epi16(src_2, src_3); \
+ s[2] = _mm256_unpacklo_epi16(src_4, src_5); \
+ \
+ s[4] = _mm256_unpackhi_epi16(src_0, src_1); \
+ s[5] = _mm256_unpackhi_epi16(src_2, src_3); \
+ s[6] = _mm256_unpackhi_epi16(src_4, src_5); \
+ \
+ for (i = 0; i < h; i += 2) { \
+ const int16_t *data = &im_block[i * im_stride]; \
+ \
+ const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \
+ const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \
+ \
+ s[3] = _mm256_unpacklo_epi16(s6, s7); \
+ s[7] = _mm256_unpackhi_epi16(s6, s7); \
+ \
+ __m256i res_a = convolve(s, coeffs_v); \
+ __m256i res_b = convolve(s + 4, coeffs_v); \
+ \
+ res_a = \
+ _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \
+ res_b = \
+ _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \
+ \
+ const __m256i res_a_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_a, round_const_v), round_shift_v); \
+ const __m256i res_b_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_b, round_const_v), round_shift_v); \
+ \
+ const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \
+ const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \
+ \
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b); \
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \
+ \
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \
+ if (w - j > 4) { \
+ _mm_storel_epi64(p_0, res_0); \
+ _mm_storel_epi64(p_1, res_1); \
+ } else if (w == 4) { \
+ xx_storel_32(p_0, res_0); \
+ xx_storel_32(p_1, res_1); \
+ } else { \
+ *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); \
+ *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); \
+ } \
+ \
+ s[0] = s[1]; \
+ s[1] = s[2]; \
+ s[2] = s[3]; \
+ \
+ s[4] = s[5]; \
+ s[5] = s[6]; \
+ s[6] = s[7]; \
+ }
+
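+// The 12-tap horizontal pass differs from the shorter filters above: the
+// source bytes are widened to 16 bits and accumulated in 32 bits with
+// _mm256_madd_epi16 (via convolve_12taps), with separate layouts for blocks
+// of width <= 4 and for wider blocks.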
+#define CONVOLVE_SR_HORIZONTAL_FILTER_12TAP \
+ const __m256i v_zero = _mm256_setzero_si256(); \
+ __m256i s[12]; \
+ if (w <= 4) { \
+ for (i = 0; i < im_h; i += 2) { \
+ const __m256i data = _mm256_permute2x128_si256( \
+ _mm256_castsi128_si256( \
+ _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))), \
+ _mm256_castsi128_si256(_mm_loadu_si128( \
+ (__m128i *)(&src_ptr[i * src_stride + src_stride + j]))), \
+ 0x20); \
+ const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero); \
+ const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero); \
+ const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo); \
+ const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo); \
+ \
+ const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi); \
+ const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi); \
+ \
+ s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2); \
+ s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10); \
+ s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2); \
+ s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10); \
+ s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2); \
+ s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10); \
+ \
+ const __m256i res_lo = convolve_12taps(s, coeffs_h); \
+ \
+ __m256i res_32b_lo = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_lo, round_const_h12), round_shift_h12); \
+ __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); \
+ const __m128i res_0 = _mm256_extracti128_si256(res_16b_lo, 0); \
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b_lo, 1); \
+ if (w > 2) { \
+ _mm_storel_epi64((__m128i *)&im_block[i * im_stride], res_0); \
+ _mm_storel_epi64((__m128i *)&im_block[i * im_stride + im_stride], \
+ res_1); \
+ } else { \
+ uint32_t horiz_2; \
+ horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_0); \
+ im_block[i * im_stride] = (uint16_t)horiz_2; \
+ im_block[i * im_stride + 1] = (uint16_t)(horiz_2 >> 16); \
+ horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_1); \
+ im_block[i * im_stride + im_stride] = (uint16_t)horiz_2; \
+ im_block[i * im_stride + im_stride + 1] = (uint16_t)(horiz_2 >> 16); \
+ } \
+ } \
+ } else { \
+ for (i = 0; i < im_h; i++) { \
+ const __m256i data = _mm256_permute2x128_si256( \
+ _mm256_castsi128_si256( \
+ _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))), \
+ _mm256_castsi128_si256( \
+ _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j + 4]))), \
+ 0x20); \
+ const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero); \
+ const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero); \
+ \
+ const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo); \
+ const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo); \
+ \
+ const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi); \
+ const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi); \
+ \
+ s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2); \
+ s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10); \
+ s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2); \
+ s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10); \
+ s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2); \
+ s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10); \
+ \
+ const __m256i res_lo = convolve_12taps(s, coeffs_h); \
+ \
+ __m256i res_32b_lo = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_lo, round_const_h12), round_shift_h12); \
+ \
+ __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); \
+ _mm_store_si128((__m128i *)&im_block[i * im_stride], \
+ _mm256_extracti128_si256( \
+ _mm256_permute4x64_epi64(res_16b_lo, 0x88), 0)); \
+ } \
+ }
+
+#define CONVOLVE_SR_VERTICAL_FILTER_12TAP \
+ __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \
+ __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \
+ __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \
+ __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \
+ __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \
+ __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \
+ __m256i src_6 = _mm256_loadu_si256((__m256i *)(im_block + 6 * im_stride)); \
+ __m256i src_7 = _mm256_loadu_si256((__m256i *)(im_block + 7 * im_stride)); \
+ __m256i src_8 = _mm256_loadu_si256((__m256i *)(im_block + 8 * im_stride)); \
+ __m256i src_9 = _mm256_loadu_si256((__m256i *)(im_block + 9 * im_stride)); \
+ \
+ s[0] = _mm256_unpacklo_epi16(src_0, src_1); \
+ s[1] = _mm256_unpacklo_epi16(src_2, src_3); \
+ s[2] = _mm256_unpacklo_epi16(src_4, src_5); \
+ s[3] = _mm256_unpacklo_epi16(src_6, src_7); \
+ s[4] = _mm256_unpacklo_epi16(src_8, src_9); \
+ \
+ s[6] = _mm256_unpackhi_epi16(src_0, src_1); \
+ s[7] = _mm256_unpackhi_epi16(src_2, src_3); \
+ s[8] = _mm256_unpackhi_epi16(src_4, src_5); \
+ s[9] = _mm256_unpackhi_epi16(src_6, src_7); \
+ s[10] = _mm256_unpackhi_epi16(src_8, src_9); \
+ \
+ for (i = 0; i < h; i += 2) { \
+ const int16_t *data = &im_block[i * im_stride]; \
+ \
+ const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 10 * im_stride)); \
+ const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 11 * im_stride)); \
+ \
+ s[5] = _mm256_unpacklo_epi16(s6, s7); \
+ s[11] = _mm256_unpackhi_epi16(s6, s7); \
+ \
+ __m256i res_a = convolve_12taps(s, coeffs_v); \
+ __m256i res_b = convolve_12taps(s + 6, coeffs_v); \
+ \
+ res_a = \
+ _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \
+ res_b = \
+ _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \
+ \
+ const __m256i res_a_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_a, round_const_v), round_shift_v); \
+ const __m256i res_b_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_b, round_const_v), round_shift_v); \
+ \
+ const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \
+ const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \
+ \
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b); \
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \
+ \
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \
+ if (w - j > 4) { \
+ _mm_storel_epi64(p_0, res_0); \
+ _mm_storel_epi64(p_1, res_1); \
+ } else if (w == 4) { \
+ xx_storel_32(p_0, res_0); \
+ xx_storel_32(p_1, res_1); \
+ } else { \
+ *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); \
+ *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); \
+ } \
+ \
+ s[0] = s[1]; \
+ s[1] = s[2]; \
+ s[2] = s[3]; \
+ s[3] = s[4]; \
+ s[4] = s[5]; \
+ \
+ s[6] = s[7]; \
+ s[7] = s[8]; \
+ s[8] = s[9]; \
+ s[9] = s[10]; \
+ s[10] = s[11]; \
+ }
+
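+// Distance-weighted compound prediction: the horizontal pass fills the
+// 16-bit intermediate block as above; the vertical pass either stores the
+// offset result to the 16-bit buffer `dst` (presumably the first prediction
+// of the compound pair) or, when do_average is set, blends it with the
+// buffered value via comp_avg, removes the offset with convolve_rounding
+// and writes 8-bit pixels to dst0.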
+#define DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP \
+ do { \
+ for (i = 0; i < im_h; i += 2) { \
+ __m256i data = \
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h)); \
+ if (i + 1 < im_h) \
+ data = _mm256_inserti128_si256( \
+ data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); \
+ src_h += (src_stride << 1); \
+ __m256i res = convolve_lowbd_x(data, coeffs_x, filt); \
+ \
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), \
+ round_shift_h); \
+ \
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \
+ } \
+ } while (0)
+
+#define DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP \
+ do { \
+ __m256i s[8]; \
+ __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \
+ __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \
+ __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \
+ __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \
+ __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \
+ __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \
+ \
+ s[0] = _mm256_unpacklo_epi16(s0, s1); \
+ s[1] = _mm256_unpacklo_epi16(s2, s3); \
+ s[2] = _mm256_unpacklo_epi16(s4, s5); \
+ \
+ s[4] = _mm256_unpackhi_epi16(s0, s1); \
+ s[5] = _mm256_unpackhi_epi16(s2, s3); \
+ s[6] = _mm256_unpackhi_epi16(s4, s5); \
+ \
+ for (i = 0; i < h; i += 2) { \
+ const int16_t *data = &im_block[i * im_stride]; \
+ \
+ const __m256i s6 = \
+ _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \
+ const __m256i s7 = \
+ _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \
+ \
+ s[3] = _mm256_unpacklo_epi16(s6, s7); \
+ s[7] = _mm256_unpackhi_epi16(s6, s7); \
+ \
+ const __m256i res_a = convolve(s, coeffs_y); \
+ const __m256i res_a_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_a, round_const_v), round_shift_v); \
+ \
+ if (w - j > 4) { \
+ const __m256i res_b = convolve(s + 4, coeffs_y); \
+ const __m256i res_b_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_b, round_const_v), round_shift_v); \
+ const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round); \
+ const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \
+ \
+ if (do_average) { \
+ const __m256i data_ref_0 = \
+ load_line2_avx2(&dst[i * dst_stride + j], \
+ &dst[i * dst_stride + j + dst_stride]); \
+ const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, \
+ &wt, use_dist_wtd_comp_avg); \
+ \
+ const __m256i round_result = convolve_rounding( \
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift); \
+ \
+ const __m256i res_8 = \
+ _mm256_packus_epi16(round_result, round_result); \
+ const __m128i res_0 = _mm256_castsi256_si128(res_8); \
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \
+ \
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); \
+ _mm_storel_epi64( \
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); \
+ } else { \
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \
+ \
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \
+ res_1); \
+ } \
+ } else { \
+ const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round); \
+ const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \
+ \
+ if (do_average) { \
+ const __m256i data_ref_0 = \
+ load_line2_avx2(&dst[i * dst_stride + j], \
+ &dst[i * dst_stride + j + dst_stride]); \
+ \
+ const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, \
+ &wt, use_dist_wtd_comp_avg); \
+ \
+ const __m256i round_result = convolve_rounding( \
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift); \
+ \
+ const __m256i res_8 = \
+ _mm256_packus_epi16(round_result, round_result); \
+ const __m128i res_0 = _mm256_castsi256_si128(res_8); \
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \
+ \
+ *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); \
+ *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = \
+ _mm_cvtsi128_si32(res_1); \
+ \
+ } else { \
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \
+ \
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \
+ res_1); \
+ } \
+ } \
+ \
+ s[0] = s[1]; \
+ s[1] = s[2]; \
+ s[2] = s[3]; \
+ \
+ s[4] = s[5]; \
+ s[5] = s[6]; \
+ s[6] = s[7]; \
+ } \
+ } while (0)
+
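+// Coefficient preparation helpers. The *_lowbd variants halve the (even)
+// 16-bit taps and replicate each adjacent pair into byte lanes so they can
+// be consumed by _mm256_maddubs_epi16; the remaining variants keep 16-bit
+// taps and replicate each adjacent pair across 32-bit lanes for
+// _mm256_madd_epi16.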
+static INLINE void prepare_coeffs_lowbd(
+ const InterpFilterParams *const filter_params, const int subpel_q4,
+ __m256i *const coeffs /* [4] */) {
+ const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+ const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
+ const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
+
+  // Right shift all filter coefficients by 1 to reduce the bits required.
+  // This extra right shift will be taken care of at the end while rounding
+  // the result.
+  // Since all filter coefficients are even, this change will not affect the
+  // end result.
+ assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
+ _mm_set1_epi16((short)0xffff)));
+
+ const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
+ // coeffs 2 3 2 3 2 3 2 3
+ coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
+ // coeffs 4 5 4 5 4 5 4 5
+ coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
+ // coeffs 6 7 6 7 6 7 6 7
+ coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
+}
+
+static INLINE void prepare_coeffs_6t_lowbd(
+ const InterpFilterParams *const filter_params, const int subpel_q4,
+ __m256i *const coeffs /* [4] */) {
+ const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+ const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
+ const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
+
+  // Right shift all filter coefficients by 1 to reduce the bits required.
+  // This extra right shift will be taken care of at the end while rounding
+  // the result.
+  // Since all filter coefficients are even, this change will not affect the
+  // end result.
+ assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
+ _mm_set1_epi16((int16_t)0xffff)));
+
+ const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
+
+ // coeffs 1 2 1 2 1 2 1 2
+ coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0402u));
+ // coeffs 3 4 3 4 3 4 3 4
+ coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u));
+ // coeffs 5 6 5 6 5 6 5 6
+ coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0c0au));
+}
+
+static INLINE void prepare_coeffs_6t(
+ const InterpFilterParams *const filter_params, const int subpel_q4,
+ __m256i *const coeffs /* [4] */) {
+ const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+
+ const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1));
+ const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
+
+ // coeffs 1 2 1 2 1 2 1 2
+ coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
+ // coeffs 3 4 3 4 3 4 3 4
+ coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
+ // coeffs 5 6 5 6 5 6 5 6
+ coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
+}
+
+static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
+ const int subpel_q4,
+ __m256i *const coeffs /* [4] */) {
+ const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+
+ const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
+ const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
+ // coeffs 2 3 2 3 2 3 2 3
+ coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
+ // coeffs 4 5 4 5 4 5 4 5
+ coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
+ // coeffs 6 7 6 7 6 7 6 7
+ coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
+}
+
+static INLINE void prepare_coeffs_12taps(
+ const InterpFilterParams *const filter_params, const int subpel_q4,
+    __m256i *const coeffs /* [6] */) {
+ const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+
+ __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
+ __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
+ // coeffs 2 3 2 3 2 3 2 3
+ coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
+ // coeffs 4 5 4 5 4 5 4 5
+ coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
+ // coeffs 6 7 6 7 6 7 6 7
+ coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
+ // coeffs 8 9 10 11 0 0 0 0
+ coeff_8 = _mm_loadl_epi64((__m128i *)(filter + 8));
+ coeff = _mm256_broadcastq_epi64(coeff_8);
+ coeffs[4] = _mm256_shuffle_epi32(coeff, 0x00); // coeffs 8 9 8 9 8 9 8 9
+ coeffs[5] = _mm256_shuffle_epi32(coeff, 0x55); // coeffs 10 11 10 11.. 10 11
+}
+
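+// Accumulation helpers. convolve_lowbd* take pshufb-gathered byte pairs and
+// sum the maddubs partial products in 16 bits (the taps were halved above
+// to keep this in range); convolve, convolve_4tap, convolve_6tap and
+// convolve_12taps take unpacked 16-bit rows and accumulate the madd
+// products in 32 bits.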
+static INLINE __m256i convolve_lowbd(const __m256i *const s,
+ const __m256i *const coeffs) {
+ const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
+ const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
+ const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
+ const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]);
+
+ // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45),
+ _mm256_add_epi16(res_23, res_67));
+
+ return res;
+}
+
+static INLINE __m256i convolve_lowbd_6tap(const __m256i *const s,
+ const __m256i *const coeffs) {
+ const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
+ const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
+ const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
+
+ // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ const __m256i res =
+ _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), res_23);
+
+ return res;
+}
+
+static INLINE __m256i convolve_lowbd_4tap(const __m256i *const s,
+ const __m256i *const coeffs) {
+ const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]);
+ const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]);
+
+ // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ const __m256i res = _mm256_add_epi16(res_45, res_23);
+
+ return res;
+}
+
+static INLINE __m256i convolve_6tap(const __m256i *const s,
+ const __m256i *const coeffs) {
+ const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
+ const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
+ const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
+
+ const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), res_2);
+
+ return res;
+}
+
+static INLINE __m256i convolve_12taps(const __m256i *const s,
+ const __m256i *const coeffs) {
+ const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
+ const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
+ const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
+ const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
+ const __m256i res_4 = _mm256_madd_epi16(s[4], coeffs[4]);
+ const __m256i res_5 = _mm256_madd_epi16(s[5], coeffs[5]);
+
+ const __m256i res1 = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
+ _mm256_add_epi32(res_2, res_3));
+ const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_4, res_5), res1);
+
+ return res;
+}
+
+static INLINE __m256i convolve(const __m256i *const s,
+ const __m256i *const coeffs) {
+ const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
+ const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
+ const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
+ const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
+
+ const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
+ _mm256_add_epi32(res_2, res_3));
+
+ return res;
+}
+
+static INLINE __m256i convolve_4tap(const __m256i *const s,
+ const __m256i *const coeffs) {
+ const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]);
+ const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]);
+
+ const __m256i res = _mm256_add_epi32(res_1, res_2);
+ return res;
+}
+
+static INLINE __m256i convolve_lowbd_x(const __m256i data,
+ const __m256i *const coeffs,
+ const __m256i *const filt) {
+ __m256i s[4];
+
+ s[0] = _mm256_shuffle_epi8(data, filt[0]);
+ s[1] = _mm256_shuffle_epi8(data, filt[1]);
+ s[2] = _mm256_shuffle_epi8(data, filt[2]);
+ s[3] = _mm256_shuffle_epi8(data, filt[3]);
+
+ return convolve_lowbd(s, coeffs);
+}
+
+static INLINE __m256i convolve_lowbd_x_6tap(const __m256i data,
+ const __m256i *const coeffs,
+ const __m256i *const filt) {
+ __m256i s[4];
+
+ s[0] = _mm256_shuffle_epi8(data, filt[0]);
+ s[1] = _mm256_shuffle_epi8(data, filt[1]);
+ s[2] = _mm256_shuffle_epi8(data, filt[2]);
+
+ return convolve_lowbd_6tap(s, coeffs);
+}
+
+static INLINE __m256i convolve_lowbd_x_4tap(const __m256i data,
+ const __m256i *const coeffs,
+ const __m256i *const filt) {
+ __m256i s[2];
+
+ s[0] = _mm256_shuffle_epi8(data, filt[0]);
+ s[1] = _mm256_shuffle_epi8(data, filt[1]);
+
+ return convolve_lowbd_4tap(s, coeffs);
+}
+
+static INLINE void add_store_aligned_256(CONV_BUF_TYPE *const dst,
+ const __m256i *const res,
+ const int do_average) {
+ __m256i d;
+ if (do_average) {
+ d = _mm256_load_si256((__m256i *)dst);
+ d = _mm256_add_epi32(d, *res);
+ d = _mm256_srai_epi32(d, 1);
+ } else {
+ d = *res;
+ }
+ _mm256_store_si256((__m256i *)dst, d);
+}
+
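+// Compound averaging of the buffered and current predictions: with distance
+// weighting the interleaved (ref, cur) pairs are multiply-accumulated
+// against the packed weights and shifted by DIST_PRECISION_BITS; otherwise
+// a plain (ref + cur) >> 1 average is used. convolve_rounding below then
+// removes the compound offset and applies the final rounding shift.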
+static INLINE __m256i comp_avg(const __m256i *const data_ref_0,
+ const __m256i *const res_unsigned,
+ const __m256i *const wt,
+ const int use_dist_wtd_comp_avg) {
+ __m256i res;
+ if (use_dist_wtd_comp_avg) {
+ const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned);
+ const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned);
+
+ const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt);
+ const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt);
+
+ const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
+ const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
+
+ res = _mm256_packs_epi32(res_lo, res_hi);
+ } else {
+ const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned);
+ res = _mm256_srai_epi16(wt_res, 1);
+ }
+ return res;
+}
+
+static INLINE __m256i convolve_rounding(const __m256i *const res_unsigned,
+ const __m256i *const offset_const,
+ const __m256i *const round_const,
+ const int round_shift) {
+ const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const);
+ const __m256i res_round = _mm256_srai_epi16(
+ _mm256_add_epi16(res_signed, *round_const), round_shift);
+ return res_round;
+}
+
+static INLINE __m256i highbd_comp_avg(const __m256i *const data_ref_0,
+ const __m256i *const res_unsigned,
+ const __m256i *const wt0,
+ const __m256i *const wt1,
+ const int use_dist_wtd_comp_avg) {
+ __m256i res;
+ if (use_dist_wtd_comp_avg) {
+ const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0);
+ const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1);
+ const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res);
+ res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS);
+ } else {
+ const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned);
+ res = _mm256_srai_epi32(wt_res, 1);
+ }
+ return res;
+}
+
+static INLINE __m256i highbd_convolve_rounding(
+ const __m256i *const res_unsigned, const __m256i *const offset_const,
+ const __m256i *const round_const, const int round_shift) {
+ const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const);
+ const __m256i res_round = _mm256_srai_epi32(
+ _mm256_add_epi32(res_signed, *round_const), round_shift);
+
+ return res_round;
+}
+
+#endif // AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_common_intrin.h b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h
new file mode 100644
index 0000000000..9e8662af46
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
+
+// Note:
+// This header file should be included after any x86 intrinsics header file.
+
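+// Stores four 32-bit convolve results to the compound buffer; on the
+// averaging pass the buffered value is added first and the sum is halved.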
+static INLINE void add_store(CONV_BUF_TYPE *const dst, const __m128i *const res,
+ const int do_average) {
+ __m128i d;
+ if (do_average) {
+ d = _mm_load_si128((__m128i *)dst);
+ d = _mm_add_epi32(d, *res);
+ d = _mm_srai_epi32(d, 1);
+ } else {
+ d = *res;
+ }
+ _mm_store_si128((__m128i *)dst, d);
+}
+
+static INLINE void prepare_coeffs_12tap(const InterpFilterParams *filter_params,
+ int subpel_q4,
+ __m128i *coeffs /* [6] */) {
+ const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+
+ __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+
+ coeffs[0] = _mm_shuffle_epi32(coeffs_y, 0); // coeffs 0 1 0 1 0 1 0 1
+ coeffs[1] = _mm_shuffle_epi32(coeffs_y, 85); // coeffs 2 3 2 3 2 3 2 3
+ coeffs[2] = _mm_shuffle_epi32(coeffs_y, 170); // coeffs 4 5 4 5 4 5 4 5
+ coeffs[3] = _mm_shuffle_epi32(coeffs_y, 255); // coeffs 6 7 6 7 6 7 6 7
+
+ coeffs_y = _mm_loadl_epi64((__m128i *)(y_filter + 8));
+
+ coeffs[4] = _mm_shuffle_epi32(coeffs_y, 0); // coeffs 8 9 8 9 8 9 8 9
+ coeffs[5] =
+ _mm_shuffle_epi32(coeffs_y, 85); // coeffs 10 11 10 11 10 11 10 11
+}
+
+static INLINE __m128i convolve_12tap(const __m128i *s, const __m128i *coeffs) {
+ const __m128i d0 = _mm_madd_epi16(s[0], coeffs[0]);
+ const __m128i d1 = _mm_madd_epi16(s[1], coeffs[1]);
+ const __m128i d2 = _mm_madd_epi16(s[2], coeffs[2]);
+ const __m128i d3 = _mm_madd_epi16(s[3], coeffs[3]);
+ const __m128i d4 = _mm_madd_epi16(s[4], coeffs[4]);
+ const __m128i d5 = _mm_madd_epi16(s[5], coeffs[5]);
+ const __m128i d_0123 =
+ _mm_add_epi32(_mm_add_epi32(d0, d1), _mm_add_epi32(d2, d3));
+ const __m128i d = _mm_add_epi32(_mm_add_epi32(d4, d5), d_0123);
+ return d;
+}
+
+static INLINE __m128i convolve_lo_x_12tap(const __m128i *s,
+ const __m128i *coeffs,
+ const __m128i zero) {
+ __m128i ss[6];
+ ss[0] = _mm_unpacklo_epi8(s[0], zero); // 0 1 1 2 2 3 3 4
+ ss[1] = _mm_unpacklo_epi8(s[1], zero); // 2 3 3 4 4 5 5 6
+ ss[2] = _mm_unpacklo_epi8(s[2], zero); // 4 5 5 6 6 7 7 8
+ ss[3] = _mm_unpacklo_epi8(s[3], zero); // 6 7 7 8 8 9 9 10
+ ss[4] = _mm_unpackhi_epi8(s[2], zero); // 8 9 9 10 10 11 11 12
+ ss[5] = _mm_unpackhi_epi8(s[3], zero); // 10 11 11 12 12 13 13 14
+ return convolve_12tap(ss, coeffs);
+}
+
+static INLINE __m128i convolve_lo_y_12tap(const __m128i *s,
+ const __m128i *coeffs) {
+ __m128i ss[6];
+ const __m128i zero = _mm_setzero_si128();
+ ss[0] = _mm_unpacklo_epi8(s[0], zero);
+ ss[1] = _mm_unpacklo_epi8(s[2], zero);
+ ss[2] = _mm_unpacklo_epi8(s[4], zero);
+ ss[3] = _mm_unpacklo_epi8(s[6], zero);
+ ss[4] = _mm_unpacklo_epi8(s[8], zero);
+ ss[5] = _mm_unpacklo_epi8(s[10], zero);
+ return convolve_12tap(ss, coeffs);
+}
+
+static INLINE __m128i convolve_hi_y_12tap(const __m128i *s,
+ const __m128i *coeffs) {
+ __m128i ss[6];
+ const __m128i zero = _mm_setzero_si128();
+ ss[0] = _mm_unpackhi_epi8(s[0], zero);
+ ss[1] = _mm_unpackhi_epi8(s[2], zero);
+ ss[2] = _mm_unpackhi_epi8(s[4], zero);
+ ss[3] = _mm_unpackhi_epi8(s[6], zero);
+ ss[4] = _mm_unpackhi_epi8(s[8], zero);
+ ss[5] = _mm_unpackhi_epi8(s[10], zero);
+ return convolve_12tap(ss, coeffs);
+}
+#endif // AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_sse2.h b/third_party/aom/aom_dsp/x86/convolve_sse2.h
new file mode 100644
index 0000000000..36b7d62b98
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve_sse2.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
+
+#include "config/aom_scale_rtcd.h"
+
+// Note:
+// This header file should be included after any x86 intrinsics header file.
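+// 128-bit counterparts of the AVX2 helpers in convolve_avx2.h:
+// prepare_coeffs replicates each pair of 16-bit taps across 32-bit lanes,
+// and the convolve_* helpers below zero-extend source bytes where needed
+// and accumulate the madd products in 32 bits.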
+static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
+ const int subpel_q4,
+ __m128i *const coeffs /* [4] */) {
+ const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+ const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
+ // coeffs 2 3 2 3 2 3 2 3
+ coeffs[1] = _mm_shuffle_epi32(coeff, 0x55);
+ // coeffs 4 5 4 5 4 5 4 5
+ coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa);
+ // coeffs 6 7 6 7 6 7 6 7
+ coeffs[3] = _mm_shuffle_epi32(coeff, 0xff);
+}
+
+static INLINE __m128i convolve(const __m128i *const s,
+ const __m128i *const coeffs) {
+ const __m128i res_0 = _mm_madd_epi16(s[0], coeffs[0]);
+ const __m128i res_1 = _mm_madd_epi16(s[1], coeffs[1]);
+ const __m128i res_2 = _mm_madd_epi16(s[2], coeffs[2]);
+ const __m128i res_3 = _mm_madd_epi16(s[3], coeffs[3]);
+
+ const __m128i res =
+ _mm_add_epi32(_mm_add_epi32(res_0, res_1), _mm_add_epi32(res_2, res_3));
+
+ return res;
+}
+
+static INLINE __m128i convolve_lo_x(const __m128i *const s,
+ const __m128i *const coeffs) {
+ __m128i ss[4];
+ ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
+ ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128());
+ ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
+ ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128());
+ return convolve(ss, coeffs);
+}
+
+static INLINE __m128i convolve_lo_y(const __m128i *const s,
+ const __m128i *const coeffs) {
+ __m128i ss[4];
+ ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
+ ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
+ ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128());
+ ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128());
+ return convolve(ss, coeffs);
+}
+
+static INLINE __m128i convolve_hi_y(const __m128i *const s,
+ const __m128i *const coeffs) {
+ __m128i ss[4];
+ ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128());
+ ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128());
+ ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128());
+ ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128());
+ return convolve(ss, coeffs);
+}
+
+static INLINE __m128i comp_avg(const __m128i *const data_ref_0,
+ const __m128i *const res_unsigned,
+ const __m128i *const wt,
+ const int use_dist_wtd_avg) {
+ __m128i res;
+ if (use_dist_wtd_avg) {
+ const __m128i data_lo = _mm_unpacklo_epi16(*data_ref_0, *res_unsigned);
+ const __m128i data_hi = _mm_unpackhi_epi16(*data_ref_0, *res_unsigned);
+
+ const __m128i wt_res_lo = _mm_madd_epi16(data_lo, *wt);
+ const __m128i wt_res_hi = _mm_madd_epi16(data_hi, *wt);
+
+ const __m128i res_lo = _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
+ const __m128i res_hi = _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
+
+ res = _mm_packs_epi32(res_lo, res_hi);
+ } else {
+ const __m128i wt_res = _mm_add_epi16(*data_ref_0, *res_unsigned);
+ res = _mm_srai_epi16(wt_res, 1);
+ }
+ return res;
+}
+
+static INLINE __m128i convolve_rounding(const __m128i *const res_unsigned,
+ const __m128i *const offset_const,
+ const __m128i *const round_const,
+ const int round_shift) {
+ const __m128i res_signed = _mm_sub_epi16(*res_unsigned, *offset_const);
+ const __m128i res_round =
+ _mm_srai_epi16(_mm_add_epi16(res_signed, *round_const), round_shift);
+ return res_round;
+}
+
+static INLINE __m128i highbd_convolve_rounding_sse2(
+ const __m128i *const res_unsigned, const __m128i *const offset_const,
+ const __m128i *const round_const, const int round_shift) {
+ const __m128i res_signed = _mm_sub_epi32(*res_unsigned, *offset_const);
+ const __m128i res_round =
+ _mm_srai_epi32(_mm_add_epi32(res_signed, *round_const), round_shift);
+
+ return res_round;
+}
+
+#endif // AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_sse4_1.h b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h
new file mode 100644
index 0000000000..b1a3bb4664
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_
+
+// Note:
+// This header file should be included after any x86 intrinsics header file.
+
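+// Distance-weighted variant of add_store() from convolve_common_intrin.h:
+// on the averaging pass the buffered and new results are blended with the
+// wt0/wt1 weights and shifted by DIST_PRECISION_BITS before storing.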
+static INLINE void mult_add_store(CONV_BUF_TYPE *const dst,
+ const __m128i *const res,
+ const __m128i *const wt0,
+ const __m128i *const wt1,
+ const int do_average) {
+ __m128i d;
+ if (do_average) {
+ d = _mm_load_si128((__m128i *)dst);
+ d = _mm_add_epi32(_mm_mullo_epi32(d, *wt0), _mm_mullo_epi32(*res, *wt1));
+ d = _mm_srai_epi32(d, DIST_PRECISION_BITS);
+ } else {
+ d = *res;
+ }
+ _mm_store_si128((__m128i *)dst, d);
+}
+
+static INLINE __m128i highbd_comp_avg_sse4_1(const __m128i *const data_ref_0,
+ const __m128i *const res_unsigned,
+ const __m128i *const wt0,
+ const __m128i *const wt1,
+ const int use_dist_wtd_avg) {
+ __m128i res;
+ if (use_dist_wtd_avg) {
+ const __m128i wt0_res = _mm_mullo_epi32(*data_ref_0, *wt0);
+ const __m128i wt1_res = _mm_mullo_epi32(*res_unsigned, *wt1);
+
+ const __m128i wt_res = _mm_add_epi32(wt0_res, wt1_res);
+ res = _mm_srai_epi32(wt_res, DIST_PRECISION_BITS);
+ } else {
+ const __m128i wt_res = _mm_add_epi32(*data_ref_0, *res_unsigned);
+ res = _mm_srai_epi32(wt_res, 1);
+ }
+ return res;
+}
+
+#endif // AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_ssse3.h b/third_party/aom/aom_dsp/x86/convolve_ssse3.h
new file mode 100644
index 0000000000..b1abead146
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve_ssse3.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSSE3_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_SSSE3_H_
+
+#include <tmmintrin.h> // SSSE3
+
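+// Each _mm_set1_epi16 mask below makes pshufb broadcast one pair of taps:
+// e.g. 0x0200 selects bytes 0 and 2, the low bytes of filter[0] and
+// filter[1], across the whole register. This relies on every tap of the
+// 8-tap kernels fitting in a signed byte, which the callers are expected to
+// guarantee (see the filter[3] checks in convolve.h).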
+static INLINE void shuffle_filter_ssse3(const int16_t *const filter,
+ __m128i *const f) {
+ const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+ // pack and duplicate the filter values
+ f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+ f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+ f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+ f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+}
+
+static INLINE __m128i convolve8_8_ssse3(const __m128i *const s,
+ const __m128i *const f) {
+ // multiply 2 adjacent elements with the filter and add the result
+ const __m128i k_64 = _mm_set1_epi16(1 << 6);
+ const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
+ const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+ const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+ const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+ __m128i sum1, sum2;
+
+ // sum the results together, saturating only on the final step
+  // adding x0 to x2 and x1 to x3 is the only order that keeps the
+  // intermediate sums within range for all filters
+ sum1 = _mm_add_epi16(x0, x2);
+ sum2 = _mm_add_epi16(x1, x3);
+ // add the rounding offset early to avoid another saturated add
+ sum1 = _mm_add_epi16(sum1, k_64);
+ sum1 = _mm_adds_epi16(sum1, sum2);
+  // shift each 16-bit result right by 7 bits
+ sum1 = _mm_srai_epi16(sum1, 7);
+ return sum1;
+}
+
+#endif // AOM_AOM_DSP_X86_CONVOLVE_SSSE3_H_
diff --git a/third_party/aom/aom_dsp/x86/fft_avx2.c b/third_party/aom/aom_dsp/x86/fft_avx2.c
new file mode 100644
index 0000000000..3f5a9bbeff
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fft_avx2.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/fft_common.h"
+
+extern void aom_transpose_float_sse2(const float *A, float *B, int n);
+extern void aom_fft_unpack_2d_output_sse2(const float *col_fft, float *output,
+ int n);
+
+// Generate the 1d forward transforms for float using _mm256
+GEN_FFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps)
+GEN_FFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps)
+GEN_FFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps)
+
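+// Note on the wrappers below: aom_fft_2d_gen() appears to run the generated
+// 1d kernel over one dimension, transpose, run it over the other, and unpack
+// the packed real-input result; the trailing argument (8 here) is presumably
+// the SIMD width in floats, i.e. one __m256 worth of lanes per kernel call.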
+void aom_fft8x8_float_avx2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_avx2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+void aom_fft16x16_float_avx2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_avx2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+void aom_fft32x32_float_avx2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_avx2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+// Generate the 1d inverse transforms for float using _mm256
+GEN_IFFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps)
+GEN_IFFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps)
+GEN_IFFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps)
+
+void aom_ifft8x8_float_avx2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_avx2,
+ aom_ifft1d_8_avx2, aom_transpose_float_sse2, 8);
+}
+
+void aom_ifft16x16_float_avx2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
+ aom_fft1d_16_avx2, aom_ifft1d_16_avx2,
+ aom_transpose_float_sse2, 8);
+}
+
+void aom_ifft32x32_float_avx2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
+ aom_fft1d_32_avx2, aom_ifft1d_32_avx2,
+ aom_transpose_float_sse2, 8);
+}
diff --git a/third_party/aom/aom_dsp/x86/fft_sse2.c b/third_party/aom/aom_dsp/x86/fft_sse2.c
new file mode 100644
index 0000000000..bdd235bcd3
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fft_sse2.c
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <xmmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/fft_common.h"
+
+static INLINE void transpose4x4(const float *A, float *B, const int lda,
+ const int ldb) {
+ __m128 row1 = _mm_load_ps(&A[0 * lda]);
+ __m128 row2 = _mm_load_ps(&A[1 * lda]);
+ __m128 row3 = _mm_load_ps(&A[2 * lda]);
+ __m128 row4 = _mm_load_ps(&A[3 * lda]);
+ _MM_TRANSPOSE4_PS(row1, row2, row3, row4);
+ _mm_store_ps(&B[0 * ldb], row1);
+ _mm_store_ps(&B[1 * ldb], row2);
+ _mm_store_ps(&B[2 * ldb], row3);
+ _mm_store_ps(&B[3 * ldb], row4);
+}
+
+// Referenced by fft_avx2.c.
+void aom_transpose_float_sse2(const float *A, float *B, int n);
+
+void aom_transpose_float_sse2(const float *A, float *B, int n) {
+ for (int y = 0; y < n; y += 4) {
+ for (int x = 0; x < n; x += 4) {
+ transpose4x4(A + y * n + x, B + x * n + y, n, n);
+ }
+ }
+}
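+
+// The transpose above works on 4x4 tiles with aligned loads and stores, so
+// it assumes n is a multiple of 4 and that A and B are 16-byte aligned.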
+
+// Referenced by fft_avx2.c.
+void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n);
+
+void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n) {
+ const int n2 = n / 2;
+ output[0] = packed[0];
+ output[1] = 0;
+ output[2 * (n2 * n)] = packed[n2 * n];
+ output[2 * (n2 * n) + 1] = 0;
+
+ output[2 * n2] = packed[n2];
+ output[2 * n2 + 1] = 0;
+ output[2 * (n2 * n + n2)] = packed[n2 * n + n2];
+ output[2 * (n2 * n + n2) + 1] = 0;
+
+ for (int c = 1; c < n2; ++c) {
+ output[2 * (0 * n + c)] = packed[c];
+ output[2 * (0 * n + c) + 1] = packed[c + n2];
+ output[2 * (n2 * n + c) + 0] = packed[n2 * n + c];
+ output[2 * (n2 * n + c) + 1] = packed[n2 * n + c + n2];
+ }
+ for (int r = 1; r < n2; ++r) {
+ output[2 * (r * n + 0)] = packed[r * n];
+ output[2 * (r * n + 0) + 1] = packed[(r + n2) * n];
+ output[2 * (r * n + n2) + 0] = packed[r * n + n2];
+ output[2 * (r * n + n2) + 1] = packed[(r + n2) * n + n2];
+
+ for (int c = 1; c < AOMMIN(n2, 4); ++c) {
+ output[2 * (r * n + c)] =
+ packed[r * n + c] - packed[(r + n2) * n + c + n2];
+ output[2 * (r * n + c) + 1] =
+ packed[(r + n2) * n + c] + packed[r * n + c + n2];
+ }
+
+ for (int c = 4; c < n2; c += 4) {
+ __m128 real1 = _mm_load_ps(packed + r * n + c);
+ __m128 real2 = _mm_load_ps(packed + (r + n2) * n + c + n2);
+ __m128 imag1 = _mm_load_ps(packed + (r + n2) * n + c);
+ __m128 imag2 = _mm_load_ps(packed + r * n + c + n2);
+ real1 = _mm_sub_ps(real1, real2);
+ imag1 = _mm_add_ps(imag1, imag2);
+ _mm_store_ps(output + 2 * (r * n + c), _mm_unpacklo_ps(real1, imag1));
+ _mm_store_ps(output + 2 * (r * n + c + 2), _mm_unpackhi_ps(real1, imag1));
+ }
+
+ int r2 = r + n2;
+ int r3 = n - r2;
+ output[2 * (r2 * n + 0)] = packed[r3 * n];
+ output[2 * (r2 * n + 0) + 1] = -packed[(r3 + n2) * n];
+ output[2 * (r2 * n + n2)] = packed[r3 * n + n2];
+ output[2 * (r2 * n + n2) + 1] = -packed[(r3 + n2) * n + n2];
+ for (int c = 1; c < AOMMIN(4, n2); ++c) {
+ output[2 * (r2 * n + c)] =
+ packed[r3 * n + c] + packed[(r3 + n2) * n + c + n2];
+ output[2 * (r2 * n + c) + 1] =
+ -packed[(r3 + n2) * n + c] + packed[r3 * n + c + n2];
+ }
+ for (int c = 4; c < n2; c += 4) {
+ __m128 real1 = _mm_load_ps(packed + r3 * n + c);
+ __m128 real2 = _mm_load_ps(packed + (r3 + n2) * n + c + n2);
+ __m128 imag1 = _mm_load_ps(packed + (r3 + n2) * n + c);
+ __m128 imag2 = _mm_load_ps(packed + r3 * n + c + n2);
+ real1 = _mm_add_ps(real1, real2);
+ imag1 = _mm_sub_ps(imag2, imag1);
+ _mm_store_ps(output + 2 * (r2 * n + c), _mm_unpacklo_ps(real1, imag1));
+ _mm_store_ps(output + 2 * (r2 * n + c + 2),
+ _mm_unpackhi_ps(real1, imag1));
+ }
+ }
+}
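+
+// In the function above, "packed" holds the n*n floats produced by the
+// packed real-input FFT, while "output" receives the full spectrum as n*n
+// interleaved (re, im) pairs, i.e. 2 * n * n floats. Rows n/2+1..n-1 are
+// filled from rows 1..n/2-1, which looks like the usual conjugate-symmetry
+// reconstruction for a real-input transform.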
+
+// Generate definitions for 1d transforms using float and __m128
+GEN_FFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps)
+GEN_FFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
+GEN_FFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
+GEN_FFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
+
+void aom_fft4x4_float_sse2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_sse2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+void aom_fft8x8_float_sse2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_sse2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+void aom_fft16x16_float_sse2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_sse2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+void aom_fft32x32_float_sse2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_sse2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+// Generate definitions for 1d inverse transforms using float and __m128
+GEN_IFFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps)
+GEN_IFFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
+GEN_IFFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
+GEN_IFFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
+
+void aom_ifft4x4_float_sse2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_sse2,
+ aom_ifft1d_4_sse2, aom_transpose_float_sse2, 4);
+}
+
+void aom_ifft8x8_float_sse2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_sse2,
+ aom_ifft1d_8_sse2, aom_transpose_float_sse2, 4);
+}
+
+void aom_ifft16x16_float_sse2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
+ aom_fft1d_16_sse2, aom_ifft1d_16_sse2,
+ aom_transpose_float_sse2, 4);
+}
+
+void aom_ifft32x32_float_sse2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
+ aom_fft1d_32_sse2, aom_ifft1d_32_sse2,
+ aom_transpose_float_sse2, 4);
+}
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
new file mode 100644
index 0000000000..7ee8ba330e
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
@@ -0,0 +1,529 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/x86/fwd_txfm_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_ports/mem.h"
+
+// TODO(jingning) The high bit-depth functions need rework for performance.
+// After we properly fix the high bit-depth function implementations, this
+// file's dependency should be substantially simplified.
+#if DCT_HIGH_BIT_DEPTH
+#define ADD_EPI16 _mm_adds_epi16
+#define SUB_EPI16 _mm_subs_epi16
+
+#else
+#define ADD_EPI16 _mm_add_epi16
+#define SUB_EPI16 _mm_sub_epi16
+#endif
+
+static void FDCT4x4_2D_HELPER(const int16_t *input, int stride, __m128i *in0,
+ __m128i *in1) {
+ // Constants
+ // These are the coefficients used for the multiplies.
+  // In the comments, pN means cos(N * pi / 64) and mN is -cos(N * pi / 64),
+  // where cospi_N_64 = cos(N * pi / 64)
+ const __m128i k__cospi_A =
+ octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
+ cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_B =
+ octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
+ cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_C =
+ octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
+ cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_D =
+ octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
+ cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_E =
+ octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
+ cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_F =
+ octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
+ cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_G =
+ octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
+ -cospi_8_64, -cospi_24_64, -cospi_8_64, -cospi_24_64);
+ const __m128i k__cospi_H =
+ octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
+ -cospi_24_64, cospi_8_64, -cospi_24_64, cospi_8_64);
+
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ // This second rounding constant saves doing some extra adds at the end
+ const __m128i k__DCT_CONST_ROUNDING2 =
+ _mm_set1_epi32(DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1));
+ const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2;
+ const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
+ const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+
+ // Load inputs.
+ *in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ *in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ *in1 = _mm_unpacklo_epi64(
+ *in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
+ *in0 = _mm_unpacklo_epi64(
+ *in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
+ // in0 = [i0 i1 i2 i3 iC iD iE iF]
+ // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
+ // multiply by 16 to give some extra precision
+ *in0 = _mm_slli_epi16(*in0, 4);
+ *in1 = _mm_slli_epi16(*in1, 4);
+ // if (i == 0 && input[0]) input[0] += 1;
+ // add 1 to the upper left pixel if it is non-zero, which helps reduce
+ // the round-trip error
+ {
+    // The mask will only record whether the first value is zero; all
+    // other comparisons will fail because something shifted by 4 (above << 4)
+ // can never be equal to one. To increment in the non-zero case, we
+ // add the mask and one for the first element:
+ // - if zero, mask = -1, v = v - 1 + 1 = v
+ // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
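+    // For example (illustrative values): input[0] == 5 becomes 80 after the
+    // << 4 above, the compare against 0 fails, mask == 0 and the element
+    // ends up as 80 + 0 + 1 == 81; input[0] == 0 gives mask == -1 and stays
+    // 0 - 1 + 1 == 0.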
+ __m128i mask = _mm_cmpeq_epi16(*in0, k__nonzero_bias_a);
+ *in0 = _mm_add_epi16(*in0, mask);
+ *in0 = _mm_add_epi16(*in0, k__nonzero_bias_b);
+ }
+  // There are 4 total stages, alternating between an add/subtract stage
+  // and a multiply-and-add stage.
+ {
+ // Stage 1: Add/subtract
+
+ // in0 = [i0 i1 i2 i3 iC iD iE iF]
+ // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
+ const __m128i r0 = _mm_unpacklo_epi16(*in0, *in1);
+ const __m128i r1 = _mm_unpackhi_epi16(*in0, *in1);
+ // r0 = [i0 i4 i1 i5 i2 i6 i3 i7]
+ // r1 = [iC i8 iD i9 iE iA iF iB]
+ const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4);
+ const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4);
+ // r2 = [i0 i4 i1 i5 i3 i7 i2 i6]
+ // r3 = [iC i8 iD i9 iF iB iE iA]
+
+ const __m128i t0 = _mm_add_epi16(r2, r3);
+ const __m128i t1 = _mm_sub_epi16(r2, r3);
+ // t0 = [a0 a4 a1 a5 a3 a7 a2 a6]
+ // t1 = [aC a8 aD a9 aF aB aE aA]
+
+ // Stage 2: multiply by constants (which gets us into 32 bits).
+ // The constants needed here are:
+ // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16]
+ // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16]
+ // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08]
+ // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24]
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D);
+ // Then add and right-shift to get back to 16-bit range
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // w0 = [b0 b1 b7 b6]
+ // w1 = [b8 b9 bF bE]
+ // w2 = [b4 b5 b3 b2]
+ // w3 = [bC bD bB bA]
+ const __m128i x0 = _mm_packs_epi32(w0, w1);
+ const __m128i x1 = _mm_packs_epi32(w2, w3);
+
+ // x0 = [b0 b1 b7 b6 b8 b9 bF bE]
+ // x1 = [b4 b5 b3 b2 bC bD bB bA]
+ *in0 = _mm_shuffle_epi32(x0, 0xD8);
+ *in1 = _mm_shuffle_epi32(x1, 0x8D);
+ // in0 = [b0 b1 b8 b9 b7 b6 bF bE]
+ // in1 = [b3 b2 bB bA b4 b5 bC bD]
+ }
+ {
+ // vertical DCTs finished. Now we do the horizontal DCTs.
+ // Stage 3: Add/subtract
+
+ const __m128i t0 = ADD_EPI16(*in0, *in1);
+ const __m128i t1 = SUB_EPI16(*in0, *in1);
+
+ // Stage 4: multiply by constants (which gets us into 32 bits).
+ {
+ // The constants needed here are:
+ // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16]
+ // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16]
+ // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24]
+ // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08]
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E);
+ const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F);
+ const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H);
+ // Then add and right-shift to get back to 16-bit range
+      // but this also folds in the final right-shift to save operations.
+      // This unusual rounding operation is needed to remain bit-accurate
+      // with the C version of this function, which applies two rounding
+      // steps in a row.
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2);
+ *in0 = _mm_packs_epi32(w0, w2);
+ *in1 = _mm_packs_epi32(w1, w3);
+ }
+ }
+}
+
+void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
+ // This 2D transform implements 4 vertical 1D transforms followed
+ // by 4 horizontal 1D transforms. The multiplies and adds are as given
+ // by Chen, Smith and Fralick ('77). The commands for moving the data
+ // around have been minimized by hand.
+  // For the purposes of the comments, the 16 inputs are referred to as i0
+  // through iF (in raster order); intermediate variables are a0, b0, c0
+  // through f, and correspond to the in-place computations mapped to input
+  // locations. The outputs, o0 through oF, are labeled according to the
+  // output locations.
+ __m128i in0, in1;
+ FDCT4x4_2D_HELPER(input, stride, &in0, &in1);
+
+  // The post-condition (v + 1) >> 2 is now incorporated into the previous
+  // add and right-shift commands. Only 2 store instructions are needed
+  // because we are using the fact that 1/3 are stored just after 0/2.
+ storeu_output(&in0, output + 0 * 4);
+ storeu_output(&in1, output + 2 * 4);
+}
+
+void FDCT4x4_2D_LP(const int16_t *input, int16_t *output, int stride) {
+ __m128i in0, in1;
+ FDCT4x4_2D_HELPER(input, stride, &in0, &in1);
+ _mm_storeu_si128((__m128i *)(output + 0 * 4), in0);
+ _mm_storeu_si128((__m128i *)(output + 2 * 4), in1);
+}
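+
+// The two wrappers above share FDCT4x4_2D_HELPER and differ only in output
+// width: FDCT4x4_2D sign-extends to 32-bit tran_low_t via storeu_output(),
+// while FDCT4x4_2D_LP stores the 16-bit results directly.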
+
+#if CONFIG_INTERNAL_STATS
+void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
+ int pass;
+ // Constants
+  // When we use them, in one case they are all the same. In all other cases
+  // it is a pair of them that we need to repeat four times, which is done
+  // by constructing the 32-bit constant corresponding to that pair.
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+#if DCT_HIGH_BIT_DEPTH
+ int overflow;
+#endif
+ // Load input
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ // Pre-condition input (shift by two)
+ in0 = _mm_slli_epi16(in0, 2);
+ in1 = _mm_slli_epi16(in1, 2);
+ in2 = _mm_slli_epi16(in2, 2);
+ in3 = _mm_slli_epi16(in3, 2);
+ in4 = _mm_slli_epi16(in4, 2);
+ in5 = _mm_slli_epi16(in5, 2);
+ in6 = _mm_slli_epi16(in6, 2);
+ in7 = _mm_slli_epi16(in7, 2);
+
+ // We do two passes, first the columns, then the rows. The results of the
+ // first pass are transposed so that the same column code can be reused. The
+ // results of the second pass are also transposed so that the rows (processed
+ // as columns) are put back in row positions.
+ for (pass = 0; pass < 2; pass++) {
+ // To store results of each pass before the transpose.
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+ // Add/subtract
+ const __m128i q0 = ADD_EPI16(in0, in7);
+ const __m128i q1 = ADD_EPI16(in1, in6);
+ const __m128i q2 = ADD_EPI16(in2, in5);
+ const __m128i q3 = ADD_EPI16(in3, in4);
+ const __m128i q4 = SUB_EPI16(in3, in4);
+ const __m128i q5 = SUB_EPI16(in2, in5);
+ const __m128i q6 = SUB_EPI16(in1, in6);
+ const __m128i q7 = SUB_EPI16(in0, in7);
+#if DCT_HIGH_BIT_DEPTH
+ if (pass == 1) {
+ overflow =
+ check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
+ if (overflow) {
+ aom_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Work on first four results
+ {
+ // Add/subtract
+ const __m128i r0 = ADD_EPI16(q0, q3);
+ const __m128i r1 = ADD_EPI16(q1, q2);
+ const __m128i r2 = SUB_EPI16(q1, q2);
+ const __m128i r3 = SUB_EPI16(q0, q3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
+ if (overflow) {
+ aom_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+      // Interleave to do the multiply by constants, which gets us into 32 bits
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+ const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res0 = _mm_packs_epi32(w0, w1);
+ res4 = _mm_packs_epi32(w2, w3);
+ res2 = _mm_packs_epi32(w4, w5);
+ res6 = _mm_packs_epi32(w6, w7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6);
+ if (overflow) {
+ aom_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ // Work on next four results
+ {
+      // Interleave to do the multiply by constants, which gets us into 32 bits
+ const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+ const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+ const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
+ const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
+ const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
+ const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
+ const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
+ const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
+ const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
+ const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
+ const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
+ const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
+ const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
+ // Combine
+ const __m128i r0 = _mm_packs_epi32(s0, s1);
+ const __m128i r1 = _mm_packs_epi32(s2, s3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&r0, &r1);
+ if (overflow) {
+ aom_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ {
+ // Add/subtract
+ const __m128i x0 = ADD_EPI16(q4, r0);
+ const __m128i x1 = SUB_EPI16(q4, r0);
+ const __m128i x2 = SUB_EPI16(q7, r1);
+ const __m128i x3 = ADD_EPI16(q7, r1);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
+ if (overflow) {
+ aom_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+        // Interleave to do the multiply by constants, which gets us into 32 bits
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+ const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+ const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+ const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res1 = _mm_packs_epi32(w0, w1);
+ res7 = _mm_packs_epi32(w2, w3);
+ res5 = _mm_packs_epi32(w4, w5);
+ res3 = _mm_packs_epi32(w6, w7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3);
+ if (overflow) {
+ aom_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ }
+ // Transpose the 8x8.
+ {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+      // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+      // 04 14 24 34 05 15 25 35
+      // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ }
+ }
+ // Post-condition output and store it
+ {
+ // Post-condition (division by two)
+    // division by two of 16-bit signed numbers using shifts
+ // n / 2 = (n - (n >> 15)) >> 1
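+    // e.g. n == -3: n >> 15 == -1 (arithmetic shift), so
+    // (-3 - (-1)) >> 1 == -2 >> 1 == -1, matching C's truncating -3 / 2;
+    // and n == 3: (3 - 0) >> 1 == 1.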
+ const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
+ const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
+ const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
+ const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
+ const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
+ const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
+ const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
+ const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
+ in0 = _mm_sub_epi16(in0, sign_in0);
+ in1 = _mm_sub_epi16(in1, sign_in1);
+ in2 = _mm_sub_epi16(in2, sign_in2);
+ in3 = _mm_sub_epi16(in3, sign_in3);
+ in4 = _mm_sub_epi16(in4, sign_in4);
+ in5 = _mm_sub_epi16(in5, sign_in5);
+ in6 = _mm_sub_epi16(in6, sign_in6);
+ in7 = _mm_sub_epi16(in7, sign_in7);
+ in0 = _mm_srai_epi16(in0, 1);
+ in1 = _mm_srai_epi16(in1, 1);
+ in2 = _mm_srai_epi16(in2, 1);
+ in3 = _mm_srai_epi16(in3, 1);
+ in4 = _mm_srai_epi16(in4, 1);
+ in5 = _mm_srai_epi16(in5, 1);
+ in6 = _mm_srai_epi16(in6, 1);
+ in7 = _mm_srai_epi16(in7, 1);
+ // store results
+ store_output(&in0, (output + 0 * 8));
+ store_output(&in1, (output + 1 * 8));
+ store_output(&in2, (output + 2 * 8));
+ store_output(&in3, (output + 3 * 8));
+ store_output(&in4, (output + 4 * 8));
+ store_output(&in5, (output + 5 * 8));
+ store_output(&in6, (output + 6 * 8));
+ store_output(&in7, (output + 7 * 8));
+ }
+}
+#endif // CONFIG_INTERNAL_STATS
+
+#undef ADD_EPI16
+#undef SUB_EPI16
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c
new file mode 100644
index 0000000000..0e4fb80468
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/fwd_txfm_sse2.h"
+
+#define DCT_HIGH_BIT_DEPTH 0
+#define FDCT4x4_2D_HELPER fdct4x4_helper
+#define FDCT4x4_2D aom_fdct4x4_sse2
+#define FDCT4x4_2D_LP aom_fdct4x4_lp_sse2
+#define FDCT8x8_2D aom_fdct8x8_sse2
+#include "aom_dsp/x86/fwd_txfm_impl_sse2.h"
+#undef FDCT4x4_2D_HELPER
+#undef FDCT4x4_2D
+#undef FDCT4x4_2D_LP
+#undef FDCT8x8_2D
+
+#if CONFIG_AV1_HIGHBITDEPTH
+
+#undef DCT_HIGH_BIT_DEPTH
+#define DCT_HIGH_BIT_DEPTH 1
+#define FDCT8x8_2D aom_highbd_fdct8x8_sse2
+#include "aom_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT
+#undef FDCT8x8_2D
+
+#endif
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
new file mode 100644
index 0000000000..78ea98522e
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
+#define AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
+ __m128i buf0, buf1;
+ buf0 = _mm_mul_epu32(a, b);
+ a = _mm_srli_epi64(a, 32);
+ b = _mm_srli_epi64(b, 32);
+ buf1 = _mm_mul_epu32(a, b);
+ return _mm_add_epi64(buf0, buf1);
+}
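+
+// k_madd_epi32() treats a and b as four 32-bit lanes and returns the two
+// 64-bit sums { a0 * b0 + a1 * b1, a2 * b2 + a3 * b3 }; note that
+// _mm_mul_epu32 multiplies the lanes as unsigned 32-bit values.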
+
+static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
+ __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
+ __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
+ return _mm_unpacklo_epi64(buf0, buf1);
+}
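+
+// k_packs_epi64() keeps only the low 32 bits of each 64-bit lane, packing
+// them into the four 32-bit values { a0_lo, a1_lo, b0_lo, b1_lo }.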
+
+static INLINE int check_epi16_overflow_x2(const __m128i *preg0,
+ const __m128i *preg1) {
+ const __m128i max_overflow = _mm_set1_epi16(0x7fff);
+ const __m128i min_overflow = _mm_set1_epi16((short)0x8000);
+ __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
+ _mm_cmpeq_epi16(*preg0, min_overflow));
+ __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
+ _mm_cmpeq_epi16(*preg1, min_overflow));
+ cmp0 = _mm_or_si128(cmp0, cmp1);
+ return _mm_movemask_epi8(cmp0);
+}
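+
+// The overflow checks in this header return nonzero if any 16-bit lane is
+// exactly INT16_MAX or INT16_MIN, i.e. a saturating add/subtract may have
+// clipped, so that high bit-depth callers can fall back to the C path.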
+
+static INLINE int check_epi16_overflow_x4(const __m128i *preg0,
+ const __m128i *preg1,
+ const __m128i *preg2,
+ const __m128i *preg3) {
+ const __m128i max_overflow = _mm_set1_epi16(0x7fff);
+ const __m128i min_overflow = _mm_set1_epi16((short)0x8000);
+ __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
+ _mm_cmpeq_epi16(*preg0, min_overflow));
+ __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
+ _mm_cmpeq_epi16(*preg1, min_overflow));
+ __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow),
+ _mm_cmpeq_epi16(*preg2, min_overflow));
+ __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow),
+ _mm_cmpeq_epi16(*preg3, min_overflow));
+ cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3));
+ return _mm_movemask_epi8(cmp0);
+}
+
+static INLINE int check_epi16_overflow_x8(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ return res0 + res1;
+}
+
+static INLINE int check_epi16_overflow_x12(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
+ const __m128i *preg9, const __m128i *preg10, const __m128i *preg11) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ if (!res0) res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+ return res0 + res1;
+}
+
+static INLINE int check_epi16_overflow_x16(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
+ const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
+ const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
+ const __m128i *preg15) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+ if (!res1) res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
+ }
+ return res0 + res1;
+}
+
+static INLINE int check_epi16_overflow_x32(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
+ const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
+ const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
+ const __m128i *preg15, const __m128i *preg16, const __m128i *preg17,
+ const __m128i *preg18, const __m128i *preg19, const __m128i *preg20,
+ const __m128i *preg21, const __m128i *preg22, const __m128i *preg23,
+ const __m128i *preg24, const __m128i *preg25, const __m128i *preg26,
+ const __m128i *preg27, const __m128i *preg28, const __m128i *preg29,
+ const __m128i *preg30, const __m128i *preg31) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+ if (!res1) {
+ res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19);
+ if (!res1) {
+ res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27);
+ if (!res1)
+ res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31);
+ }
+ }
+ }
+ }
+ }
+ return res0 + res1;
+}
+
+static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+ __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+ __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+ _mm_store_si128((__m128i *)(dst_ptr), out0);
+ _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
+}
+
+static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+ __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+ __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+ _mm_storeu_si128((__m128i *)(dst_ptr), out0);
+ _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
+}
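+
+// store_output()/storeu_output() sign-extend eight 16-bit results to 32 bits
+// and write them with aligned/unaligned stores respectively; this assumes
+// tran_low_t is a 32-bit type in this configuration.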
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
new file mode 100644
index 0000000000..06879040b0
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
@@ -0,0 +1,379 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+
+pw_11585x2: times 8 dw 23170
+pd_8192: times 4 dd 8192
+
+%macro TRANSFORM_COEFFS 2
+pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2
+pw_%2_m%1: dw %2, -%1, %2, -%1, %2, -%1, %2, -%1
+%endmacro
+
+TRANSFORM_COEFFS 11585, 11585
+TRANSFORM_COEFFS 15137, 6270
+TRANSFORM_COEFFS 16069, 3196
+TRANSFORM_COEFFS 9102, 13623
+
+%macro STORE_OUTPUT 2 ; index, result
+ ; const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+ ; __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+ ; __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+ ; _mm_store_si128((__m128i *)(dst_ptr), out0);
+ ; _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
+ pxor m11, m11
+ pcmpgtw m11, m%2
+ movdqa m12, m%2
+ punpcklwd m%2, m11
+ punpckhwd m12, m11
+ mova [outputq + 4*%1 + 0], m%2
+ mova [outputq + 4*%1 + 16], m12
+%endmacro
+
+SECTION .text
+
+%if AOM_ARCH_X86_64
+INIT_XMM ssse3
+cglobal fdct8x8, 3, 5, 13, input, output, stride
+
+ mova m8, [GLOBAL(pd_8192)]
+ mova m12, [GLOBAL(pw_11585x2)]
+
+ lea r3, [2 * strideq]
+ lea r4, [4 * strideq]
+ mova m0, [inputq]
+ mova m1, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m2, [inputq]
+ mova m3, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m4, [inputq]
+ mova m5, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m6, [inputq]
+ mova m7, [inputq + r3]
+
+ ; left shift by 2 to increase forward transformation precision
+ psllw m0, 2
+ psllw m1, 2
+ psllw m2, 2
+ psllw m3, 2
+ psllw m4, 2
+ psllw m5, 2
+ psllw m6, 2
+ psllw m7, 2
+
+ ; column transform
+ ; stage 1
+ paddw m10, m0, m7
+ psubw m0, m7
+
+ paddw m9, m1, m6
+ psubw m1, m6
+
+ paddw m7, m2, m5
+ psubw m2, m5
+
+ paddw m6, m3, m4
+ psubw m3, m4
+
+ ; stage 2
+ paddw m5, m9, m7
+ psubw m9, m7
+
+ paddw m4, m10, m6
+ psubw m10, m6
+
+ paddw m7, m1, m2
+ psubw m1, m2
+
+ ; stage 3
+ paddw m6, m4, m5
+ psubw m4, m5
+
+ pmulhrsw m1, m12
+ pmulhrsw m7, m12
+
+ ; sin(pi / 8), cos(pi / 8)
+ punpcklwd m2, m10, m9
+ punpckhwd m10, m9
+ pmaddwd m5, m2, [GLOBAL(pw_15137_6270)]
+ pmaddwd m2, [GLOBAL(pw_6270_m15137)]
+ pmaddwd m9, m10, [GLOBAL(pw_15137_6270)]
+ pmaddwd m10, [GLOBAL(pw_6270_m15137)]
+ paddd m5, m8
+ paddd m2, m8
+ paddd m9, m8
+ paddd m10, m8
+ psrad m5, 14
+ psrad m2, 14
+ psrad m9, 14
+ psrad m10, 14
+ packssdw m5, m9
+ packssdw m2, m10
+
+ pmulhrsw m6, m12
+ pmulhrsw m4, m12
+
+ paddw m9, m3, m1
+ psubw m3, m1
+
+ paddw m10, m0, m7
+ psubw m0, m7
+
+ ; stage 4
+ ; sin(pi / 16), cos(pi / 16)
+ punpcklwd m1, m10, m9
+ punpckhwd m10, m9
+ pmaddwd m7, m1, [GLOBAL(pw_16069_3196)]
+ pmaddwd m1, [GLOBAL(pw_3196_m16069)]
+ pmaddwd m9, m10, [GLOBAL(pw_16069_3196)]
+ pmaddwd m10, [GLOBAL(pw_3196_m16069)]
+ paddd m7, m8
+ paddd m1, m8
+ paddd m9, m8
+ paddd m10, m8
+ psrad m7, 14
+ psrad m1, 14
+ psrad m9, 14
+ psrad m10, 14
+ packssdw m7, m9
+ packssdw m1, m10
+
+ ; sin(3 * pi / 16), cos(3 * pi / 16)
+ punpcklwd m11, m0, m3
+ punpckhwd m0, m3
+ pmaddwd m9, m11, [GLOBAL(pw_9102_13623)]
+ pmaddwd m11, [GLOBAL(pw_13623_m9102)]
+ pmaddwd m3, m0, [GLOBAL(pw_9102_13623)]
+ pmaddwd m0, [GLOBAL(pw_13623_m9102)]
+ paddd m9, m8
+ paddd m11, m8
+ paddd m3, m8
+ paddd m0, m8
+ psrad m9, 14
+ psrad m11, 14
+ psrad m3, 14
+ psrad m0, 14
+ packssdw m9, m3
+ packssdw m11, m0
+
+ ; transpose
+ ; stage 1
+ punpcklwd m0, m6, m7
+ punpcklwd m3, m5, m11
+ punpckhwd m6, m7
+ punpckhwd m5, m11
+ punpcklwd m7, m4, m9
+ punpcklwd m10, m2, m1
+ punpckhwd m4, m9
+ punpckhwd m2, m1
+
+ ; stage 2
+ punpckldq m9, m0, m3
+ punpckldq m1, m6, m5
+ punpckhdq m0, m3
+ punpckhdq m6, m5
+ punpckldq m3, m7, m10
+ punpckldq m5, m4, m2
+ punpckhdq m7, m10
+ punpckhdq m4, m2
+
+ ; stage 3
+ punpcklqdq m10, m9, m3
+ punpckhqdq m9, m3
+ punpcklqdq m2, m0, m7
+ punpckhqdq m0, m7
+ punpcklqdq m3, m1, m5
+ punpckhqdq m1, m5
+ punpcklqdq m7, m6, m4
+ punpckhqdq m6, m4
+
+ ; row transform
+ ; stage 1
+ paddw m5, m10, m6
+ psubw m10, m6
+
+ paddw m4, m9, m7
+ psubw m9, m7
+
+ paddw m6, m2, m1
+ psubw m2, m1
+
+ paddw m7, m0, m3
+ psubw m0, m3
+
+ ;stage 2
+ paddw m1, m5, m7
+ psubw m5, m7
+
+ paddw m3, m4, m6
+ psubw m4, m6
+
+ paddw m7, m9, m2
+ psubw m9, m2
+
+ ; stage 3
+ punpcklwd m6, m1, m3
+ punpckhwd m1, m3
+ pmaddwd m2, m6, [GLOBAL(pw_11585_11585)]
+ pmaddwd m6, [GLOBAL(pw_11585_m11585)]
+ pmaddwd m3, m1, [GLOBAL(pw_11585_11585)]
+ pmaddwd m1, [GLOBAL(pw_11585_m11585)]
+ paddd m2, m8
+ paddd m6, m8
+ paddd m3, m8
+ paddd m1, m8
+ psrad m2, 14
+ psrad m6, 14
+ psrad m3, 14
+ psrad m1, 14
+ packssdw m2, m3
+ packssdw m6, m1
+
+ pmulhrsw m7, m12
+ pmulhrsw m9, m12
+
+ punpcklwd m3, m5, m4
+ punpckhwd m5, m4
+ pmaddwd m1, m3, [GLOBAL(pw_15137_6270)]
+ pmaddwd m3, [GLOBAL(pw_6270_m15137)]
+ pmaddwd m4, m5, [GLOBAL(pw_15137_6270)]
+ pmaddwd m5, [GLOBAL(pw_6270_m15137)]
+ paddd m1, m8
+ paddd m3, m8
+ paddd m4, m8
+ paddd m5, m8
+ psrad m1, 14
+ psrad m3, 14
+ psrad m4, 14
+ psrad m5, 14
+ packssdw m1, m4
+ packssdw m3, m5
+
+ paddw m4, m0, m9
+ psubw m0, m9
+
+ paddw m5, m10, m7
+ psubw m10, m7
+
+ ; stage 4
+ punpcklwd m9, m5, m4
+ punpckhwd m5, m4
+ pmaddwd m7, m9, [GLOBAL(pw_16069_3196)]
+ pmaddwd m9, [GLOBAL(pw_3196_m16069)]
+ pmaddwd m4, m5, [GLOBAL(pw_16069_3196)]
+ pmaddwd m5, [GLOBAL(pw_3196_m16069)]
+ paddd m7, m8
+ paddd m9, m8
+ paddd m4, m8
+ paddd m5, m8
+ psrad m7, 14
+ psrad m9, 14
+ psrad m4, 14
+ psrad m5, 14
+ packssdw m7, m4
+ packssdw m9, m5
+
+ punpcklwd m4, m10, m0
+ punpckhwd m10, m0
+ pmaddwd m5, m4, [GLOBAL(pw_9102_13623)]
+ pmaddwd m4, [GLOBAL(pw_13623_m9102)]
+ pmaddwd m0, m10, [GLOBAL(pw_9102_13623)]
+ pmaddwd m10, [GLOBAL(pw_13623_m9102)]
+ paddd m5, m8
+ paddd m4, m8
+ paddd m0, m8
+ paddd m10, m8
+ psrad m5, 14
+ psrad m4, 14
+ psrad m0, 14
+ psrad m10, 14
+ packssdw m5, m0
+ packssdw m4, m10
+
+ ; transpose
+ ; stage 1
+ punpcklwd m0, m2, m7
+ punpcklwd m10, m1, m4
+ punpckhwd m2, m7
+ punpckhwd m1, m4
+ punpcklwd m7, m6, m5
+ punpcklwd m4, m3, m9
+ punpckhwd m6, m5
+ punpckhwd m3, m9
+
+ ; stage 2
+ punpckldq m5, m0, m10
+ punpckldq m9, m2, m1
+ punpckhdq m0, m10
+ punpckhdq m2, m1
+ punpckldq m10, m7, m4
+ punpckldq m1, m6, m3
+ punpckhdq m7, m4
+ punpckhdq m6, m3
+
+ ; stage 3
+ punpcklqdq m4, m5, m10
+ punpckhqdq m5, m10
+ punpcklqdq m3, m0, m7
+ punpckhqdq m0, m7
+ punpcklqdq m10, m9, m1
+ punpckhqdq m9, m1
+ punpcklqdq m7, m2, m6
+ punpckhqdq m2, m6
+
+ psraw m1, m4, 15
+ psraw m6, m5, 15
+ psraw m8, m3, 15
+ psraw m11, m0, 15
+
+ psubw m4, m1
+ psubw m5, m6
+ psubw m3, m8
+ psubw m0, m11
+
+ psraw m4, 1
+ psraw m5, 1
+ psraw m3, 1
+ psraw m0, 1
+
+ psraw m1, m10, 15
+ psraw m6, m9, 15
+ psraw m8, m7, 15
+ psraw m11, m2, 15
+
+ psubw m10, m1
+ psubw m9, m6
+ psubw m7, m8
+ psubw m2, m11
+
+ psraw m10, 1
+ psraw m9, 1
+ psraw m7, 1
+ psraw m2, 1
+
+ STORE_OUTPUT 0, 4
+ STORE_OUTPUT 8, 5
+ STORE_OUTPUT 16, 3
+ STORE_OUTPUT 24, 0
+ STORE_OUTPUT 32, 10
+ STORE_OUTPUT 40, 9
+ STORE_OUTPUT 48, 7
+ STORE_OUTPUT 56, 2
+
+ RET
+%endif
diff --git a/third_party/aom/aom_dsp/x86/highbd_adaptive_quantize_avx2.c b/third_party/aom/aom_dsp/x86/highbd_adaptive_quantize_avx2.c
new file mode 100644
index 0000000000..05c87bcff9
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_adaptive_quantize_avx2.c
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/quantize.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+static INLINE void highbd_load_b_values_avx2(
+ const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr,
+ __m256i *round, const int16_t *quant_ptr, __m256i *quant,
+ const int16_t *dequant_ptr, __m256i *dequant, const int16_t *shift_ptr,
+ __m256i *shift) {
+ *zbin = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)zbin_ptr));
+ *zbin = _mm256_sub_epi32(*zbin, _mm256_set1_epi32(1));
+ *round = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)round_ptr));
+ *quant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_ptr));
+ *dequant =
+ _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)dequant_ptr));
+ *shift = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)shift_ptr));
+}
+
+static INLINE void highbd_update_mask1_avx2(__m256i *cmp_mask,
+ const int16_t *iscan_ptr,
+ int *is_found, __m256i *mask) {
+ __m256i temp_mask = _mm256_setzero_si256();
+ if (_mm256_movemask_epi8(*cmp_mask)) {
+ __m256i iscan = _mm256_loadu_si256((const __m256i *)(iscan_ptr));
+ temp_mask = _mm256_and_si256(*cmp_mask, iscan);
+ *is_found = 1;
+ }
+ *mask = _mm256_max_epi16(temp_mask, *mask);
+}
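+
+// highbd_update_mask1_avx2() keeps, per 16-bit lane, the largest iscan
+// position whose coefficient passed the compare; that running maximum is
+// later reduced across lanes to bound the number of potentially non-zero
+// coefficients.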
+
+static INLINE void highbd_update_mask0_avx2(__m256i *qcoeff0, __m256i *qcoeff1,
+ __m256i *threshold,
+ const int16_t *iscan_ptr,
+ int *is_found, __m256i *mask) {
+ __m256i coeff[2], cmp_mask0, cmp_mask1;
+ coeff[0] = _mm256_slli_epi32(*qcoeff0, AOM_QM_BITS);
+ cmp_mask0 = _mm256_cmpgt_epi32(coeff[0], threshold[0]);
+ coeff[1] = _mm256_slli_epi32(*qcoeff1, AOM_QM_BITS);
+ cmp_mask1 = _mm256_cmpgt_epi32(coeff[1], threshold[1]);
+ cmp_mask0 =
+ _mm256_permute4x64_epi64(_mm256_packs_epi32(cmp_mask0, cmp_mask1), 0xd8);
+ highbd_update_mask1_avx2(&cmp_mask0, iscan_ptr, is_found, mask);
+}
+
+static INLINE void highbd_mul_shift_avx2(const __m256i *x, const __m256i *y,
+ __m256i *p, const int shift) {
+ __m256i prod_lo = _mm256_mul_epi32(*x, *y);
+ __m256i prod_hi = _mm256_srli_epi64(*x, 32);
+ const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
+ prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
+
+ prod_lo = _mm256_srli_epi64(prod_lo, shift);
+ prod_hi = _mm256_srli_epi64(prod_hi, shift);
+
+ prod_hi = _mm256_slli_epi64(prod_hi, 32);
+ *p = _mm256_blend_epi32(prod_lo, prod_hi, 0xaa);
+}
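+
+// highbd_mul_shift_avx2() emulates eight lanes of (x * y) >> shift using
+// 64-bit intermediates: even and odd 32-bit lanes are multiplied separately
+// with _mm256_mul_epi32, shifted, and the low 32 bits of each product are
+// blended back into a single vector.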
+
+static INLINE void highbd_calculate_qcoeff_avx2(__m256i *coeff,
+ const __m256i *round,
+ const __m256i *quant,
+ const __m256i *shift,
+ const int *log_scale) {
+ __m256i tmp, qcoeff;
+ qcoeff = _mm256_add_epi32(*coeff, *round);
+ highbd_mul_shift_avx2(&qcoeff, quant, &tmp, 16);
+ qcoeff = _mm256_add_epi32(tmp, qcoeff);
+ highbd_mul_shift_avx2(&qcoeff, shift, coeff, 16 - *log_scale);
+}
+
+static INLINE __m256i highbd_calculate_dqcoeff_avx2(__m256i qcoeff,
+ __m256i dequant) {
+ return _mm256_mullo_epi32(qcoeff, dequant);
+}
+
+static INLINE __m256i highbd_calculate_dqcoeff_log_scale_avx2(
+ __m256i qcoeff, __m256i dequant, const int log_scale) {
+ __m256i abs_coeff = _mm256_abs_epi32(qcoeff);
+ highbd_mul_shift_avx2(&abs_coeff, &dequant, &abs_coeff, log_scale);
+ return _mm256_sign_epi32(abs_coeff, qcoeff);
+}
+
+static INLINE void highbd_store_coefficients_avx2(__m256i coeff0,
+ __m256i coeff1,
+ tran_low_t *coeff_ptr) {
+ _mm256_store_si256((__m256i *)(coeff_ptr), coeff0);
+ _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff1);
+}
+
+void aom_highbd_quantize_b_adaptive_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int index = 16;
+ int non_zero_count = 0;
+ int non_zero_count_prescan_add_zero = 0;
+ int is_found0 = 0, is_found1 = 0;
+ int eob = -1;
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i zbin, round, quant, dequant, shift;
+ __m256i coeff0, qcoeff0, coeff1, qcoeff1;
+ __m256i cmp_mask, mask0 = zero, mask1 = zero;
+ __m128i temp_mask0, temp_mask1;
+ int prescan_add[2];
+ int thresh[2];
+ const int log_scale = 0;
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ for (int i = 0; i < 2; ++i) {
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+ thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1;
+ }
+ __m256i threshold[2];
+ threshold[0] = _mm256_set1_epi32(thresh[0]);
+ threshold[1] = _mm256_set1_epi32(thresh[1]);
+ threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe);
+
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;
+#endif
+
+ // Setup global values.
+ highbd_load_b_values_avx2(zbin_ptr, &zbin, round_ptr, &round, quant_ptr,
+ &quant, dequant_ptr, &dequant, quant_shift_ptr,
+ &shift);
+
+ // Do DC and first 15 AC.
+ coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr));
+ qcoeff0 = _mm256_abs_epi32(coeff0);
+ coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + 8));
+ qcoeff1 = _mm256_abs_epi32(coeff1);
+ highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0,
+ &mask0);
+ __m256i temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin);
+ zbin = _mm256_unpackhi_epi64(zbin, zbin);
+ __m256i temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin);
+ cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8);
+ highbd_update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1);
+ threshold[0] = threshold[1];
+ if (_mm256_movemask_epi8(cmp_mask) == 0) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr), zero);
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero);
+ round = _mm256_unpackhi_epi64(round, round);
+ quant = _mm256_unpackhi_epi64(quant, quant);
+ shift = _mm256_unpackhi_epi64(shift, shift);
+ dequant = _mm256_unpackhi_epi64(dequant, dequant);
+ } else {
+ highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale);
+ round = _mm256_unpackhi_epi64(round, round);
+ quant = _mm256_unpackhi_epi64(quant, quant);
+ shift = _mm256_unpackhi_epi64(shift, shift);
+ highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale);
+ // Reinsert signs
+ qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0);
+ qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1);
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm256_and_si256(qcoeff0, temp0);
+ qcoeff1 = _mm256_and_si256(qcoeff1, temp1);
+ highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr);
+ coeff0 = highbd_calculate_dqcoeff_avx2(qcoeff0, dequant);
+ dequant = _mm256_unpackhi_epi64(dequant, dequant);
+ coeff1 = highbd_calculate_dqcoeff_avx2(qcoeff1, dequant);
+ highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr);
+ }
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr + index));
+ qcoeff0 = _mm256_abs_epi32(coeff0);
+ coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + index + 8));
+ qcoeff1 = _mm256_abs_epi32(coeff1);
+ highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan + index,
+ &is_found0, &mask0);
+ temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin);
+ temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin);
+ cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8);
+ highbd_update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1);
+ if (_mm256_movemask_epi8(cmp_mask) == 0) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero);
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero);
+ index += 16;
+ continue;
+ }
+ highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale);
+ highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale);
+ qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0);
+ qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1);
+ qcoeff0 = _mm256_and_si256(qcoeff0, temp0);
+ qcoeff1 = _mm256_and_si256(qcoeff1, temp1);
+ highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr + index);
+ coeff0 = highbd_calculate_dqcoeff_avx2(qcoeff0, dequant);
+ coeff1 = highbd_calculate_dqcoeff_avx2(qcoeff1, dequant);
+ highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr + index);
+ index += 16;
+ }
+ if (is_found0) {
+ temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0),
+ _mm256_extracti128_si256(mask0, 1));
+ non_zero_count = calculate_non_zero_count(temp_mask0);
+ }
+ if (is_found1) {
+ temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1),
+ _mm256_extracti128_si256(mask1, 1));
+ non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1);
+ }
+
+ for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+ const int rc = scan[i];
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+
+ for (int i = non_zero_count - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ if (qcoeff_ptr[rc]) {
+ eob = i;
+ break;
+ }
+ }
+
+ *eob_ptr = eob + 1;
+#if SKIP_EOB_FACTOR_ADJUST
+ // TODO(Aniket): Experiment with an intrinsics implementation of the
+ // following loop, combining it with the quantization loop above.
+ for (int i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int qcoeff = qcoeff_ptr[rc];
+ if (qcoeff) {
+ first = i;
+ break;
+ }
+ }
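+ // If the block has exactly one nonzero coefficient (first == eob) and its
+ // magnitude is 1, re-check it against a stricter pre-scan threshold and
+ // zero the whole block when it falls below.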
+ if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
+ const int rc = scan[(*eob_ptr - 1)];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const int coeff = coeff_ptr[rc] * wt;
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (abs_coeff <
+ (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ *eob_ptr = 0;
+ }
+ }
+ }
+#endif
+}
+
+void aom_highbd_quantize_b_32x32_adaptive_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int index = 16;
+ int non_zero_count = 0;
+ int non_zero_count_prescan_add_zero = 0;
+ int is_found0 = 0, is_found1 = 0;
+ int eob = -1;
+ const int log_scale = 1;
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i zbin, round, quant, dequant, shift;
+ __m256i coeff0, qcoeff0, coeff1, qcoeff1;
+ __m256i cmp_mask, mask0 = zero, mask1 = zero;
+ __m128i temp_mask0, temp_mask1;
+ const __m256i one = _mm256_set1_epi32(1);
+ const __m256i log_scale_vec = _mm256_set1_epi32(log_scale);
+ int prescan_add[2];
+ int thresh[2];
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ for (int i = 0; i < 2; ++i) {
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+ thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
+ }
+ __m256i threshold[2];
+ threshold[0] = _mm256_set1_epi32(thresh[0]);
+ threshold[1] = _mm256_set1_epi32(thresh[1]);
+ threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe);
+
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;
+#endif
+
+ // Setup global values.
+ zbin = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)zbin_ptr));
+ round = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)round_ptr));
+ quant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_ptr));
+ dequant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)dequant_ptr));
+ shift =
+ _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_shift_ptr));
+
+ // Shift with rounding.
+ zbin = _mm256_add_epi32(zbin, log_scale_vec);
+ round = _mm256_add_epi32(round, log_scale_vec);
+ zbin = _mm256_srli_epi32(zbin, log_scale);
+ round = _mm256_srli_epi32(round, log_scale);
+ zbin = _mm256_sub_epi32(zbin, one);
+
+ // Do DC and first 15 AC.
+ coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr));
+ qcoeff0 = _mm256_abs_epi32(coeff0);
+ coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + 8));
+ qcoeff1 = _mm256_abs_epi32(coeff1);
+ highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0,
+ &mask0);
+ __m256i temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin);
+ zbin = _mm256_permute2x128_si256(zbin, zbin, 0x11);
+ __m256i temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin);
+ cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8);
+ highbd_update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1);
+ threshold[0] = threshold[1];
+ if (_mm256_movemask_epi8(cmp_mask) == 0) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr), zero);
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero);
+ round = _mm256_permute2x128_si256(round, round, 0x11);
+ quant = _mm256_permute2x128_si256(quant, quant, 0x11);
+ shift = _mm256_permute2x128_si256(shift, shift, 0x11);
+ dequant = _mm256_permute2x128_si256(dequant, dequant, 0x11);
+ } else {
+ highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale);
+ round = _mm256_permute2x128_si256(round, round, 0x11);
+ quant = _mm256_permute2x128_si256(quant, quant, 0x11);
+ shift = _mm256_permute2x128_si256(shift, shift, 0x11);
+ highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale);
+ // Reinsert signs
+ qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0);
+ qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1);
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm256_and_si256(qcoeff0, temp0);
+ qcoeff1 = _mm256_and_si256(qcoeff1, temp1);
+ highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr);
+ coeff0 =
+ highbd_calculate_dqcoeff_log_scale_avx2(qcoeff0, dequant, log_scale);
+ dequant = _mm256_permute2x128_si256(dequant, dequant, 0x11);
+ coeff1 =
+ highbd_calculate_dqcoeff_log_scale_avx2(qcoeff1, dequant, log_scale);
+ highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr);
+ }
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr + index));
+ qcoeff0 = _mm256_abs_epi32(coeff0);
+ coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + index + 8));
+ qcoeff1 = _mm256_abs_epi32(coeff1);
+ highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan + index,
+ &is_found0, &mask0);
+ temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin);
+ temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin);
+ cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8);
+ highbd_update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1);
+ if (_mm256_movemask_epi8(cmp_mask) == 0) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero);
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero);
+ index += 16;
+ continue;
+ }
+ highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale);
+ highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale);
+ qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0);
+ qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1);
+ qcoeff0 = _mm256_and_si256(qcoeff0, temp0);
+ qcoeff1 = _mm256_and_si256(qcoeff1, temp1);
+ highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr + index);
+ coeff0 =
+ highbd_calculate_dqcoeff_log_scale_avx2(qcoeff0, dequant, log_scale);
+ coeff1 =
+ highbd_calculate_dqcoeff_log_scale_avx2(qcoeff1, dequant, log_scale);
+ highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr + index);
+ index += 16;
+ }
+ if (is_found0) {
+ temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0),
+ _mm256_extracti128_si256(mask0, 1));
+ non_zero_count = calculate_non_zero_count(temp_mask0);
+ }
+ if (is_found1) {
+ temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1),
+ _mm256_extracti128_si256(mask1, 1));
+ non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1);
+ }
+
+ for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+ const int rc = scan[i];
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+
+ for (int i = non_zero_count - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ if (qcoeff_ptr[rc]) {
+ eob = i;
+ break;
+ }
+ }
+
+ *eob_ptr = eob + 1;
+#if SKIP_EOB_FACTOR_ADJUST
+ // TODO(Aniket): Experiment with an intrinsics implementation of the
+ // following loop, combining it with the quantization loop above.
+ for (int i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int qcoeff = qcoeff_ptr[rc];
+ if (qcoeff) {
+ first = i;
+ break;
+ }
+ }
+ if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
+ const int rc = scan[(*eob_ptr - 1)];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const int coeff = coeff_ptr[rc] * wt;
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ *eob_ptr = 0;
+ }
+ }
+ }
+#endif
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_adaptive_quantize_sse2.c b/third_party/aom/aom_dsp/x86/highbd_adaptive_quantize_sse2.c
new file mode 100644
index 0000000000..ae31116e9d
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_adaptive_quantize_sse2.c
@@ -0,0 +1,732 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/quantize.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+static INLINE __m128i highbd_invert_sign_64bit_sse2(__m128i a, __m128i sign) {
+ a = _mm_xor_si128(a, sign);
+ return _mm_sub_epi64(a, sign);
+}
+
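+// Multiplies each 32-bit lane of *x by the matching lane of *y using 64-bit
+// intermediates, shifts every product right by `shift` and packs the low
+// 32 bits of each result back into *p. SSE2 has no signed 32x32 multiply, so
+// the magnitude of *y is multiplied with _mm_mul_epu32 and its sign restored
+// on the 64-bit products (callers pass a non-negative *x).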
+static INLINE void highbd_mul_shift_sse2(const __m128i *x, const __m128i *y,
+ __m128i *p, const int shift) {
+ __m128i sign = _mm_srai_epi32(*y, 31);
+ __m128i sign_lo = _mm_unpacklo_epi32(sign, sign);
+ __m128i sign_hi = _mm_unpackhi_epi32(sign, sign);
+ __m128i abs_y = invert_sign_32_sse2(*y, sign);
+ __m128i prod_lo = _mm_mul_epu32(*x, abs_y);
+ __m128i prod_hi = _mm_srli_epi64(*x, 32);
+ const __m128i mult_hi = _mm_srli_epi64(abs_y, 32);
+ prod_hi = _mm_mul_epu32(prod_hi, mult_hi);
+ prod_lo = highbd_invert_sign_64bit_sse2(prod_lo, sign_lo);
+ prod_hi = highbd_invert_sign_64bit_sse2(prod_hi, sign_hi);
+
+ prod_lo = _mm_srli_epi64(prod_lo, shift);
+ const __m128i mask = _mm_set_epi32(0, -1, 0, -1);
+ prod_lo = _mm_and_si128(prod_lo, mask);
+ prod_hi = _mm_srli_epi64(prod_hi, shift);
+
+ prod_hi = _mm_slli_epi64(prod_hi, 32);
+ *p = _mm_or_si128(prod_lo, prod_hi);
+}
+
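+// Quantizes the magnitudes in place: with t = |coeff| + round, *coeff becomes
+// ((((t * quant) >> 16) + t) * shift) >> (16 - log_scale).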
+static INLINE void highbd_calculate_qcoeff(__m128i *coeff, const __m128i *round,
+ const __m128i *quant,
+ const __m128i *shift,
+ const int *log_scale) {
+ __m128i tmp, qcoeff;
+ qcoeff = _mm_add_epi32(*coeff, *round);
+ highbd_mul_shift_sse2(&qcoeff, quant, &tmp, 16);
+ qcoeff = _mm_add_epi32(tmp, qcoeff);
+ highbd_mul_shift_sse2(&qcoeff, shift, coeff, 16 - *log_scale);
+}
+
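+// Tracks, per 16-bit lane, the largest iscan value whose coefficient passed
+// the compare in *cmp_mask0; *is_found is set once any coefficient passes.
+// The accumulated maximum is later reduced to a coefficient count.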
+static INLINE void highbd_update_mask1(__m128i *cmp_mask0,
+ const int16_t *iscan_ptr, int *is_found,
+ __m128i *mask) {
+ __m128i temp_mask = _mm_setzero_si128();
+ if (_mm_movemask_epi8(*cmp_mask0)) {
+ __m128i iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr));
+ __m128i mask0 = _mm_and_si128(*cmp_mask0, iscan0);
+ temp_mask = mask0;
+ *is_found = 1;
+ }
+ *mask = _mm_max_epi16(temp_mask, *mask);
+}
+
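+// Pre-scan check: compares |coeff| << AOM_QM_BITS against the pre-computed
+// thresholds (zbin * wt + prescan_add - 1 in the callers) and folds the
+// result into the running mask via highbd_update_mask1().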
+static INLINE void highbd_update_mask0(__m128i *qcoeff0, __m128i *qcoeff1,
+ __m128i *threshold,
+ const int16_t *iscan_ptr, int *is_found,
+ __m128i *mask) {
+ __m128i coeff[2], cmp_mask0, cmp_mask1;
+
+ coeff[0] = _mm_slli_epi32(*qcoeff0, AOM_QM_BITS);
+ cmp_mask0 = _mm_cmpgt_epi32(coeff[0], threshold[0]);
+ coeff[1] = _mm_slli_epi32(*qcoeff1, AOM_QM_BITS);
+ cmp_mask1 = _mm_cmpgt_epi32(coeff[1], threshold[1]);
+
+ cmp_mask0 = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+
+ highbd_update_mask1(&cmp_mask0, iscan_ptr, is_found, mask);
+}
+
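+// Dequantizes: (|qcoeff| * dequant) >> log_scale, with the coefficient sign
+// restored afterwards.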
+static INLINE __m128i highbd_calculate_dqcoeff(__m128i qcoeff, __m128i dequant,
+ const int log_scale) {
+ __m128i coeff_sign = _mm_srai_epi32(qcoeff, 31);
+ __m128i abs_coeff = invert_sign_32_sse2(qcoeff, coeff_sign);
+ highbd_mul_shift_sse2(&abs_coeff, &dequant, &abs_coeff, log_scale);
+ return invert_sign_32_sse2(abs_coeff, coeff_sign);
+}
+
+void aom_highbd_quantize_b_adaptive_sse2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int index = 8;
+ const int log_scale = 0;
+ int non_zero_count = 0;
+ int non_zero_count_prescan_add_zero = 0;
+ int is_found0 = 0, is_found1 = 0;
+ int eob = -1;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi32(1);
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1, cmp_mask;
+ __m128i all_zero;
+ __m128i mask0 = zero, mask1 = zero;
+
+ int prescan_add[2];
+ int thresh[4];
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ for (int i = 0; i < 2; ++i) {
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+ thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1;
+ }
+ thresh[2] = thresh[3] = thresh[1];
+ __m128i threshold[2];
+ threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
+ threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
+
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;
+#endif
+ // Setup global values.
+ zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+
+ __m128i zbin_sign = _mm_srai_epi16(zbin, 15);
+ __m128i round_sign = _mm_srai_epi16(round, 15);
+ __m128i quant_sign = _mm_srai_epi16(quant, 15);
+ __m128i dequant_sign = _mm_srai_epi16(dequant, 15);
+ __m128i shift_sign = _mm_srai_epi16(shift, 15);
+
+ zbin = _mm_unpacklo_epi16(zbin, zbin_sign);
+ round = _mm_unpacklo_epi16(round, round_sign);
+ quant = _mm_unpacklo_epi16(quant, quant_sign);
+ dequant = _mm_unpacklo_epi16(dequant, dequant_sign);
+ shift = _mm_unpacklo_epi16(shift, shift_sign);
+ zbin = _mm_sub_epi32(zbin, one);
+
+ // Do DC and first 7 AC.
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr));
+ coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4));
+
+ coeff0_sign = _mm_srai_epi32(coeff0, 31);
+ coeff1_sign = _mm_srai_epi32(coeff1, 31);
+ qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
+
+ highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
+ cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+ highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1);
+
+ threshold[0] = threshold[1];
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
+
+ // Reinsert signs
+ qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1);
+
+ coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1);
+ }
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index));
+ coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4));
+
+ coeff0_sign = _mm_srai_epi32(coeff0, 31);
+ coeff1_sign = _mm_srai_epi32(coeff1, 31);
+ qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
+
+ highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index,
+ &is_found0, &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
+ cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+ highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+ index += 8;
+ continue;
+ }
+ highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
+ highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
+
+ qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1);
+
+ coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
+ coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
+
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1);
+
+ index += 8;
+ }
+ if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
+ if (is_found1)
+ non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
+
+ for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+ const int rc = scan[i];
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+
+ for (int i = non_zero_count - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ if (qcoeff_ptr[rc]) {
+ eob = i;
+ break;
+ }
+ }
+
+ *eob_ptr = eob + 1;
+#if SKIP_EOB_FACTOR_ADJUST
+ // TODO(Aniket): Experiment with an intrinsics implementation of the
+ // following loop, combining it with the quantization loop above.
+ for (int i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int qcoeff = qcoeff_ptr[rc];
+ if (qcoeff) {
+ first = i;
+ break;
+ }
+ }
+ if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
+ const int rc = scan[(*eob_ptr - 1)];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const int coeff = coeff_ptr[rc] * wt;
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (abs_coeff <
+ (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ *eob_ptr = 0;
+ }
+ }
+ }
+#endif
+}
+
+void aom_highbd_quantize_b_32x32_adaptive_sse2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int index = 8;
+ const int log_scale = 1;
+ int non_zero_count = 0;
+ int non_zero_count_prescan_add_zero = 0;
+ int is_found0 = 0, is_found1 = 0;
+ int eob = -1;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi32(1);
+ const __m128i log_scale_vec = _mm_set1_epi32(log_scale);
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1, cmp_mask;
+ __m128i all_zero;
+ __m128i mask0 = zero, mask1 = zero;
+
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+ int prescan_add[2];
+ int thresh[4];
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ for (int i = 0; i < 2; ++i) {
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+ thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
+ }
+ thresh[2] = thresh[3] = thresh[1];
+ __m128i threshold[2];
+ threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
+ threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
+
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;
+#endif
+ // Setup global values.
+ zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+
+ __m128i zbin_sign = _mm_srai_epi16(zbin, 15);
+ __m128i round_sign = _mm_srai_epi16(round, 15);
+ __m128i quant_sign = _mm_srai_epi16(quant, 15);
+ __m128i dequant_sign = _mm_srai_epi16(dequant, 15);
+ __m128i shift_sign = _mm_srai_epi16(shift, 15);
+
+ zbin = _mm_unpacklo_epi16(zbin, zbin_sign);
+ round = _mm_unpacklo_epi16(round, round_sign);
+ quant = _mm_unpacklo_epi16(quant, quant_sign);
+ dequant = _mm_unpacklo_epi16(dequant, dequant_sign);
+ shift = _mm_unpacklo_epi16(shift, shift_sign);
+
+ // Shift with rounding.
+ zbin = _mm_add_epi32(zbin, log_scale_vec);
+ round = _mm_add_epi32(round, log_scale_vec);
+ zbin = _mm_srli_epi32(zbin, log_scale);
+ round = _mm_srli_epi32(round, log_scale);
+ zbin = _mm_sub_epi32(zbin, one);
+
+ // Do DC and first 7 AC.
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr));
+ coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4));
+
+ coeff0_sign = _mm_srai_epi32(coeff0, 31);
+ coeff1_sign = _mm_srai_epi32(coeff1, 31);
+ qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
+
+ highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
+ cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+ highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1);
+
+ threshold[0] = threshold[1];
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
+
+ // Reinsert signs
+ qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1);
+
+ coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1);
+ }
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index));
+ coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4));
+
+ coeff0_sign = _mm_srai_epi32(coeff0, 31);
+ coeff1_sign = _mm_srai_epi32(coeff1, 31);
+ qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
+
+ highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index,
+ &is_found0, &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
+ cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+ highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+ index += 8;
+ continue;
+ }
+ highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
+ highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
+
+ qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1);
+
+ coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
+ coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
+
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1);
+
+ index += 8;
+ }
+ if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
+ if (is_found1)
+ non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
+
+ for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+ const int rc = scan[i];
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+
+ for (int i = non_zero_count - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ if (qcoeff_ptr[rc]) {
+ eob = i;
+ break;
+ }
+ }
+
+ *eob_ptr = eob + 1;
+#if SKIP_EOB_FACTOR_ADJUST
+ // TODO(Aniket): Experiment with an intrinsics implementation of the
+ // following loop, combining it with the quantization loop above.
+ for (int i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int qcoeff = qcoeff_ptr[rc];
+ if (qcoeff) {
+ first = i;
+ break;
+ }
+ }
+ if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
+ const int rc = scan[(*eob_ptr - 1)];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const int coeff = coeff_ptr[rc] * wt;
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ *eob_ptr = 0;
+ }
+ }
+ }
+#endif
+}
+
+void aom_highbd_quantize_b_64x64_adaptive_sse2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int index = 8;
+ const int log_scale = 2;
+ int non_zero_count = 0;
+ int non_zero_count_prescan_add_zero = 0;
+ int is_found0 = 0, is_found1 = 0;
+ int eob = -1;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi32(1);
+ const __m128i log_scale_vec = _mm_set1_epi32(log_scale);
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1, cmp_mask;
+ __m128i all_zero;
+ __m128i mask0 = zero, mask1 = zero;
+
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+ int prescan_add[2];
+ int thresh[4];
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ for (int i = 0; i < 2; ++i) {
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+ thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
+ }
+ thresh[2] = thresh[3] = thresh[1];
+ __m128i threshold[2];
+ threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
+ threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
+
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;
+#endif
+ // Setup global values.
+ zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+
+ __m128i zbin_sign = _mm_srai_epi16(zbin, 15);
+ __m128i round_sign = _mm_srai_epi16(round, 15);
+ __m128i quant_sign = _mm_srai_epi16(quant, 15);
+ __m128i dequant_sign = _mm_srai_epi16(dequant, 15);
+ __m128i shift_sign = _mm_srai_epi16(shift, 15);
+
+ zbin = _mm_unpacklo_epi16(zbin, zbin_sign);
+ round = _mm_unpacklo_epi16(round, round_sign);
+ quant = _mm_unpacklo_epi16(quant, quant_sign);
+ dequant = _mm_unpacklo_epi16(dequant, dequant_sign);
+ shift = _mm_unpacklo_epi16(shift, shift_sign);
+
+ // Shift with rounding.
+ zbin = _mm_add_epi32(zbin, log_scale_vec);
+ round = _mm_add_epi32(round, log_scale_vec);
+ zbin = _mm_srli_epi32(zbin, log_scale);
+ round = _mm_srli_epi32(round, log_scale);
+ zbin = _mm_sub_epi32(zbin, one);
+
+ // Do DC and first 7 AC.
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr));
+ coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4));
+
+ coeff0_sign = _mm_srai_epi32(coeff0, 31);
+ coeff1_sign = _mm_srai_epi32(coeff1, 31);
+ qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
+
+ highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
+ cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+ highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1);
+
+ threshold[0] = threshold[1];
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
+
+ // Reinsert signs
+ qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1);
+
+ coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1);
+ }
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index));
+ coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4));
+
+ coeff0_sign = _mm_srai_epi32(coeff0, 31);
+ coeff1_sign = _mm_srai_epi32(coeff1, 31);
+ qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
+
+ highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index,
+ &is_found0, &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
+ cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+ highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+ index += 8;
+ continue;
+ }
+ highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
+ highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
+
+ qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1);
+
+ coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
+ coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
+
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1);
+
+ index += 8;
+ }
+ if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
+ if (is_found1)
+ non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
+
+ for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+ const int rc = scan[i];
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+
+ for (int i = non_zero_count - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ if (qcoeff_ptr[rc]) {
+ eob = i;
+ break;
+ }
+ }
+
+ *eob_ptr = eob + 1;
+#if SKIP_EOB_FACTOR_ADJUST
+ // TODO(Aniket): Experiment with an intrinsics implementation of the
+ // following loop, combining it with the quantization loop above.
+ for (int i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int qcoeff = qcoeff_ptr[rc];
+ if (qcoeff) {
+ first = i;
+ break;
+ }
+ }
+ if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
+ const int rc = scan[(*eob_ptr - 1)];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const int coeff = coeff_ptr[rc] * wt;
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ *eob_ptr = 0;
+ }
+ }
+ }
+#endif
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
new file mode 100644
index 0000000000..11e45778c0
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
@@ -0,0 +1,1248 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+#include <string.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/convolve.h"
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// -----------------------------------------------------------------------------
+// Copy and average
+
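+// Byte-shuffle patterns used by the 4-tap horizontal filters below to gather,
+// within each 128-bit lane, the overlapping sample pairs that are multiplied
+// by filter taps (2,3) and (4,5).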
+static const uint8_t ip_shuffle_f2f3[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
+ 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
+ 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };
+static const uint8_t ip_shuffle_f4f5[32] = { 4, 5, 6, 7, 6, 7, 8, 9,
+ 8, 9, 10, 11, 10, 11, 12, 13,
+ 4, 5, 6, 7, 6, 7, 8, 9,
+ 8, 9, 10, 11, 10, 11, 12, 13 };
+
+void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const int subpel_x_qn,
+ ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_y_qn, int bd);
+
+void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_y_qn, int bd) {
+ if (filter_params_y->taps == 12) {
+ av1_highbd_convolve_y_sr_ssse3(src, src_stride, dst, dst_stride, w, h,
+ filter_params_y, subpel_y_qn, bd);
+ return;
+ }
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride;
+
+ __m256i s[8], coeffs_y[4];
+
+ const int bits = FILTER_BITS;
+
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
+ const __m256i clip_pixel =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m256i zero = _mm256_setzero_si256();
+
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
+
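+ // Process the frame in 8-wide columns. For each column keep a sliding
+ // window of input rows as interleaved row pairs (s01 ... s78) and filter
+ // two output rows per inner-loop iteration.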
+ for (j = 0; j < w; j += 8) {
+ const uint16_t *data = &src_ptr[j];
+ /* Vertical filter */
+ {
+ __m256i src6;
+ __m256i s01 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+ 0x20);
+ __m256i s12 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+ 0x20);
+ __m256i s23 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+ 0x20);
+ __m256i s34 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+ 0x20);
+ __m256i s45 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ 0x20);
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+ __m256i s56 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ src6, 0x20);
+
+ s[0] = _mm256_unpacklo_epi16(s01, s12);
+ s[1] = _mm256_unpacklo_epi16(s23, s34);
+ s[2] = _mm256_unpacklo_epi16(s45, s56);
+
+ s[4] = _mm256_unpackhi_epi16(s01, s12);
+ s[5] = _mm256_unpackhi_epi16(s23, s34);
+ s[6] = _mm256_unpackhi_epi16(s45, s56);
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+
+ const __m256i s67 = _mm256_permute2x128_si256(
+ src6,
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
+
+ const __m256i s78 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ src6, 0x20);
+
+ s[3] = _mm256_unpacklo_epi16(s67, s78);
+ s[7] = _mm256_unpackhi_epi16(s67, s78);
+
+ const __m256i res_a = convolve(s, coeffs_y);
+
+ __m256i res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);
+
+ if (w - j > 4) {
+ const __m256i res_b = convolve(s + 4, coeffs_y);
+ __m256i res_b_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_b, round_const_bits), round_shift_bits);
+
+ __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
+ res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
+ res_16bit = _mm256_max_epi16(res_16bit, zero);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res_16bit));
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res_16bit, 1));
+ } else if (w == 4) {
+ res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
+ res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
+ res_a_round = _mm256_max_epi16(res_a_round, zero);
+
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res_a_round));
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res_a_round, 1));
+ } else {
+ res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
+ res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
+ res_a_round = _mm256_max_epi16(res_a_round, zero);
+
+ xx_storel_32(&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res_a_round));
+ xx_storel_32(&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res_a_round, 1));
+ }
+
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+ }
+}
+
+void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const int subpel_x_qn,
+ ConvolveParams *conv_params, int bd) {
+ if (filter_params_x->taps == 12) {
+ av1_highbd_convolve_x_sr_ssse3(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, subpel_x_qn, conv_params,
+ bd);
+ return;
+ }
+ int i, j;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_horiz;
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+ __m256i s[4], coeffs_x[4];
+
+ const __m256i round_const_x =
+ _mm256_set1_epi32(((1 << conv_params->round_0) >> 1));
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+ const int bits = FILTER_BITS - conv_params->round_0;
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
+ const __m256i clip_pixel =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m256i zero = _mm256_setzero_si256();
+
+ assert(bits >= 0);
+ assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+ ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+
+ prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
+
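+ // Filter two rows per iteration: gather the even- and odd-indexed output
+ // pixels with byte-aligned shifts of the row pair, convolve each set, then
+ // interleave the results before clipping to the bit depth.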
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ for (i = 0; i < h; i += 2) {
+ const __m256i row0 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
+ __m256i row1 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
+
+ const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
+ const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
+
+ // even pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 0);
+ s[1] = _mm256_alignr_epi8(r1, r0, 4);
+ s[2] = _mm256_alignr_epi8(r1, r0, 8);
+ s[3] = _mm256_alignr_epi8(r1, r0, 12);
+
+ __m256i res_even = convolve(s, coeffs_x);
+ res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
+ round_shift_x);
+
+ // odd pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 2);
+ s[1] = _mm256_alignr_epi8(r1, r0, 6);
+ s[2] = _mm256_alignr_epi8(r1, r0, 10);
+ s[3] = _mm256_alignr_epi8(r1, r0, 14);
+
+ __m256i res_odd = convolve(s, coeffs_x);
+ res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
+ round_shift_x);
+
+ res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_bits),
+ round_shift_bits);
+ res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_bits),
+ round_shift_bits);
+
+ __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
+ __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
+
+ __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
+ res = _mm256_min_epi16(res, clip_pixel);
+ res = _mm256_max_epi16(res, zero);
+
+ if (w - j > 4) {
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res));
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res, 1));
+ } else if (w == 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res));
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res, 1));
+ } else {
+ xx_storel_32(&dst[i * dst_stride + j], _mm256_castsi256_si128(res));
+ xx_storel_32(&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res, 1));
+ }
+ }
+ }
+}
+
+#define CONV8_ROUNDING_BITS (7)
+
+// -----------------------------------------------------------------------------
+// Horizontal and vertical filtering
+
+static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
+ 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
+ 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };
+
+static const uint8_t signal_pattern_1[32] = { 4, 5, 6, 7, 6, 7, 8, 9,
+ 8, 9, 10, 11, 10, 11, 12, 13,
+ 4, 5, 6, 7, 6, 7, 8, 9,
+ 8, 9, 10, 11, 10, 11, 12, 13 };
+
+static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11,
+ 10, 11, 12, 13, 12, 13, 14, 15,
+ 6, 7, 8, 9, 8, 9, 10, 11,
+ 10, 11, 12, 13, 12, 13, 14, 15 };
+
+static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 };
+
+// -----------------------------------------------------------------------------
+// Horizontal Filtering
+
+static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) {
+ const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
+ const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0);
+ const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1);
+ const __m256i c = _mm256_permutevar8x32_epi32(*s, idx);
+
+ p[0] = _mm256_shuffle_epi8(*s, sf0); // x0x6
+ p[1] = _mm256_shuffle_epi8(*s, sf1); // x1x7
+ p[2] = _mm256_shuffle_epi8(c, sf0); // x2x4
+ p[3] = _mm256_shuffle_epi8(c, sf1); // x3x5
+}
+
+// Note:
+// Shared by the 8x2 and 16x1 block paths.
+static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1,
+ __m256i *x /*x[8]*/) {
+ __m256i pp[8];
+ pack_pixels(s0, pp);
+ pack_pixels(s1, &pp[4]);
+ x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20);
+ x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20);
+ x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20);
+ x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20);
+ x[4] = x[2];
+ x[5] = x[3];
+ x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31);
+ x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31);
+}
+
+static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) {
+ __m256i pp[8];
+ __m256i s0;
+ s0 = _mm256_loadu_si256((const __m256i *)src);
+ pack_pixels(&s0, pp);
+ x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30);
+ x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30);
+ x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30);
+ x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30);
+}
+
+static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride,
+ __m256i *x) {
+ __m256i s0, s1;
+ s0 = _mm256_loadu_si256((const __m256i *)src);
+ s1 = _mm256_loadu_si256((const __m256i *)(src + stride));
+ pack_16_pixels(&s0, &s1, x);
+}
+
+static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) {
+ __m256i s0, s1;
+ s0 = _mm256_loadu_si256((const __m256i *)src);
+ s1 = _mm256_loadu_si256((const __m256i *)(src + 8));
+ pack_16_pixels(&s0, &s1, x);
+}
+
+// Note:
+// Shared by horizontal and vertical filtering
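+// Broadcasts the 8-tap filter into four registers of duplicated tap pairs
+// ((0,1), (2,3), (4,5), (6,7)), each ready to feed _mm256_madd_epi16 with the
+// matching packed sample pairs.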
+static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) {
+ const __m128i h = _mm_loadu_si128((const __m128i *)filter);
+ const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
+ const __m256i p0 = _mm256_set1_epi32(0x03020100);
+ const __m256i p1 = _mm256_set1_epi32(0x07060504);
+ const __m256i p2 = _mm256_set1_epi32(0x0b0a0908);
+ const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c);
+ f[0] = _mm256_shuffle_epi8(hh, p0);
+ f[1] = _mm256_shuffle_epi8(hh, p1);
+ f[2] = _mm256_shuffle_epi8(hh, p2);
+ f[3] = _mm256_shuffle_epi8(hh, p3);
+}
+
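+// Broadcasts only the (2,3) and (4,5) tap pairs, which carry the non-zero
+// coefficients of a 4-tap filter stored in an 8-tap array.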
+static INLINE void pack_filters_4tap(const int16_t *filter,
+ __m256i *f /*f[4]*/) {
+ const __m128i h = _mm_loadu_si128((const __m128i *)filter);
+ const __m256i coeff = _mm256_broadcastsi128_si256(h);
+
+ // coeffs 2 3 2 3 2 3 2 3
+ f[0] = _mm256_shuffle_epi32(coeff, 0x55);
+ // coeffs 4 5 4 5 4 5 4 5
+ f[1] = _mm256_shuffle_epi32(coeff, 0xaa);
+}
+
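+// Sums the four tap-pair products with rounding. The two middle partial sums
+// are added as min() followed by max() (min(a,b) + max(a,b) == a + b), which
+// keeps the running total closer to zero; presumably this guards against
+// intermediate 32-bit overflow for extreme inputs.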
+static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/,
+ const __m256i *fil /*fil[4]*/,
+ __m256i *y) {
+ __m256i a, a0, a1;
+
+ a0 = _mm256_madd_epi16(fil[0], sig[0]);
+ a1 = _mm256_madd_epi16(fil[3], sig[3]);
+ a = _mm256_add_epi32(a0, a1);
+
+ a0 = _mm256_madd_epi16(fil[1], sig[1]);
+ a1 = _mm256_madd_epi16(fil[2], sig[2]);
+
+ {
+ const __m256i min = _mm256_min_epi32(a0, a1);
+ a = _mm256_add_epi32(a, min);
+ }
+ {
+ const __m256i max = _mm256_max_epi32(a0, a1);
+ a = _mm256_add_epi32(a, max);
+ }
+ {
+ const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ a = _mm256_add_epi32(a, rounding);
+ *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS);
+ }
+}
+
+static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask,
+ uint16_t *dst) {
+ const __m128i a0 = _mm256_castsi256_si128(*y);
+ const __m128i a1 = _mm256_extractf128_si256(*y, 1);
+ __m128i res = _mm_packus_epi32(a0, a1);
+ res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
+ _mm_storeu_si128((__m128i *)dst, res);
+}
+
+static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst,
+ ptrdiff_t pitch) {
+ __m256i a = _mm256_packus_epi32(*y0, *y1);
+ a = _mm256_min_epi16(a, *mask);
+ _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
+ _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
+}
+
+static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst) {
+ __m256i a = _mm256_packus_epi32(*y0, *y1);
+ a = _mm256_min_epi16(a, *mask);
+ _mm256_storeu_si256((__m256i *)dst, a);
+}
+
+static void aom_highbd_filter_block1d8_h8_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[8], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ src_ptr -= 3;
+ do {
+ pack_8x2_pixels(src_ptr, src_pitch, signal);
+ filter_8x1_pixels(signal, ff, &res0);
+ filter_8x1_pixels(&signal[4], ff, &res1);
+ store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ height -= 2;
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ } while (height > 1);
+
+ if (height > 0) {
+ pack_8x1_pixels(src_ptr, signal);
+ filter_8x1_pixels(signal, ff, &res0);
+ store_8x1_pixels(&res0, &max, dst_ptr);
+ }
+}
+
+static void aom_highbd_filter_block1d16_h8_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[8], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ src_ptr -= 3;
+ do {
+ pack_16x1_pixels(src_ptr, signal);
+ filter_8x1_pixels(signal, ff, &res0);
+ filter_8x1_pixels(&signal[4], ff, &res1);
+ store_16x1_pixels(&res0, &res1, &max, dst_ptr);
+ height -= 1;
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ } while (height > 0);
+}
+
+static void aom_highbd_filter_block1d4_h4_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ __m256i ff[2], s[2];
+ uint32_t i;
+ const __m256i clip_pixel =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m256i zero = _mm256_setzero_si256();
+
+ static const uint8_t shuffle_mask[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
+ 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
+ 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };
+
+ __m256i mask = _mm256_loadu_si256((__m256i *)shuffle_mask);
+ __m256i ip_mask_f2f3 = _mm256_loadu_si256((__m256i *)ip_shuffle_f2f3);
+ __m256i ip_mask_f4f5 = _mm256_loadu_si256((__m256i *)ip_shuffle_f4f5);
+
+ pack_filters_4tap(filter, ff);
+ src_ptr -= 3;
+ for (i = 0; i <= (height - 2); i += 2) {
+ __m256i row0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 2]));
+ __m256i row1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)&src_ptr[(i + 1) * src_pitch + 2]));
+
+ s[0] = _mm256_inserti128_si256(row0, _mm256_castsi256_si128(row1), 1);
+ s[1] = _mm256_alignr_epi8(s[0], s[0], 4);
+
+ s[0] = _mm256_shuffle_epi8(s[0], mask);
+ s[1] = _mm256_shuffle_epi8(s[1], mask);
+
+ __m256i res = convolve_4tap(s, ff);
+ res =
+ _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS);
+
+ res = _mm256_packs_epi32(res, res);
+ res = _mm256_min_epi16(res, clip_pixel);
+ res = _mm256_max_epi16(res, zero);
+
+ _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
+ _mm256_castsi256_si128(res));
+ _mm_storel_epi64((__m128i *)&dst_ptr[(i + 1) * dst_pitch],
+ _mm256_extracti128_si256(res, 1));
+ }
+ if (height % 2 != 0) {
+ i = height - 1;
+ const __m256i row0_0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 2]));
+ const __m256i row0_1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 6]));
+
+ const __m256i r0 =
+ _mm256_inserti128_si256(row0_0, _mm256_castsi256_si128(row0_1), 1);
+
+ s[0] = _mm256_shuffle_epi8(r0, ip_mask_f2f3);
+ s[1] = _mm256_shuffle_epi8(r0, ip_mask_f4f5);
+
+ __m256i res = convolve_4tap(s, ff);
+ res =
+ _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS);
+
+ res = _mm256_packs_epi32(res, res);
+ res = _mm256_min_epi16(res, clip_pixel);
+ res = _mm256_max_epi16(res, zero);
+
+ _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
+ _mm256_castsi256_si128(res));
+ }
+}
+
+static void aom_highbd_filter_block1d8_h4_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ __m256i ff[2], s[2];
+ uint32_t i = 0;
+ const __m256i clip_pixel =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m256i zero = _mm256_setzero_si256();
+
+ static const uint8_t shuffle_mask[32] = { 0, 1, 8, 9, 2, 3, 10, 11,
+ 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11,
+ 4, 5, 12, 13, 6, 7, 14, 15 };
+
+ __m256i mask = _mm256_loadu_si256((__m256i *)shuffle_mask);
+ __m256i ip_mask_f2f3 = _mm256_loadu_si256((__m256i *)ip_shuffle_f2f3);
+ __m256i ip_mask_f4f5 = _mm256_loadu_si256((__m256i *)ip_shuffle_f4f5);
+
+ pack_filters_4tap(filter, ff);
+ src_ptr -= 3;
+
+ /* Horizontal filter */
+
+ for (i = 0; i <= (height - 2); i += 2) {
+ const __m256i row0 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 2]);
+ __m256i row1 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_pitch + 2]);
+
+ const __m256i r0 =
+ _mm256_inserti128_si256(row0, _mm256_castsi256_si128(row1), 1);
+ const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
+
+ // even pixels
+ s[0] = r0;
+ s[1] = _mm256_alignr_epi8(r1, r0, 4);
+
+ __m256i res_even = convolve_4tap(s, ff);
+ res_even = _mm256_srai_epi32(_mm256_add_epi32(res_even, rounding),
+ CONV8_ROUNDING_BITS);
+
+ // odd pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 2);
+ s[1] = _mm256_alignr_epi8(r1, r0, 6);
+
+ __m256i res_odd = convolve_4tap(s, ff);
+ res_odd = _mm256_srai_epi32(_mm256_add_epi32(res_odd, rounding),
+ CONV8_ROUNDING_BITS);
+
+ __m256i res = _mm256_packs_epi32(res_even, res_odd);
+ res = _mm256_shuffle_epi8(res, mask);
+
+ res = _mm256_min_epi16(res, clip_pixel);
+ res = _mm256_max_epi16(res, zero);
+
+ _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch],
+ _mm256_castsi256_si128(res));
+ _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch],
+ _mm256_extracti128_si256(res, 1));
+ }
+
+ if (height % 2 != 0) {
+ i = height - 1;
+ const __m256i row0_0 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 2]);
+ const __m256i row0_1 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 6]);
+
+ const __m256i r0 =
+ _mm256_inserti128_si256(row0_0, _mm256_castsi256_si128(row0_1), 1);
+
+ s[0] = _mm256_shuffle_epi8(r0, ip_mask_f2f3);
+ s[1] = _mm256_shuffle_epi8(r0, ip_mask_f4f5);
+
+ __m256i res = convolve_4tap(s, ff);
+ res =
+ _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS);
+
+ res = _mm256_packs_epi32(res, res);
+ res = _mm256_min_epi16(res, clip_pixel);
+ res = _mm256_max_epi16(res, zero);
+
+ _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
+ _mm256_castsi256_si128(res));
+ _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch + 4],
+ _mm256_extracti128_si256(res, 1));
+ }
+}
+
+static void aom_highbd_filter_block1d16_h4_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ aom_highbd_filter_block1d8_h4_avx2(src_ptr, src_pitch, dst_ptr, dst_pitch,
+ height, filter, bd);
+ aom_highbd_filter_block1d8_h4_avx2(src_ptr + 8, src_pitch, dst_ptr + 8,
+ dst_pitch, height, filter, bd);
+}
+
+// -----------------------------------------------------------------------------
+// 2-tap horizontal filtering
+
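+// pack_2t_filter() broadcasts the two centre taps (coefficients 3 and 4 of
+// the 8-tap filter array, i.e. bytes 6-9 of the load) into every 32-bit
+// lane, so a single _mm256_madd_epi16 computes p0 * f3 + p1 * f4 per pixel.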
+static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) {
+ const __m128i h = _mm_loadu_si128((const __m128i *)filter);
+ const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
+ const __m256i p = _mm256_set1_epi32(0x09080706);
+ f[0] = _mm256_shuffle_epi8(hh, p);
+}
+
+// Can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels(); the only
+// difference is whether s0/s1 hold the first and second rows, or the first
+// 16 samples and the 16 samples starting 8 positions later.
+static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1,
+ __m256i *sig) {
+ const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
+ const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
+ __m256i x0 = _mm256_shuffle_epi8(*s0, sf2);
+ __m256i x1 = _mm256_shuffle_epi8(*s1, sf2);
+ __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx);
+ __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx);
+ r0 = _mm256_shuffle_epi8(r0, sf2);
+ r1 = _mm256_shuffle_epi8(r1, sf2);
+ sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20);
+ sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20);
+}
+
+static INLINE void pack_8x2_2t_pixels(const uint16_t *src,
+ const ptrdiff_t pitch, __m256i *sig) {
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
+ pack_16_2t_pixels(&r0, &r1, sig);
+}
+
+static INLINE void pack_16x1_2t_pixels(const uint16_t *src,
+ __m256i *sig /*sig[2]*/) {
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8));
+ pack_16_2t_pixels(&r0, &r1, sig);
+}
+
+static INLINE void pack_8x1_2t_pixels(const uint16_t *src,
+ __m256i *sig /*sig[2]*/) {
+ const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
+ const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
+ __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
+ __m256i x0 = _mm256_shuffle_epi8(r0, sf2);
+ r0 = _mm256_permutevar8x32_epi32(r0, idx);
+ r0 = _mm256_shuffle_epi8(r0, sf2);
+ sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20);
+}
+
+// shared by the 8x2 and 16x1 2-tap horizontal paths; operates on data packed
+// by pack_8x2_2t_pixels() or pack_16x1_2t_pixels()
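+// Each 32-bit product sum is rounded by adding half of the normalization
+// factor (1 << (CONV8_ROUNDING_BITS - 1)) before the arithmetic shift right,
+// i.e. round-half-up division by 1 << CONV8_ROUNDING_BITS.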
+static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0, __m256i *y1) {
+ const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ __m256i x0 = _mm256_madd_epi16(sig[0], *f);
+ __m256i x1 = _mm256_madd_epi16(sig[1], *f);
+ x0 = _mm256_add_epi32(x0, rounding);
+ x1 = _mm256_add_epi32(x1, rounding);
+ *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
+ *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS);
+}
+
+static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0) {
+ const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ __m256i x0 = _mm256_madd_epi16(sig[0], *f);
+ x0 = _mm256_add_epi32(x0, rounding);
+ *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
+}
+
+static void aom_highbd_filter_block1d8_h2_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[2], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff;
+ pack_2t_filter(filter, &ff);
+
+ src_ptr -= 3;
+ do {
+ pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
+ filter_16_2t_pixels(signal, &ff, &res0, &res1);
+ store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ height -= 2;
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ } while (height > 1);
+
+ if (height > 0) {
+ pack_8x1_2t_pixels(src_ptr, signal);
+ filter_8x1_2t_pixels(signal, &ff, &res0);
+ store_8x1_pixels(&res0, &max, dst_ptr);
+ }
+}
+
+static void aom_highbd_filter_block1d16_h2_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[2], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff;
+ pack_2t_filter(filter, &ff);
+
+ src_ptr -= 3;
+ do {
+ pack_16x1_2t_pixels(src_ptr, signal);
+ filter_16_2t_pixels(signal, &ff, &res0, &res1);
+ store_16x1_pixels(&res0, &res1, &max, dst_ptr);
+ height -= 1;
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ } while (height > 0);
+}
+
+// -----------------------------------------------------------------------------
+// Vertical Filtering
+
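+// The 8-tap vertical path keeps a sliding window of interleaved row pairs in
+// signal[]: pack_8x9_init() preloads rows 0-6, each loop iteration adds rows
+// 7 and 8 so that two output rows can be filtered, and update_pixels() then
+// slides the window down by two rows. The 16-wide variant mirrors this with
+// pack_16x9_init()/pack_16x9_pixels().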
+static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
+ __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src));
+ __m256i s1 =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch)));
+ __m256i s2 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 2 * pitch)));
+ __m256i s3 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 3 * pitch)));
+ __m256i s4 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 4 * pitch)));
+ __m256i s5 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 5 * pitch)));
+ __m256i s6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 6 * pitch)));
+
+ s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
+ s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1);
+ s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1);
+ s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1);
+ s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1);
+ s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1);
+
+ sig[0] = _mm256_unpacklo_epi16(s0, s1);
+ sig[4] = _mm256_unpackhi_epi16(s0, s1);
+ sig[1] = _mm256_unpacklo_epi16(s2, s3);
+ sig[5] = _mm256_unpackhi_epi16(s2, s3);
+ sig[2] = _mm256_unpacklo_epi16(s4, s5);
+ sig[6] = _mm256_unpackhi_epi16(s4, s5);
+ sig[8] = s6;
+}
+
+static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch,
+ __m256i *sig) {
+ // base + 7th row
+ __m256i s0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 7 * pitch)));
+ // base + 8th row
+ __m256i s1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 8 * pitch)));
+ __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1);
+ __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
+ sig[3] = _mm256_unpacklo_epi16(s2, s3);
+ sig[7] = _mm256_unpackhi_epi16(s2, s3);
+ sig[8] = s1;
+}
+
+static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0, __m256i *y1) {
+ filter_8x1_pixels(sig, f, y0);
+ filter_8x1_pixels(&sig[4], f, y1);
+}
+
+static INLINE void update_pixels(__m256i *sig) {
+ int i;
+ for (i = 0; i < 3; ++i) {
+ sig[i] = sig[i + 1];
+ sig[i + 4] = sig[i + 5];
+ }
+}
+
+static void aom_highbd_filter_block1d8_v8_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[9], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ pack_8x9_init(src_ptr, src_pitch, signal);
+
+ do {
+ pack_8x9_pixels(src_ptr, src_pitch, signal);
+
+ filter_8x9_pixels(signal, ff, &res0, &res1);
+ store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ update_pixels(signal);
+
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ height -= 2;
+ } while (height > 0);
+}
+
+static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
+ __m256i u0, u1, u2, u3;
+  // load rows 0-6
+ const __m256i s0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
+ const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch));
+ const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch));
+ const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch));
+ const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch));
+ const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch));
+
+ u0 = _mm256_permute2x128_si256(s0, s1, 0x20); // 0, 1 low
+ u1 = _mm256_permute2x128_si256(s0, s1, 0x31); // 0, 1 high
+
+ u2 = _mm256_permute2x128_si256(s1, s2, 0x20); // 1, 2 low
+ u3 = _mm256_permute2x128_si256(s1, s2, 0x31); // 1, 2 high
+
+ sig[0] = _mm256_unpacklo_epi16(u0, u2);
+ sig[4] = _mm256_unpackhi_epi16(u0, u2);
+
+ sig[8] = _mm256_unpacklo_epi16(u1, u3);
+ sig[12] = _mm256_unpackhi_epi16(u1, u3);
+
+ u0 = _mm256_permute2x128_si256(s2, s3, 0x20);
+ u1 = _mm256_permute2x128_si256(s2, s3, 0x31);
+
+ u2 = _mm256_permute2x128_si256(s3, s4, 0x20);
+ u3 = _mm256_permute2x128_si256(s3, s4, 0x31);
+
+ sig[1] = _mm256_unpacklo_epi16(u0, u2);
+ sig[5] = _mm256_unpackhi_epi16(u0, u2);
+
+ sig[9] = _mm256_unpacklo_epi16(u1, u3);
+ sig[13] = _mm256_unpackhi_epi16(u1, u3);
+
+ u0 = _mm256_permute2x128_si256(s4, s5, 0x20);
+ u1 = _mm256_permute2x128_si256(s4, s5, 0x31);
+
+ u2 = _mm256_permute2x128_si256(s5, s6, 0x20);
+ u3 = _mm256_permute2x128_si256(s5, s6, 0x31);
+
+ sig[2] = _mm256_unpacklo_epi16(u0, u2);
+ sig[6] = _mm256_unpackhi_epi16(u0, u2);
+
+ sig[10] = _mm256_unpacklo_epi16(u1, u3);
+ sig[14] = _mm256_unpackhi_epi16(u1, u3);
+
+ sig[16] = s6;
+}
+
+static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch,
+ __m256i *sig) {
+ // base + 7th row
+ const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch));
+ // base + 8th row
+ const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch));
+
+ __m256i u0, u1, u2, u3;
+ u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20);
+ u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31);
+
+ u2 = _mm256_permute2x128_si256(s7, s8, 0x20);
+ u3 = _mm256_permute2x128_si256(s7, s8, 0x31);
+
+ sig[3] = _mm256_unpacklo_epi16(u0, u2);
+ sig[7] = _mm256_unpackhi_epi16(u0, u2);
+
+ sig[11] = _mm256_unpacklo_epi16(u1, u3);
+ sig[15] = _mm256_unpackhi_epi16(u1, u3);
+
+ sig[16] = s8;
+}
+
+static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0, __m256i *y1) {
+ __m256i res[4];
+ int i;
+ for (i = 0; i < 4; ++i) {
+ filter_8x1_pixels(&sig[i << 2], f, &res[i]);
+ }
+
+ {
+ const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]);
+ const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]);
+ *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20);
+ *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31);
+ }
+}
+
+static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst,
+ ptrdiff_t pitch) {
+ __m256i p = _mm256_min_epi16(*y0, *mask);
+ _mm256_storeu_si256((__m256i *)dst, p);
+ p = _mm256_min_epi16(*y1, *mask);
+ _mm256_storeu_si256((__m256i *)(dst + pitch), p);
+}
+
+static void update_16x9_pixels(__m256i *sig) {
+ update_pixels(&sig[0]);
+ update_pixels(&sig[8]);
+}
+
+static void aom_highbd_filter_block1d16_v8_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[17], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ pack_16x9_init(src_ptr, src_pitch, signal);
+
+ do {
+ pack_16x9_pixels(src_ptr, src_pitch, signal);
+ filter_16x9_pixels(signal, ff, &res0, &res1);
+ store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ update_16x9_pixels(signal);
+
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ height -= 2;
+ } while (height > 0);
+}
+
+static void aom_highbd_filter_block1d4_v4_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ const int bits = FILTER_BITS;
+
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
+ const __m256i clip_pixel =
+ _mm256_set1_epi32(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m256i zero = _mm256_setzero_si256();
+ uint32_t i;
+ __m256i s[2], ff[2];
+
+ pack_filters_4tap(filter, ff);
+
+ const uint16_t *data = src_ptr;
+ /* Vertical filter */
+ {
+ __m128i s2 = _mm_loadl_epi64((__m128i *)(data + 2 * src_pitch));
+ __m128i s3 = _mm_loadl_epi64((__m128i *)(data + 3 * src_pitch));
+
+ __m256i s23 = _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1);
+
+ __m128i s4 = _mm_loadl_epi64((__m128i *)(data + 4 * src_pitch));
+
+ __m256i s34 = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1);
+
+ s[0] = _mm256_unpacklo_epi16(s23, s34);
+
+ for (i = 0; i < height; i += 2) {
+ data = &src_ptr[i * src_pitch];
+
+ __m128i s5 = _mm_loadl_epi64((__m128i *)(data + 5 * src_pitch));
+ __m128i s6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_pitch));
+
+ __m256i s45 = _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1);
+ __m256i s56 = _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1);
+
+ s[1] = _mm256_unpacklo_epi16(s45, s56);
+
+ const __m256i res_a = convolve_4tap(s, ff);
+
+ __m256i res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);
+
+ __m256i res_16bit = _mm256_min_epi32(res_a_round, clip_pixel);
+ res_16bit = _mm256_max_epi32(res_16bit, zero);
+ res_16bit = _mm256_packs_epi32(res_16bit, res_16bit);
+
+ _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
+ _mm256_castsi256_si128(res_16bit));
+ _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch],
+ _mm256_extracti128_si256(res_16bit, 1));
+
+ s[0] = s[1];
+ s4 = s6;
+ }
+ }
+}
+
+static void aom_highbd_filter_block1d8_v4_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ const int bits = FILTER_BITS;
+
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
+ const __m256i clip_pixel =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i s[4], ff[2];
+ uint32_t i;
+ pack_filters_4tap(filter, ff);
+
+ const uint16_t *data = src_ptr;
+ /* Vertical filter */
+ {
+ __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_pitch));
+ __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_pitch));
+
+ __m256i s23 = _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1);
+
+ __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_pitch));
+
+ __m256i s34 = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1);
+
+ s[0] = _mm256_unpacklo_epi16(s23, s34);
+ s[2] = _mm256_unpackhi_epi16(s23, s34);
+
+ for (i = 0; i < height; i += 2) {
+ data = &src_ptr[i * src_pitch];
+
+ __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_pitch));
+ __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_pitch));
+
+ __m256i s45 = _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1);
+ __m256i s56 = _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1);
+
+ s[1] = _mm256_unpacklo_epi16(s45, s56);
+ s[3] = _mm256_unpackhi_epi16(s45, s56);
+
+ const __m256i res_a = convolve_4tap(s, ff);
+
+ __m256i res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);
+
+ const __m256i res_b = convolve_4tap(s + 2, ff);
+ __m256i res_b_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_b, round_const_bits), round_shift_bits);
+
+ __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
+ res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
+ res_16bit = _mm256_max_epi16(res_16bit, zero);
+
+ _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch],
+ _mm256_castsi256_si128(res_16bit));
+ _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch],
+ _mm256_extracti128_si256(res_16bit, 1));
+
+ s[0] = s[1];
+ s[2] = s[3];
+ s4 = s6;
+ }
+ }
+}
+
+static void aom_highbd_filter_block1d16_v4_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ aom_highbd_filter_block1d8_v4_avx2(src_ptr, src_pitch, dst_ptr, dst_pitch,
+ height, filter, bd);
+
+ aom_highbd_filter_block1d8_v4_avx2(src_ptr + 8, src_pitch, dst_ptr + 8,
+ dst_pitch, height, filter, bd);
+}
+
+// -----------------------------------------------------------------------------
+// 2-tap vertical filtering
+
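+// The 2-tap vertical kernels keep the previously loaded row in sig[2]; each
+// iteration loads one new row, interleaves it with the saved row, filters,
+// and stores the new row back into sig[2] for the next output line.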
+static void pack_16x2_init(const uint16_t *src, __m256i *sig) {
+ sig[2] = _mm256_loadu_si256((const __m256i *)src);
+}
+
+static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch,
+ __m256i *sig) {
+ // load the next row
+ const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch));
+ sig[0] = _mm256_unpacklo_epi16(sig[2], u);
+ sig[1] = _mm256_unpackhi_epi16(sig[2], u);
+ sig[2] = u;
+}
+
+static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0, __m256i *y1) {
+ filter_16_2t_pixels(sig, f, y0, y1);
+}
+
+static void aom_highbd_filter_block1d16_v2_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[3], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+ __m256i ff;
+
+ pack_2t_filter(filter, &ff);
+ pack_16x2_init(src_ptr, signal);
+
+ do {
+ pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
+ filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
+ store_16x1_pixels(&res0, &res1, &max, dst_ptr);
+
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ height -= 1;
+ } while (height > 0);
+}
+
+static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) {
+ const __m128i h = _mm_loadu_si128((const __m128i *)filter);
+ const __m128i p = _mm_set1_epi32(0x09080706);
+ f[0] = _mm_shuffle_epi8(h, p);
+}
+
+static void pack_8x2_init(const uint16_t *src, __m128i *sig) {
+ sig[2] = _mm_loadu_si128((const __m128i *)src);
+}
+
+static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch,
+ __m128i *sig) {
+ // load the next row
+ const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch));
+ sig[0] = _mm_unpacklo_epi16(sig[2], u);
+ sig[1] = _mm_unpackhi_epi16(sig[2], u);
+ sig[2] = u;
+}
+
+static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f,
+ __m128i *y0, __m128i *y1) {
+ const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ __m128i x0 = _mm_madd_epi16(sig[0], *f);
+ __m128i x1 = _mm_madd_epi16(sig[1], *f);
+ x0 = _mm_add_epi32(x0, rounding);
+ x1 = _mm_add_epi32(x1, rounding);
+ *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS);
+ *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS);
+}
+
+static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1,
+ const __m128i *mask, uint16_t *dst) {
+ __m128i res = _mm_packus_epi32(*y0, *y1);
+ res = _mm_min_epi16(res, *mask);
+ _mm_storeu_si128((__m128i *)dst, res);
+}
+
+static void aom_highbd_filter_block1d8_v2_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m128i signal[3], res0, res1;
+ const __m128i max = _mm_set1_epi16((1 << bd) - 1);
+ __m128i ff;
+
+ pack_8x1_2t_filter(filter, &ff);
+ pack_8x2_init(src_ptr, signal);
+
+ do {
+ pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
+ filter_8_2t_pixels(signal, &ff, &res0, &res1);
+ store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr);
+
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ height -= 1;
+ } while (height > 0);
+}
+
+void aom_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
+ ptrdiff_t, uint32_t, const int16_t *,
+ int);
+void aom_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
+ ptrdiff_t, uint32_t, const int16_t *,
+ int);
+void aom_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
+ ptrdiff_t, uint32_t, const int16_t *,
+ int);
+void aom_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
+ ptrdiff_t, uint32_t, const int16_t *,
+ int);
+#define aom_highbd_filter_block1d4_h8_avx2 aom_highbd_filter_block1d4_h8_sse2
+#define aom_highbd_filter_block1d4_h2_avx2 aom_highbd_filter_block1d4_h2_sse2
+#define aom_highbd_filter_block1d4_v8_avx2 aom_highbd_filter_block1d4_v8_sse2
+#define aom_highbd_filter_block1d4_v2_avx2 aom_highbd_filter_block1d4_v2_sse2
+
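+// HIGH_FUN_CONV_1D (from the shared convolve helpers) expands to the
+// exported aom_highbd_convolve8_horiz/vert wrappers for this ISA, which pick
+// the 8-, 4- or 2-tap block1d kernel above based on the filter taps in use.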
+HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2)
+HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2)
+
+#undef HIGHBD_FUNC
diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_sse2.c b/third_party/aom/aom_dsp/x86/highbd_convolve_sse2.c
new file mode 100644
index 0000000000..a2bb283222
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_convolve_sse2.c
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <emmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/x86/convolve.h"
+
+// -----------------------------------------------------------------------------
+
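+// The *_h4/*_v4 kernels below apply only coefficients 2-5 of the 8-tap
+// filter array (secondFilters holds taps 2/3, thirdFilters taps 4/5), so
+// they are intended for filters whose outer taps are zero.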
+void aom_highbd_filter_block1d4_v4_sse2(const uint16_t *src_ptr,
+ ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height,
+ const int16_t *filter, int bd) {
+ __m128i filtersReg;
+ __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
+ __m128i srcReg23_lo, srcReg34_lo;
+ __m128i srcReg45_lo, srcReg56_lo;
+ __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
+ __m128i resReg23_45_lo, resReg34_56_lo;
+ __m128i resReg23_45, resReg34_56;
+ __m128i addFilterReg64, secondFilters, thirdFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ const __m128i max = _mm_set1_epi16((1 << bd) - 1);
+ addFilterReg64 = _mm_set1_epi32(64);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+ secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3
+ thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5
+
+  // double the strides so each iteration advances two rows
+ src_stride = src_pitch << 1;
+ dst_stride = dst_pitch << 1;
+
+ srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+ srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg23_lo = _mm_unpacklo_epi16(srcReg2, srcReg3);
+
+ srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+ srcReg34_lo = _mm_unpacklo_epi16(srcReg3, srcReg4);
+
+ for (i = height; i > 1; i -= 2) {
+ srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+ srcReg45_lo = _mm_unpacklo_epi16(srcReg4, srcReg5);
+
+ srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
+ srcReg56_lo = _mm_unpacklo_epi16(srcReg5, srcReg6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+
+ resReg23_lo = _mm_madd_epi16(srcReg23_lo, secondFilters);
+ resReg34_lo = _mm_madd_epi16(srcReg34_lo, secondFilters);
+ resReg45_lo = _mm_madd_epi16(srcReg45_lo, thirdFilters);
+ resReg56_lo = _mm_madd_epi16(srcReg56_lo, thirdFilters);
+
+ resReg23_45_lo = _mm_add_epi32(resReg23_lo, resReg45_lo);
+ resReg34_56_lo = _mm_add_epi32(resReg34_lo, resReg56_lo);
+
+    // round and shift each 32-bit sum right by 7 bits
+ resReg23_45_lo = _mm_add_epi32(resReg23_45_lo, addFilterReg64);
+ resReg34_56_lo = _mm_add_epi32(resReg34_56_lo, addFilterReg64);
+ resReg23_45_lo = _mm_srai_epi32(resReg23_45_lo, 7);
+ resReg34_56_lo = _mm_srai_epi32(resReg34_56_lo, 7);
+
+    // narrow each 32-bit result to 16 bits; resReg23_45 holds the first
+    // output row and resReg34_56 the second
+ resReg23_45 = _mm_packs_epi32(resReg23_45_lo, _mm_setzero_si128());
+ resReg34_56 = _mm_packs_epi32(resReg34_56_lo, _mm_setzero_si128());
+
+ resReg23_45 = _mm_max_epi16(resReg23_45, _mm_setzero_si128());
+ resReg23_45 = _mm_min_epi16(resReg23_45, max);
+ resReg34_56 = _mm_max_epi16(resReg34_56, _mm_setzero_si128());
+ resReg34_56 = _mm_min_epi16(resReg34_56, max);
+
+ src_ptr += src_stride;
+
+ _mm_storel_epi64((__m128i *)dst_ptr, (resReg23_45));
+ _mm_storel_epi64((__m128i *)(dst_ptr + dst_pitch), (resReg34_56));
+
+ dst_ptr += dst_stride;
+
+    // save part of the registers for the next iteration
+ srcReg23_lo = srcReg45_lo;
+ srcReg34_lo = srcReg56_lo;
+ srcReg4 = srcReg6;
+ }
+}
+
+void aom_highbd_filter_block1d4_h4_sse2(const uint16_t *src_ptr,
+ ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height,
+ const int16_t *filter, int bd) {
+ __m128i filtersReg;
+ __m128i addFilterReg64;
+ __m128i secondFilters, thirdFilters;
+ __m128i srcRegFilt32b1_1;
+ __m128i srcReg32b1;
+ unsigned int i;
+ src_ptr -= 3;
+ addFilterReg64 = _mm_set1_epi32(64);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ const __m128i max = _mm_set1_epi16((1 << bd) - 1);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+ secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3
+ thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5
+
+ for (i = height; i > 0; i -= 1) {
+ srcReg32b1 = _mm_loadu_si128((const __m128i *)(src_ptr + 2));
+
+ __m128i ss_3_1 = _mm_srli_si128(srcReg32b1, 2);
+ __m128i ss_4_1 = _mm_srli_si128(srcReg32b1, 4);
+ __m128i ss_5_1 = _mm_srli_si128(srcReg32b1, 6);
+ __m128i ss_23 = _mm_unpacklo_epi32(srcReg32b1, ss_3_1);
+ __m128i ss_45 = _mm_unpacklo_epi32(ss_4_1, ss_5_1);
+
+ ss_23 = _mm_madd_epi16(ss_23, secondFilters);
+ ss_45 = _mm_madd_epi16(ss_45, thirdFilters);
+ srcRegFilt32b1_1 = _mm_add_epi32(ss_23, ss_45);
+
+    // round and shift each 32-bit sum right by 7 bits
+ srcRegFilt32b1_1 = _mm_add_epi32(srcRegFilt32b1_1, addFilterReg64);
+ srcRegFilt32b1_1 = _mm_srai_epi32(srcRegFilt32b1_1, 7);
+
+ srcRegFilt32b1_1 = _mm_packs_epi32(srcRegFilt32b1_1, _mm_setzero_si128());
+ srcRegFilt32b1_1 = _mm_max_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
+ srcRegFilt32b1_1 = _mm_min_epi16(srcRegFilt32b1_1, max);
+
+ src_ptr += src_pitch;
+
+ _mm_storel_epi64((__m128i *)dst_ptr, srcRegFilt32b1_1);
+
+ dst_ptr += dst_pitch;
+ }
+}
+
+void aom_highbd_filter_block1d8_v4_sse2(const uint16_t *src_ptr,
+ ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height,
+ const int16_t *filter, int bd) {
+ __m128i filtersReg;
+ __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
+ __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
+ __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi;
+ __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
+ __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi;
+ __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi;
+ __m128i resReg23_45, resReg34_56;
+ __m128i addFilterReg64, secondFilters, thirdFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ const __m128i max = _mm_set1_epi16((1 << bd) - 1);
+ addFilterReg64 = _mm_set1_epi32(64);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+ secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3
+ thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5
+
+  // double the strides so each iteration advances two rows
+ src_stride = src_pitch << 1;
+ dst_stride = dst_pitch << 1;
+
+ srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+ srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg23_lo = _mm_unpacklo_epi16(srcReg2, srcReg3);
+ srcReg23_hi = _mm_unpackhi_epi16(srcReg2, srcReg3);
+
+ srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
+ srcReg34_lo = _mm_unpacklo_epi16(srcReg3, srcReg4);
+ srcReg34_hi = _mm_unpackhi_epi16(srcReg3, srcReg4);
+
+ for (i = height; i > 1; i -= 2) {
+ srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
+
+ srcReg45_lo = _mm_unpacklo_epi16(srcReg4, srcReg5);
+ srcReg45_hi = _mm_unpackhi_epi16(srcReg4, srcReg5);
+
+ srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
+
+ srcReg56_lo = _mm_unpacklo_epi16(srcReg5, srcReg6);
+ srcReg56_hi = _mm_unpackhi_epi16(srcReg5, srcReg6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+
+ resReg23_lo = _mm_madd_epi16(srcReg23_lo, secondFilters);
+ resReg34_lo = _mm_madd_epi16(srcReg34_lo, secondFilters);
+ resReg45_lo = _mm_madd_epi16(srcReg45_lo, thirdFilters);
+ resReg56_lo = _mm_madd_epi16(srcReg56_lo, thirdFilters);
+
+ resReg23_45_lo = _mm_add_epi32(resReg23_lo, resReg45_lo);
+ resReg34_56_lo = _mm_add_epi32(resReg34_lo, resReg56_lo);
+
+    // same for the high halves of the registers
+
+ resReg23_hi = _mm_madd_epi16(srcReg23_hi, secondFilters);
+ resReg34_hi = _mm_madd_epi16(srcReg34_hi, secondFilters);
+ resReg45_hi = _mm_madd_epi16(srcReg45_hi, thirdFilters);
+ resReg56_hi = _mm_madd_epi16(srcReg56_hi, thirdFilters);
+
+ resReg23_45_hi = _mm_add_epi32(resReg23_hi, resReg45_hi);
+ resReg34_56_hi = _mm_add_epi32(resReg34_hi, resReg56_hi);
+
+    // round and shift each 32-bit sum right by 7 bits
+ resReg23_45_lo = _mm_add_epi32(resReg23_45_lo, addFilterReg64);
+ resReg34_56_lo = _mm_add_epi32(resReg34_56_lo, addFilterReg64);
+ resReg23_45_hi = _mm_add_epi32(resReg23_45_hi, addFilterReg64);
+ resReg34_56_hi = _mm_add_epi32(resReg34_56_hi, addFilterReg64);
+ resReg23_45_lo = _mm_srai_epi32(resReg23_45_lo, 7);
+ resReg34_56_lo = _mm_srai_epi32(resReg34_56_lo, 7);
+ resReg23_45_hi = _mm_srai_epi32(resReg23_45_hi, 7);
+ resReg34_56_hi = _mm_srai_epi32(resReg34_56_hi, 7);
+
+    // narrow each 32-bit result to 16 bits; resReg23_45 holds the first
+    // output row and resReg34_56 the second
+ resReg23_45 = _mm_packs_epi32(resReg23_45_lo, resReg23_45_hi);
+ resReg34_56 = _mm_packs_epi32(resReg34_56_lo, resReg34_56_hi);
+
+ resReg23_45 = _mm_max_epi16(resReg23_45, _mm_setzero_si128());
+ resReg23_45 = _mm_min_epi16(resReg23_45, max);
+ resReg34_56 = _mm_max_epi16(resReg34_56, _mm_setzero_si128());
+ resReg34_56 = _mm_min_epi16(resReg34_56, max);
+
+ src_ptr += src_stride;
+
+ _mm_store_si128((__m128i *)dst_ptr, (resReg23_45));
+ _mm_store_si128((__m128i *)(dst_ptr + dst_pitch), (resReg34_56));
+
+ dst_ptr += dst_stride;
+
+    // save part of the registers for the next iteration
+ srcReg23_lo = srcReg45_lo;
+ srcReg23_hi = srcReg45_hi;
+ srcReg34_lo = srcReg56_lo;
+ srcReg34_hi = srcReg56_hi;
+ srcReg4 = srcReg6;
+ }
+}
+
+void aom_highbd_filter_block1d8_h4_sse2(const uint16_t *src_ptr,
+ ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height,
+ const int16_t *filter, int bd) {
+ __m128i filtersReg;
+ __m128i addFilterReg64;
+ __m128i secondFilters, thirdFilters;
+ __m128i srcRegFilt32b1_1, srcRegFilt32b1_2;
+ __m128i srcReg32b1, srcReg32b2;
+ unsigned int i;
+ src_ptr -= 3;
+ addFilterReg64 = _mm_set1_epi32(64);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ const __m128i max = _mm_set1_epi16((1 << bd) - 1);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+ secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3
+ thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5
+
+ for (i = height; i > 0; i -= 1) {
+ srcReg32b1 = _mm_loadu_si128((const __m128i *)(src_ptr + 2));
+ srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 6));
+
+ __m128i ss_4_1 = _mm_srli_si128(srcReg32b1, 4);
+ __m128i ss_4_2 = _mm_srli_si128(srcReg32b2, 4);
+ __m128i ss_4 = _mm_unpacklo_epi64(ss_4_1, ss_4_2);
+
+ __m128i d1 = _mm_madd_epi16(srcReg32b1, secondFilters);
+ __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters);
+ srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);
+
+ __m128i ss_3_1 = _mm_srli_si128(srcReg32b1, 2);
+ __m128i ss_5_1 = _mm_srli_si128(srcReg32b1, 6);
+ __m128i ss_3_2 = _mm_srli_si128(srcReg32b2, 2);
+ __m128i ss_5_2 = _mm_srli_si128(srcReg32b2, 6);
+ __m128i ss_3 = _mm_unpacklo_epi64(ss_3_1, ss_3_2);
+ __m128i ss_5 = _mm_unpacklo_epi64(ss_5_1, ss_5_2);
+
+ d1 = _mm_madd_epi16(ss_3, secondFilters);
+ d2 = _mm_madd_epi16(ss_5, thirdFilters);
+ srcRegFilt32b1_2 = _mm_add_epi32(d1, d2);
+
+ __m128i res_lo_1 = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
+ __m128i res_hi_1 = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
+
+    // round and shift each 32-bit sum right by 7 bits
+ res_lo_1 = _mm_add_epi32(res_lo_1, addFilterReg64);
+ res_hi_1 = _mm_add_epi32(res_hi_1, addFilterReg64);
+ res_lo_1 = _mm_srai_epi32(res_lo_1, 7);
+ res_hi_1 = _mm_srai_epi32(res_hi_1, 7);
+
+ srcRegFilt32b1_1 = _mm_packs_epi32(res_lo_1, res_hi_1);
+
+ srcRegFilt32b1_1 = _mm_max_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
+ srcRegFilt32b1_1 = _mm_min_epi16(srcRegFilt32b1_1, max);
+
+ src_ptr += src_pitch;
+
+ _mm_store_si128((__m128i *)dst_ptr, srcRegFilt32b1_1);
+
+ dst_ptr += dst_pitch;
+ }
+}
+
+void aom_highbd_filter_block1d16_v4_sse2(const uint16_t *src_ptr,
+ ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height,
+ const int16_t *filter, int bd) {
+ aom_highbd_filter_block1d8_v4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch,
+ height, filter, bd);
+ aom_highbd_filter_block1d8_v4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8),
+ dst_pitch, height, filter, bd);
+}
+
+void aom_highbd_filter_block1d16_h4_sse2(const uint16_t *src_ptr,
+ ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height,
+ const int16_t *filter, int bd) {
+ aom_highbd_filter_block1d8_h4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch,
+ height, filter, bd);
+ aom_highbd_filter_block1d8_h4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8),
+ dst_pitch, height, filter, bd);
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c
new file mode 100644
index 0000000000..31c3c31b3c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c
@@ -0,0 +1,439 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+
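+// Vertical single-reference convolve: the unpacked row pairs are kept in s[]
+// and rotated at the end of each iteration, so producing two more output
+// rows only needs two fresh source loads. The 12-tap branch uses the same
+// scheme with a wider window than the default (up to 8-tap) branch.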
+void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_y_qn, int bd) {
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride;
+ const int bits = FILTER_BITS;
+
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i clip_pixel =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m128i zero = _mm_setzero_si128();
+ if (filter_params_y->taps == 12) {
+ __m128i s[24], coeffs_y[6];
+
+ prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs_y);
+
+ for (j = 0; j < w; j += 8) {
+ const uint16_t *data = &src_ptr[j];
+ /* Vertical filter */
+ __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+ __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+ __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+ __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+ __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+ __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+ __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
+ __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
+ __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));
+ __m128i s9 = _mm_loadu_si128((__m128i *)(data + 9 * src_stride));
+ __m128i s10 = _mm_loadu_si128((__m128i *)(data + 10 * src_stride));
+
+ s[0] = _mm_unpacklo_epi16(s0, s1);
+ s[1] = _mm_unpacklo_epi16(s2, s3);
+ s[2] = _mm_unpacklo_epi16(s4, s5);
+ s[3] = _mm_unpacklo_epi16(s6, s7);
+ s[4] = _mm_unpacklo_epi16(s8, s9);
+
+ s[6] = _mm_unpackhi_epi16(s0, s1);
+ s[7] = _mm_unpackhi_epi16(s2, s3);
+ s[8] = _mm_unpackhi_epi16(s4, s5);
+ s[9] = _mm_unpackhi_epi16(s6, s7);
+ s[10] = _mm_unpackhi_epi16(s8, s9);
+
+ s[12] = _mm_unpacklo_epi16(s1, s2);
+ s[13] = _mm_unpacklo_epi16(s3, s4);
+ s[14] = _mm_unpacklo_epi16(s5, s6);
+ s[15] = _mm_unpacklo_epi16(s7, s8);
+ s[16] = _mm_unpacklo_epi16(s9, s10);
+
+ s[18] = _mm_unpackhi_epi16(s1, s2);
+ s[19] = _mm_unpackhi_epi16(s3, s4);
+ s[20] = _mm_unpackhi_epi16(s5, s6);
+ s[21] = _mm_unpackhi_epi16(s7, s8);
+ s[22] = _mm_unpackhi_epi16(s9, s10);
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+
+ __m128i s11 = _mm_loadu_si128((__m128i *)(data + 11 * src_stride));
+ __m128i s12 = _mm_loadu_si128((__m128i *)(data + 12 * src_stride));
+
+ s[5] = _mm_unpacklo_epi16(s10, s11);
+ s[11] = _mm_unpackhi_epi16(s10, s11);
+
+ s[17] = _mm_unpacklo_epi16(s11, s12);
+ s[23] = _mm_unpackhi_epi16(s11, s12);
+
+ const __m128i res_a0 = convolve_12tap(s, coeffs_y);
+ __m128i res_a_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_a0, round_const_bits), round_shift_bits);
+
+ const __m128i res_a1 = convolve_12tap(s + 12, coeffs_y);
+ __m128i res_a_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_a1, round_const_bits), round_shift_bits);
+
+ if (w - j > 4) {
+ const __m128i res_b0 = convolve_12tap(s + 6, coeffs_y);
+ __m128i res_b_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_b0, round_const_bits), round_shift_bits);
+
+ const __m128i res_b1 = convolve_12tap(s + 18, coeffs_y);
+ __m128i res_b_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_b1, round_const_bits), round_shift_bits);
+
+ __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
+ res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
+ res_16bit0 = _mm_max_epi16(res_16bit0, zero);
+
+ __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
+ res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
+ res_16bit1 = _mm_max_epi16(res_16bit1, zero);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_16bit1);
+ } else if (w == 4) {
+ res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+ res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+ res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+ res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+ res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+ res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_a_round1);
+ } else {
+ res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+ res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+ res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+ res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+ res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+ res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+ *((int *)(&dst[i * dst_stride + j])) =
+ _mm_cvtsi128_si32(res_a_round0);
+
+ *((int *)(&dst[i * dst_stride + j + dst_stride])) =
+ _mm_cvtsi128_si32(res_a_round1);
+ }
+
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+ s[3] = s[4];
+ s[4] = s[5];
+
+ s[6] = s[7];
+ s[7] = s[8];
+ s[8] = s[9];
+ s[9] = s[10];
+ s[10] = s[11];
+
+ s[12] = s[13];
+ s[13] = s[14];
+ s[14] = s[15];
+ s[15] = s[16];
+ s[16] = s[17];
+
+ s[18] = s[19];
+ s[19] = s[20];
+ s[20] = s[21];
+ s[21] = s[22];
+ s[22] = s[23];
+
+ s10 = s12;
+ }
+ }
+ } else {
+ __m128i s[16], coeffs_y[4];
+
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
+
+ for (j = 0; j < w; j += 8) {
+ const uint16_t *data = &src_ptr[j];
+ /* Vertical filter */
+ {
+ __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+ __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+ __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+ __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+ __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+ __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+ __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
+
+ s[0] = _mm_unpacklo_epi16(s0, s1);
+ s[1] = _mm_unpacklo_epi16(s2, s3);
+ s[2] = _mm_unpacklo_epi16(s4, s5);
+
+ s[4] = _mm_unpackhi_epi16(s0, s1);
+ s[5] = _mm_unpackhi_epi16(s2, s3);
+ s[6] = _mm_unpackhi_epi16(s4, s5);
+
+ s[0 + 8] = _mm_unpacklo_epi16(s1, s2);
+ s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
+ s[2 + 8] = _mm_unpacklo_epi16(s5, s6);
+
+ s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
+ s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
+ s[6 + 8] = _mm_unpackhi_epi16(s5, s6);
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+
+ __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
+ __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));
+
+ s[3] = _mm_unpacklo_epi16(s6, s7);
+ s[7] = _mm_unpackhi_epi16(s6, s7);
+
+ s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
+ s[7 + 8] = _mm_unpackhi_epi16(s7, s8);
+
+ const __m128i res_a0 = convolve(s, coeffs_y);
+ __m128i res_a_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_a0, round_const_bits), round_shift_bits);
+
+ const __m128i res_a1 = convolve(s + 8, coeffs_y);
+ __m128i res_a_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_a1, round_const_bits), round_shift_bits);
+
+ if (w - j > 4) {
+ const __m128i res_b0 = convolve(s + 4, coeffs_y);
+ __m128i res_b_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_b0, round_const_bits), round_shift_bits);
+
+ const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y);
+ __m128i res_b_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_b1, round_const_bits), round_shift_bits);
+
+ __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
+ res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
+ res_16bit0 = _mm_max_epi16(res_16bit0, zero);
+
+ __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
+ res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
+ res_16bit1 = _mm_max_epi16(res_16bit1, zero);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_16bit1);
+ } else if (w == 4) {
+ res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+ res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+ res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+ res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+ res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+ res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_a_round1);
+ } else {
+ res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+ res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+ res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+ res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+ res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+ res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+ *((int *)(&dst[i * dst_stride + j])) =
+ _mm_cvtsi128_si32(res_a_round0);
+
+ *((int *)(&dst[i * dst_stride + j + dst_stride])) =
+ _mm_cvtsi128_si32(res_a_round1);
+ }
+
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+
+ s[0 + 8] = s[1 + 8];
+ s[1 + 8] = s[2 + 8];
+ s[2 + 8] = s[3 + 8];
+
+ s[4 + 8] = s[5 + 8];
+ s[5 + 8] = s[6 + 8];
+ s[6 + 8] = s[7 + 8];
+
+ s6 = s8;
+ }
+ }
+ }
+ }
+}
+
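+// Horizontal single-reference convolve: results are rounded in two stages,
+// first by conv_params->round_0 and then by the remaining
+// FILTER_BITS - round_0 bits, so the overall normalization equals
+// FILTER_BITS.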
+void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const int subpel_x_qn,
+ ConvolveParams *conv_params, int bd) {
+ int i, j;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_horiz;
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+ const __m128i round_const_x =
+ _mm_set1_epi32(((1 << conv_params->round_0) >> 1));
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+ const int bits = FILTER_BITS - conv_params->round_0;
+
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i clip_pixel =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m128i zero = _mm_setzero_si128();
+
+ if (filter_params_x->taps == 12) {
+ __m128i s[6], coeffs_x[6];
+
+ prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs_x);
+
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ {
+ for (i = 0; i < h; i += 1) {
+ const __m128i row00 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ const __m128i row01 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
+ const __m128i row02 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 16)]);
+
+ // even pixels
+ s[0] = _mm_alignr_epi8(row01, row00, 0);
+ s[1] = _mm_alignr_epi8(row01, row00, 4);
+ s[2] = _mm_alignr_epi8(row01, row00, 8);
+ s[3] = _mm_alignr_epi8(row01, row00, 12);
+ s[4] = _mm_alignr_epi8(row02, row01, 0);
+ s[5] = _mm_alignr_epi8(row02, row01, 4);
+
+ __m128i res_even = convolve_12tap(s, coeffs_x);
+ res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
+ round_shift_x);
+ res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits),
+ round_shift_bits);
+
+ // odd pixels
+ s[0] = _mm_alignr_epi8(row01, row00, 2);
+ s[1] = _mm_alignr_epi8(row01, row00, 6);
+ s[2] = _mm_alignr_epi8(row01, row00, 10);
+ s[3] = _mm_alignr_epi8(row01, row00, 14);
+ s[4] = _mm_alignr_epi8(row02, row01, 2);
+ s[5] = _mm_alignr_epi8(row02, row01, 6);
+
+ __m128i res_odd = convolve_12tap(s, coeffs_x);
+ res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x),
+ round_shift_x);
+ res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits),
+ round_shift_bits);
+
+ __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
+ __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
+ __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);
+
+ res = _mm_min_epi16(res, clip_pixel);
+ res = _mm_max_epi16(res, zero);
+
+ if (w - j > 4) {
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+ } else if (w == 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res);
+ } else {
+ *((int *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res);
+ }
+ }
+ }
+ }
+ } else {
+ __m128i s[4], coeffs_x[4];
+ prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
+
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ {
+ for (i = 0; i < h; i += 1) {
+ const __m128i row00 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ const __m128i row01 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
+
+ // even pixels
+ s[0] = _mm_alignr_epi8(row01, row00, 0);
+ s[1] = _mm_alignr_epi8(row01, row00, 4);
+ s[2] = _mm_alignr_epi8(row01, row00, 8);
+ s[3] = _mm_alignr_epi8(row01, row00, 12);
+
+ __m128i res_even = convolve(s, coeffs_x);
+ res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
+ round_shift_x);
+
+ // odd pixels
+ s[0] = _mm_alignr_epi8(row01, row00, 2);
+ s[1] = _mm_alignr_epi8(row01, row00, 6);
+ s[2] = _mm_alignr_epi8(row01, row00, 10);
+ s[3] = _mm_alignr_epi8(row01, row00, 14);
+
+ __m128i res_odd = convolve(s, coeffs_x);
+ res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x),
+ round_shift_x);
+
+ res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits),
+ round_shift_bits);
+ res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits),
+ round_shift_bits);
+
+ __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
+ __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
+ __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);
+
+ res = _mm_min_epi16(res, clip_pixel);
+ res = _mm_max_epi16(res, zero);
+
+ if (w - j > 4) {
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+ } else if (w == 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res);
+ } else {
+ *((int *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res);
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_asm_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_intrapred_asm_sse2.asm
new file mode 100644
index 0000000000..91b3d126ca
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_asm_sse2.asm
@@ -0,0 +1,259 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_4: times 8 dw 4
+pw_8: times 8 dw 8
+pw_16: times 4 dd 16
+pw_32: times 4 dd 32
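+; pw_4 and pw_8 are 16-bit rounding constants for the 4x4/8x8 DC averages;
+; despite the pw_ prefix, pw_16 and pw_32 are stored as dwords because the
+; 16x16/32x32 paths widen their sums to 32 bits before rounding.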
+
+SECTION .text
+INIT_XMM sse2
+cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ movq m0, [aboveq]
+ movq m2, [leftq]
+ paddw m0, m2
+ pshuflw m1, m0, 0xe
+ paddw m0, m1
+ pshuflw m1, m0, 0x1
+ paddw m0, m1
+ paddw m0, [GLOBAL(pw_4)]
+ psraw m0, 3
+ pshuflw m0, m0, 0x0
+ movq [dstq ], m0
+ movq [dstq+strideq*2], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq*2], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m2, [leftq]
+ DEFINE_ARGS dst, stride, stride3, one
+ mov oned, 0x00010001
+ lea stride3q, [strideq*3]
+ movd m3, oned
+ pshufd m3, m3, 0x0
+ paddw m0, m2
+ pmaddwd m0, m3
+ packssdw m0, m1
+ pmaddwd m0, m3
+ packssdw m0, m1
+ pmaddwd m0, m3
+ paddw m0, [GLOBAL(pw_8)]
+ psrlw m0, 4
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ mova [dstq ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+stride3q*2], m0
+ lea dstq, [dstq+strideq*8]
+ mova [dstq ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+stride3q*2], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m3, [aboveq+16]
+ mova m2, [leftq]
+ mova m4, [leftq+16]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ paddw m0, m2
+ paddw m0, m3
+ paddw m0, m4
+ movhlps m2, m0
+ paddw m0, m2
+ punpcklwd m0, m1
+ movhlps m2, m0
+ paddd m0, m2
+ punpckldq m0, m1
+ movhlps m2, m0
+ paddd m0, m2
+ paddd m0, [GLOBAL(pw_16)]
+ psrad m0, 5
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2 +16], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+strideq*4 +16], m0
+ mova [dstq+stride3q*2 ], m0
+ mova [dstq+stride3q*2+16], m0
+ lea dstq, [dstq+strideq*8]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal highbd_dc_predictor_32x32, 4, 5, 7, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ mova m0, [aboveq]
+ mova m2, [aboveq+16]
+ mova m3, [aboveq+32]
+ mova m4, [aboveq+48]
+ paddw m0, m2
+ paddw m3, m4
+ mova m2, [leftq]
+ mova m4, [leftq+16]
+ mova m5, [leftq+32]
+ mova m6, [leftq+48]
+ paddw m2, m4
+ paddw m5, m6
+ paddw m0, m3
+ paddw m2, m5
+ pxor m1, m1
+ paddw m0, m2
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ movhlps m2, m0
+ paddw m0, m2
+ punpcklwd m0, m1
+ movhlps m2, m0
+ paddd m0, m2
+ punpckldq m0, m1
+ movhlps m2, m0
+ paddd m0, m2
+ paddd m0, [GLOBAL(pw_32)]
+ psrad m0, 6
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16 ], m0
+ mova [dstq +32 ], m0
+ mova [dstq +48 ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16 ], m0
+ mova [dstq+strideq*2+32 ], m0
+ mova [dstq+strideq*2+48 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+strideq*4+16 ], m0
+ mova [dstq+strideq*4+32 ], m0
+ mova [dstq+strideq*4+48 ], m0
+ mova [dstq+stride3q*2 ], m0
+ mova [dstq+stride3q*2 +16], m0
+ mova [dstq+stride3q*2 +32], m0
+ mova [dstq+stride3q*2 +48], m0
+ lea dstq, [dstq+strideq*8]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above
+ movq m0, [aboveq]
+ movq [dstq ], m0
+ movq [dstq+strideq*2], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq*2], m0
+ RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above
+ mova m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ mova [dstq ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+stride3q*2], m0
+ lea dstq, [dstq+strideq*8]
+ mova [dstq ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+stride3q*2], m0
+ RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above
+ mova m0, [aboveq]
+ mova m1, [aboveq+16]
+ DEFINE_ARGS dst, stride, stride3, nlines4
+ lea stride3q, [strideq*3]
+ mov nlines4d, 4
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m1
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2 +16], m1
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+strideq*4 +16], m1
+ mova [dstq+stride3q*2 ], m0
+ mova [dstq+stride3q*2+16], m1
+ lea dstq, [dstq+strideq*8]
+ dec nlines4d
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above
+ mova m0, [aboveq]
+ mova m1, [aboveq+16]
+ mova m2, [aboveq+32]
+ mova m3, [aboveq+48]
+ DEFINE_ARGS dst, stride, stride3, nlines4
+ lea stride3q, [strideq*3]
+ mov nlines4d, 8
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m1
+ mova [dstq +32], m2
+ mova [dstq +48], m3
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2 +16], m1
+ mova [dstq+strideq*2 +32], m2
+ mova [dstq+strideq*2 +48], m3
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+strideq*4 +16], m1
+ mova [dstq+strideq*4 +32], m2
+ mova [dstq+strideq*4 +48], m3
+ mova [dstq+stride3q*2 ], m0
+ mova [dstq+stride3q*2 +16], m1
+ mova [dstq+stride3q*2 +32], m2
+ mova [dstq+stride3q*2 +48], m3
+ lea dstq, [dstq+strideq*8]
+ dec nlines4d
+ jnz .loop
+ REP_RET
diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
new file mode 100644
index 0000000000..6a2e915ed7
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
@@ -0,0 +1,984 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+// -----------------------------------------------------------------------------
+// H_PRED
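+// Horizontal prediction: each output row is the corresponding left-column
+// sample broadcast across the full block width.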
+
+void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left);
+ const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+ const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+ const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+ const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+ (void)above;
+ (void)bd;
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+}
+
+void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd);
+ dst += stride << 2;
+ left += 4;
+ aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd);
+}
+
+void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
+ const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+ const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+ const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+ const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+ (void)above;
+ (void)bd;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
+}
+
+void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
+ const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+ const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+ const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+ const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+ const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
+ const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
+ const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
+ const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
+ (void)above;
+ (void)bd;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7));
+}
+
+void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd);
+ dst += stride << 3;
+ left += 8;
+ aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd);
+}
+
+static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *row) {
+ const __m128i val = _mm_unpacklo_epi64(*row, *row);
+ _mm_store_si128((__m128i *)*dst, val);
+ _mm_store_si128((__m128i *)(*dst + 8), val);
+ *dst += stride;
+}
+
+static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *row) {
+ const __m128i val = _mm_unpackhi_epi64(*row, *row);
+ _mm_store_si128((__m128i *)(*dst), val);
+ _mm_store_si128((__m128i *)(*dst + 8), val);
+ *dst += stride;
+}
+
+static INLINE void h_predictor_16x8(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left) {
+ const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
+ const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+ const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+ const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+ const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+ const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
+ const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
+ const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
+ const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
+ h_store_16_unpacklo(&dst, stride, &row0);
+ h_store_16_unpacklo(&dst, stride, &row1);
+ h_store_16_unpacklo(&dst, stride, &row2);
+ h_store_16_unpacklo(&dst, stride, &row3);
+ h_store_16_unpackhi(&dst, stride, &row4);
+ h_store_16_unpackhi(&dst, stride, &row5);
+ h_store_16_unpackhi(&dst, stride, &row6);
+ h_store_16_unpackhi(&dst, stride, &row7);
+}
+
+void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)above;
+ (void)bd;
+ h_predictor_16x8(dst, stride, left);
+}
+
+void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i;
+ (void)above;
+ (void)bd;
+
+ for (i = 0; i < 2; i++, left += 8) {
+ h_predictor_16x8(dst, stride, left);
+ dst += stride << 3;
+ }
+}
+
+void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i;
+ (void)above;
+ (void)bd;
+
+ for (i = 0; i < 4; i++, left += 8) {
+ h_predictor_16x8(dst, stride, left);
+ dst += stride << 3;
+ }
+}
+
+static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *row) {
+ const __m128i val = _mm_unpacklo_epi64(*row, *row);
+ _mm_store_si128((__m128i *)(*dst), val);
+ _mm_store_si128((__m128i *)(*dst + 8), val);
+ _mm_store_si128((__m128i *)(*dst + 16), val);
+ _mm_store_si128((__m128i *)(*dst + 24), val);
+ *dst += stride;
+}
+
+static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *row) {
+ const __m128i val = _mm_unpackhi_epi64(*row, *row);
+ _mm_store_si128((__m128i *)(*dst), val);
+ _mm_store_si128((__m128i *)(*dst + 8), val);
+ _mm_store_si128((__m128i *)(*dst + 16), val);
+ _mm_store_si128((__m128i *)(*dst + 24), val);
+ *dst += stride;
+}
+
+static INLINE void h_predictor_32x8(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left) {
+ const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
+ const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+ const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+ const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+ const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+ const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
+ const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
+ const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
+ const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
+ h_store_32_unpacklo(&dst, stride, &row0);
+ h_store_32_unpacklo(&dst, stride, &row1);
+ h_store_32_unpacklo(&dst, stride, &row2);
+ h_store_32_unpacklo(&dst, stride, &row3);
+ h_store_32_unpackhi(&dst, stride, &row4);
+ h_store_32_unpackhi(&dst, stride, &row5);
+ h_store_32_unpackhi(&dst, stride, &row6);
+ h_store_32_unpackhi(&dst, stride, &row7);
+}
+
+void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i;
+ (void)above;
+ (void)bd;
+
+ for (i = 0; i < 2; i++, left += 8) {
+ h_predictor_32x8(dst, stride, left);
+ dst += stride << 3;
+ }
+}
+
+void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i;
+ (void)above;
+ (void)bd;
+
+ for (i = 0; i < 4; i++, left += 8) {
+ h_predictor_32x8(dst, stride, left);
+ dst += stride << 3;
+ }
+}
+
+// -----------------------------------------------------------------------------
+// DC_TOP, DC_LEFT, DC_128
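+// DC_TOP averages only the above row, DC_LEFT only the left column, and
+// DC_128 uses the mid-grey value 1 << (bd - 1); the result is broadcast over
+// the whole block.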
+
+// 4x4
+
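+// Horizontal add of 4 adjacent 16-bit samples; the total ends up in the low
+// word of the result.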
+static INLINE __m128i dc_sum_4(const uint16_t *ref) {
+ const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref);
+ const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
+ const __m128i a = _mm_add_epi16(_dcba, _xxdc);
+ return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
+}
+
+static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
+ const __m128i *dc) {
+ const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
+ int i;
+ for (i = 0; i < 4; ++i, dst += stride) {
+ _mm_storel_epi64((__m128i *)dst, dc_dup);
+ }
+}
+
+void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i two = _mm_cvtsi32_si128(2);
+ const __m128i sum = dc_sum_4(left);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
+ (void)above;
+ (void)bd;
+ dc_store_4x4(dst, stride, &dc);
+}
+
+void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i two = _mm_cvtsi32_si128(2);
+ const __m128i sum = dc_sum_4(above);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
+ (void)left;
+ (void)bd;
+ dc_store_4x4(dst, stride, &dc);
+}
+
+void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_4x4(dst, stride, &dc_dup);
+}
+
+// -----------------------------------------------------------------------------
+// 4x8
+
+static INLINE void dc_store_4x8(uint16_t *dst, ptrdiff_t stride,
+ const __m128i *dc) {
+ const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
+ int i;
+ for (i = 0; i < 8; ++i, dst += stride) {
+ _mm_storel_epi64((__m128i *)dst, dc_dup);
+ }
+}
+
+// Shared with DC 8xh
+static INLINE __m128i dc_sum_8(const uint16_t *ref) {
+ const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref);
+ const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8));
+ const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
+ const __m128i a = _mm_add_epi16(_dcba, _xxdc);
+
+ return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
+}
+
+void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i sum = dc_sum_8(left);
+ const __m128i four = _mm_cvtsi32_si128(4);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
+ (void)above;
+ (void)bd;
+ dc_store_4x8(dst, stride, &dc);
+}
+
+void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i two = _mm_cvtsi32_si128(2);
+ const __m128i sum = dc_sum_4(above);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
+ (void)left;
+ (void)bd;
+ dc_store_4x8(dst, stride, &dc);
+}
+
+void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_4x8(dst, stride, &dc_dup);
+}
+
+// -----------------------------------------------------------------------------
+// 8xh
+
+static INLINE void dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int height,
+ const __m128i *dc) {
+ const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
+ const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
+ int i;
+ for (i = 0; i < height; ++i, dst += stride) {
+ _mm_store_si128((__m128i *)dst, dc_dup);
+ }
+}
+
+// -----------------------------------------------------------------------------
+// DC_TOP
+
+static INLINE void dc_top_predictor_8xh(uint16_t *dst, ptrdiff_t stride,
+ int height, const uint16_t *above) {
+ const __m128i four = _mm_cvtsi32_si128(4);
+ const __m128i sum = dc_sum_8(above);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
+ dc_store_8xh(dst, stride, height, &dc);
+}
+
+void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ dc_top_predictor_8xh(dst, stride, 4, above);
+}
+
+void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ dc_top_predictor_8xh(dst, stride, 8, above);
+}
+
+void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ dc_top_predictor_8xh(dst, stride, 16, above);
+}
+
+// -----------------------------------------------------------------------------
+// DC_LEFT
+
+void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i two = _mm_cvtsi32_si128(2);
+ const __m128i sum = dc_sum_4(left);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
+ (void)above;
+ (void)bd;
+ dc_store_8xh(dst, stride, 4, &dc);
+}
+
+void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i four = _mm_cvtsi32_si128(4);
+ const __m128i sum = dc_sum_8(left);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
+ (void)above;
+ (void)bd;
+ dc_store_8xh(dst, stride, 8, &dc);
+}
+
+// Shared with DC 16xh
+static INLINE __m128i dc_sum_16(const uint16_t *ref) {
+ const __m128i sum_lo = dc_sum_8(ref);
+ const __m128i sum_hi = dc_sum_8(ref + 8);
+ return _mm_add_epi16(sum_lo, sum_hi);
+}
+
+void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i eight = _mm_cvtsi32_si128(8);
+ const __m128i sum = dc_sum_16(left);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
+ (void)above;
+ (void)bd;
+ dc_store_8xh(dst, stride, 16, &dc);
+}
+
+// -----------------------------------------------------------------------------
+// DC_128
+
+static INLINE void dc_128_predictor_8xh(uint16_t *dst, ptrdiff_t stride,
+ int height, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ dc_store_8xh(dst, stride, height, &dc_dup);
+}
+
+void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)above;
+ (void)left;
+ dc_128_predictor_8xh(dst, stride, 4, bd);
+}
+
+void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)above;
+ (void)left;
+ dc_128_predictor_8xh(dst, stride, 8, bd);
+}
+
+void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)above;
+ (void)left;
+ dc_128_predictor_8xh(dst, stride, 16, bd);
+}
+
+// -----------------------------------------------------------------------------
+// 16xh
+
+static INLINE void dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int height,
+ const __m128i *dc) {
+ const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
+ const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
+ int i;
+ for (i = 0; i < height; ++i, dst += stride) {
+ _mm_store_si128((__m128i *)dst, dc_dup);
+ _mm_store_si128((__m128i *)(dst + 8), dc_dup);
+ }
+}
+
+// -----------------------------------------------------------------------------
+// DC_LEFT
+
+void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i four = _mm_cvtsi32_si128(4);
+ const __m128i sum = dc_sum_8(left);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
+ (void)above;
+ (void)bd;
+ dc_store_16xh(dst, stride, 8, &dc);
+}
+
+void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i eight = _mm_cvtsi32_si128(8);
+ const __m128i sum = dc_sum_16(left);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
+ (void)above;
+ (void)bd;
+ dc_store_16xh(dst, stride, 16, &dc);
+}
+
+// Shared with 32xh
+static INLINE __m128i dc_sum_32(const uint16_t *ref) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i sum_a = dc_sum_16(ref);
+ const __m128i sum_b = dc_sum_16(ref + 16);
+  // At 12-bit bd the total of 32 samples can reach 32 * 4095 = 131040, which
+  // overflows 16 bits, so widen to 32 bits before adding the final total.
+ return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero),
+ _mm_unpacklo_epi16(sum_b, zero));
+}
+
+void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i sixteen = _mm_cvtsi32_si128(16);
+ const __m128i sum = dc_sum_32(left);
+ const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
+ (void)above;
+ (void)bd;
+ dc_store_16xh(dst, stride, 32, &dc);
+}
+
+// -----------------------------------------------------------------------------
+// DC_TOP
+
+void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i eight = _mm_cvtsi32_si128(8);
+ const __m128i sum = dc_sum_16(above);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
+ (void)left;
+ (void)bd;
+ dc_store_16xh(dst, stride, 8, &dc);
+}
+
+void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i eight = _mm_cvtsi32_si128(8);
+ const __m128i sum = dc_sum_16(above);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
+ (void)left;
+ (void)bd;
+ dc_store_16xh(dst, stride, 16, &dc);
+}
+
+void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i eight = _mm_cvtsi32_si128(8);
+ const __m128i sum = dc_sum_16(above);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
+ (void)left;
+ (void)bd;
+ dc_store_16xh(dst, stride, 32, &dc);
+}
+
+// -----------------------------------------------------------------------------
+// DC_128
+
+void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_16xh(dst, stride, 8, &dc_dup);
+}
+
+void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_16xh(dst, stride, 16, &dc_dup);
+}
+
+void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_16xh(dst, stride, 32, &dc_dup);
+}
+
+// -----------------------------------------------------------------------------
+// 32xh
+
+static INLINE void dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int height,
+ const __m128i *dc) {
+ const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
+ const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
+ int i;
+ for (i = 0; i < height; ++i, dst += stride) {
+ _mm_store_si128((__m128i *)dst, dc_dup);
+ _mm_store_si128((__m128i *)(dst + 8), dc_dup);
+ _mm_store_si128((__m128i *)(dst + 16), dc_dup);
+ _mm_store_si128((__m128i *)(dst + 24), dc_dup);
+ }
+}
+
+void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i eight = _mm_cvtsi32_si128(8);
+ const __m128i sum = dc_sum_16(left);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
+ (void)above;
+ (void)bd;
+ dc_store_32xh(dst, stride, 16, &dc);
+}
+
+void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i sixteen = _mm_cvtsi32_si128(16);
+ const __m128i sum = dc_sum_32(left);
+ const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
+ (void)above;
+ (void)bd;
+ dc_store_32xh(dst, stride, 32, &dc);
+}
+
+void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i sixteen = _mm_cvtsi32_si128(16);
+ const __m128i sum = dc_sum_32(above);
+ const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
+ (void)left;
+ (void)bd;
+ dc_store_32xh(dst, stride, 16, &dc);
+}
+
+void aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_32xh(dst, stride, 16, &dc_dup);
+}
+
+void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i sixteen = _mm_cvtsi32_si128(16);
+ const __m128i sum = dc_sum_32(above);
+ const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
+ (void)left;
+ (void)bd;
+ dc_store_32xh(dst, stride, 32, &dc);
+}
+
+void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_32xh(dst, stride, 32, &dc_dup);
+}
+
+// -----------------------------------------------------------------------------
+// V_PRED
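+// Vertical prediction: every output row is a copy of the above row.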
+
+void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ const __m128i above_u16 = _mm_loadl_epi64((const __m128i *)above);
+ int i;
+ for (i = 0; i < 2; ++i) {
+ _mm_storel_epi64((__m128i *)dst, above_u16);
+ _mm_storel_epi64((__m128i *)(dst + stride), above_u16);
+ _mm_storel_epi64((__m128i *)(dst + 2 * stride), above_u16);
+ _mm_storel_epi64((__m128i *)(dst + 3 * stride), above_u16);
+ dst += stride << 2;
+ }
+}
+
+void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ const __m128i above_u16 = _mm_load_si128((const __m128i *)above);
+ _mm_store_si128((__m128i *)dst, above_u16);
+ _mm_store_si128((__m128i *)(dst + stride), above_u16);
+ _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16);
+ _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16);
+}
+
+void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ const __m128i above_u16 = _mm_load_si128((const __m128i *)above);
+ int i;
+ for (i = 0; i < 4; ++i) {
+ _mm_store_si128((__m128i *)dst, above_u16);
+ _mm_store_si128((__m128i *)(dst + stride), above_u16);
+ _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16);
+ _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16);
+ dst += stride << 2;
+ }
+}
+
+void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ const __m128i above0_u16 = _mm_load_si128((const __m128i *)above);
+ const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8));
+ int i;
+ for (i = 0; i < 2; ++i) {
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ dst += stride;
+ }
+}
+
+void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ const __m128i above0_u16 = _mm_load_si128((const __m128i *)above);
+ const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8));
+ int i;
+ for (i = 0; i < 8; ++i) {
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ dst += stride;
+ }
+}
+
+void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ const __m128i above0_u16 = _mm_load_si128((const __m128i *)above);
+ const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8));
+ const __m128i above2_u16 = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i above3_u16 = _mm_load_si128((const __m128i *)(above + 24));
+ int i;
+ for (i = 0; i < 4; ++i) {
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ _mm_store_si128((__m128i *)(dst + 16), above2_u16);
+ _mm_store_si128((__m128i *)(dst + 24), above3_u16);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ _mm_store_si128((__m128i *)(dst + 16), above2_u16);
+ _mm_store_si128((__m128i *)(dst + 24), above3_u16);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ _mm_store_si128((__m128i *)(dst + 16), above2_u16);
+ _mm_store_si128((__m128i *)(dst + 24), above3_u16);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ _mm_store_si128((__m128i *)(dst + 16), above2_u16);
+ _mm_store_si128((__m128i *)(dst + 24), above3_u16);
+ dst += stride;
+ }
+}
+
+// -----------------------------------------------------------------------------
+// DC_PRED
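+// DC prediction for rectangular w x h blocks: average the w above and h left
+// samples with rounding, DC = (sum + (w + h) / 2) / (w + h), and broadcast it
+// over the block.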
+
+void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)bd;
+ const __m128i sum_above = dc_sum_4(above);
+ const __m128i sum_left = dc_sum_8(left);
+ const __m128i sum = _mm_add_epi16(sum_above, sum_left);
+ uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
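+  // dc_sum_4/dc_sum_8 leave the total in both of the two lowest word lanes,
+  // so the upper half of the extracted dword also holds above_sum + left_sum;
+  // (sum + 6) / 12 then averages the 12 edge samples with rounding.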
+ sum32 >>= 16;
+ sum32 += 6;
+ sum32 /= 12;
+ const __m128i row = _mm_set1_epi16((int16_t)sum32);
+ int i;
+ for (i = 0; i < 4; ++i) {
+ _mm_storel_epi64((__m128i *)dst, row);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row);
+ dst += stride;
+ }
+}
+
+void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)bd;
+ const __m128i sum_left = dc_sum_4(left);
+ const __m128i sum_above = dc_sum_8(above);
+ const __m128i sum = _mm_add_epi16(sum_above, sum_left);
+ uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
+ sum32 >>= 16;
+ sum32 += 6;
+ sum32 /= 12;
+ const __m128i row = _mm_set1_epi16((int16_t)sum32);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+}
+
+void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)bd;
+ __m128i sum_left = dc_sum_16(left);
+ __m128i sum_above = dc_sum_8(above);
+ const __m128i zero = _mm_setzero_si128();
+ sum_left = _mm_unpacklo_epi16(sum_left, zero);
+ sum_above = _mm_unpacklo_epi16(sum_above, zero);
+ const __m128i sum = _mm_add_epi32(sum_left, sum_above);
+ uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
+ sum32 += 12;
+ sum32 /= 24;
+ const __m128i row = _mm_set1_epi16((int16_t)sum32);
+ int i;
+ for (i = 0; i < 4; ++i) {
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ }
+}
+
+void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)bd;
+ __m128i sum_left = dc_sum_8(left);
+ __m128i sum_above = dc_sum_16(above);
+ const __m128i zero = _mm_setzero_si128();
+ sum_left = _mm_unpacklo_epi16(sum_left, zero);
+ sum_above = _mm_unpacklo_epi16(sum_above, zero);
+ const __m128i sum = _mm_add_epi32(sum_left, sum_above);
+ uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
+ sum32 += 12;
+ sum32 /= 24;
+ const __m128i row = _mm_set1_epi16((int16_t)sum32);
+ int i;
+ for (i = 0; i < 2; ++i) {
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ dst += stride;
+ }
+}
+
+void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)bd;
+ __m128i sum_left = dc_sum_32(left);
+ __m128i sum_above = dc_sum_16(above);
+ const __m128i zero = _mm_setzero_si128();
+ sum_above = _mm_unpacklo_epi16(sum_above, zero);
+ const __m128i sum = _mm_add_epi32(sum_left, sum_above);
+ uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
+ sum32 += 24;
+ sum32 /= 48;
+ const __m128i row = _mm_set1_epi16((int16_t)sum32);
+ int i;
+ for (i = 0; i < 8; ++i) {
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ dst += stride;
+ }
+}
+
+void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)bd;
+ __m128i sum_left = dc_sum_16(left);
+ __m128i sum_above = dc_sum_32(above);
+ const __m128i zero = _mm_setzero_si128();
+ sum_left = _mm_unpacklo_epi16(sum_left, zero);
+ const __m128i sum = _mm_add_epi32(sum_left, sum_above);
+ uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
+ sum32 += 24;
+ sum32 /= 48;
+ const __m128i row = _mm_set1_epi16((int16_t)sum32);
+ int i;
+ for (i = 0; i < 4; ++i) {
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ _mm_store_si128((__m128i *)(dst + 16), row);
+ _mm_store_si128((__m128i *)(dst + 24), row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ _mm_store_si128((__m128i *)(dst + 16), row);
+ _mm_store_si128((__m128i *)(dst + 24), row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ _mm_store_si128((__m128i *)(dst + 16), row);
+ _mm_store_si128((__m128i *)(dst + 24), row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ _mm_store_si128((__m128i *)(dst + 16), row);
+ _mm_store_si128((__m128i *)(dst + 24), row);
+ dst += stride;
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c
new file mode 100644
index 0000000000..c954da94e5
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/common_avx2.h"
+#include "aom_dsp/x86/lpf_common_sse2.h"
+#include "aom/aom_integer.h"
+
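+// These AVX2 entry points simply forward to their SSE2 counterparts.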
+void aom_highbd_lpf_horizontal_14_dual_avx2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_horizontal_14_dual_sse2(s, p, blimit0, limit0, thresh0,
+ blimit1, limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_vertical_14_dual_avx2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_vertical_14_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
+ limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_horizontal_4_dual_avx2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_horizontal_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
+ limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_horizontal_8_dual_avx2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_horizontal_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
+ limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_vertical_4_dual_avx2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_vertical_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
+ limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_vertical_8_dual_avx2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_vertical_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
+ limit1, thresh1, bd);
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
new file mode 100644
index 0000000000..ea7dc6a9e5
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
@@ -0,0 +1,1698 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/lpf_common_sse2.h"
+
+static AOM_FORCE_INLINE void pixel_clamp(const __m128i *min, const __m128i *max,
+ __m128i *pixel) {
+ *pixel = _mm_min_epi16(*pixel, *max);
+ *pixel = _mm_max_epi16(*pixel, *min);
+}
+
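+// |a - b| for unsigned 16-bit lanes: one of the two saturating differences is
+// zero, so OR-ing them gives the absolute difference.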
+static AOM_FORCE_INLINE __m128i abs_diff16(__m128i a, __m128i b) {
+ return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
+}
+
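+// Load the 8-bit blimit/limit/thresh values and shift them left by (bd - 8)
+// so they apply to bd-bit pixels; t80 is the signed-offset bias 1 << (bd - 1).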
+static INLINE void get_limit(const uint8_t *bl, const uint8_t *l,
+ const uint8_t *t, int bd, __m128i *blt,
+ __m128i *lt, __m128i *thr, __m128i *t80_out) {
+ const int shift = bd - 8;
+ const __m128i zero = _mm_setzero_si128();
+
+ __m128i x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero);
+ *blt = _mm_slli_epi16(x, shift);
+
+ x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero);
+ *lt = _mm_slli_epi16(x, shift);
+
+ x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero);
+ *thr = _mm_slli_epi16(x, shift);
+
+ *t80_out = _mm_set1_epi16(1 << (bd - 1));
+}
+
+static INLINE void get_limit_dual(
+ const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0,
+ const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1,
+ int bd, __m128i *blt_out, __m128i *lt_out, __m128i *thr_out,
+ __m128i *t80_out) {
+ const int shift = bd - 8;
+ const __m128i zero = _mm_setzero_si128();
+
+ __m128i x0 =
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit0), zero);
+ __m128i x1 =
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit1), zero);
+ x0 = _mm_unpacklo_epi64(x0, x1);
+ *blt_out = _mm_slli_epi16(x0, shift);
+
+ x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit0), zero);
+ x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit1), zero);
+ x0 = _mm_unpacklo_epi64(x0, x1);
+ *lt_out = _mm_slli_epi16(x0, shift);
+
+ x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh0), zero);
+ x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh1), zero);
+ x0 = _mm_unpacklo_epi64(x0, x1);
+ *thr_out = _mm_slli_epi16(x0, shift);
+
+ *t80_out = _mm_set1_epi16(1 << (bd - 1));
+}
+
+static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch,
+ __m128i *p, __m128i *q) {
+ int i;
+ for (i = 0; i < size; i++) {
+ p[i] = _mm_loadu_si128((__m128i *)(s - (i + 1) * pitch));
+ q[i] = _mm_loadu_si128((__m128i *)(s + i * pitch));
+ }
+}
+
+static INLINE void highbd_filter_mask_dual(const __m128i *p, const __m128i *q,
+ const __m128i *l, const __m128i *bl,
+ __m128i *mask) {
+ __m128i abs_p0q0 = abs_diff16(p[0], q[0]);
+ __m128i abs_p1q1 = abs_diff16(p[1], q[1]);
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i ffff = _mm_set1_epi16((short)0xFFFF);
+
+ __m128i max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl);
+ max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff);
+ max = _mm_and_si128(max, _mm_adds_epu16(*l, one));
+
+ int i;
+ for (i = 1; i < 4; ++i) {
+ max = _mm_max_epi16(max, abs_diff16(p[i], p[i - 1]));
+ max = _mm_max_epi16(max, abs_diff16(q[i], q[i - 1]));
+ }
+ max = _mm_subs_epu16(max, *l);
+ *mask = _mm_cmpeq_epi16(max, zero); // return ~mask
+}
+
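+// Compute the high-edge-variance (hev) mask and the overall filter mask from
+// x p/q pairs, each packed as p|q within one 128-bit register.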
+static INLINE void highbd_hev_filter_mask_x_sse2(__m128i *pq, int x,
+ __m128i *p1p0, __m128i *q1q0,
+ __m128i *abs_p1p0, __m128i *l,
+ __m128i *bl, __m128i *t,
+ __m128i *hev, __m128i *mask) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i ffff = _mm_set1_epi16((short)0xFFFF);
+ __m128i abs_p0q0_p1q1, abs_p0q0, abs_p1q1, abs_q1q0;
+ __m128i max, max01, h;
+
+ *p1p0 = _mm_unpacklo_epi64(pq[0], pq[1]);
+ *q1q0 = _mm_unpackhi_epi64(pq[0], pq[1]);
+
+ abs_p0q0_p1q1 = abs_diff16(*p1p0, *q1q0);
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0_p1q1, abs_p0q0_p1q1);
+ abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
+
+ abs_p1q1 = _mm_srli_si128(abs_p0q0_p1q1, 8);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // divide by 2
+
+ max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl);
+ max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff);
+ // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1;
+ // So taking maximums continues to work:
+ max = _mm_and_si128(max, _mm_adds_epu16(*l, one));
+
+ *abs_p1p0 = abs_diff16(pq[0], pq[1]);
+ abs_q1q0 = _mm_srli_si128(*abs_p1p0, 8);
+ max01 = _mm_max_epi16(*abs_p1p0, abs_q1q0);
+ // mask |= (abs(*p1 - *p0) > limit) * -1;
+ // mask |= (abs(*q1 - *q0) > limit) * -1;
+ h = _mm_subs_epu16(max01, *t);
+
+ *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
+ // replicate for the further "merged variables" usage
+ *hev = _mm_unpacklo_epi64(*hev, *hev);
+
+ max = _mm_max_epi16(max, max01);
+ int i;
+ for (i = 2; i < x; ++i) {
+ max = _mm_max_epi16(max, abs_diff16(pq[i], pq[i - 1]));
+ }
+ max = _mm_max_epi16(max, _mm_srli_si128(max, 8));
+
+ max = _mm_subs_epu16(max, *l);
+ *mask = _mm_cmpeq_epi16(max, zero); // ~mask
+}
+
+static INLINE void flat_mask_internal(const __m128i *th, const __m128i *pq,
+ int start, int end, __m128i *flat) {
+ int i;
+ __m128i max = _mm_max_epi16(abs_diff16(pq[start], pq[0]),
+ abs_diff16(pq[start + 1], pq[0]));
+
+ for (i = start + 2; i < end; ++i) {
+ max = _mm_max_epi16(max, abs_diff16(pq[i], pq[0]));
+ }
+ max = _mm_max_epi16(max, _mm_srli_si128(max, 8));
+
+ __m128i ft;
+ ft = _mm_subs_epu16(max, *th);
+
+ const __m128i zero = _mm_setzero_si128();
+ *flat = _mm_cmpeq_epi16(ft, zero);
+}
+
+static INLINE void flat_mask_internal_dual(const __m128i *th, const __m128i *p,
+ const __m128i *q, int start, int end,
+ __m128i *flat) {
+ int i;
+ __m128i max =
+ _mm_max_epi16(abs_diff16(q[start], q[0]), abs_diff16(p[start], p[0]));
+
+ for (i = start + 1; i < end; ++i) {
+ max = _mm_max_epi16(max, abs_diff16(p[i], p[0]));
+ max = _mm_max_epi16(max, abs_diff16(q[i], q[0]));
+ }
+
+ __m128i ft;
+ ft = _mm_subs_epu16(max, *th);
+
+ const __m128i zero = _mm_setzero_si128();
+ *flat = _mm_cmpeq_epi16(ft, zero);
+}
+
+static INLINE void highbd_flat_mask4_sse2(__m128i *pq, __m128i *flat,
+ __m128i *flat2, int bd) {
+  // flat: check distances 1,2,3 against 0; flat2: distances 4,5,6 against 0
+ __m128i th = _mm_set1_epi16(1);
+ th = _mm_slli_epi16(th, bd - 8);
+ flat_mask_internal(&th, pq, 1, 4, flat);
+ flat_mask_internal(&th, pq, 4, 7, flat2);
+}
+
+static INLINE void highbd_flat_mask4_dual_sse2(const __m128i *p,
+ const __m128i *q, __m128i *flat,
+ __m128i *flat2, int bd) {
+  // flat: check distances 1,2,3 against 0; flat2: distances 4,5,6 against 0
+ __m128i th = _mm_set1_epi16(1);
+ th = _mm_slli_epi16(th, bd - 8);
+ flat_mask_internal_dual(&th, p, q, 1, 4, flat);
+ flat_mask_internal_dual(&th, p, q, 4, 7, flat2);
+}
+
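+// filter4: the narrow loop-filter update of p1/p0 and q0/q1 on packed p1|p0
+// and q1|q0 registers, with intermediates clamped so the outputs stay inside
+// [0, (1 << bd) - 1].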
+static AOM_FORCE_INLINE void highbd_filter4_sse2(__m128i *p1p0, __m128i *q1q0,
+ __m128i *hev, __m128i *mask,
+ __m128i *qs1qs0,
+ __m128i *ps1ps0, __m128i *t80,
+ int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i pmax =
+ _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80);
+ const __m128i pmin = _mm_subs_epi16(zero, *t80);
+
+ const __m128i t3t4 = _mm_set_epi16(3, 3, 3, 3, 4, 4, 4, 4);
+ __m128i ps1ps0_work, qs1qs0_work, work;
+ __m128i filt, filter2filter1, filter2filt, filter1filt;
+
+ ps1ps0_work = _mm_subs_epi16(*p1p0, *t80);
+ qs1qs0_work = _mm_subs_epi16(*q1q0, *t80);
+
+ work = _mm_subs_epi16(ps1ps0_work, qs1qs0_work);
+ pixel_clamp(&pmin, &pmax, &work);
+ filt = _mm_and_si128(_mm_srli_si128(work, 8), *hev);
+
+ filt = _mm_subs_epi16(filt, work);
+ filt = _mm_subs_epi16(filt, work);
+ filt = _mm_subs_epi16(filt, work);
+ // (aom_filter + 3 * (qs0 - ps0)) & mask
+ pixel_clamp(&pmin, &pmax, &filt);
+ filt = _mm_and_si128(filt, *mask);
+ filt = _mm_unpacklo_epi64(filt, filt);
+
+ filter2filter1 = _mm_adds_epi16(filt, t3t4); /* signed_short_clamp */
+ pixel_clamp(&pmin, &pmax, &filter2filter1);
+ filter2filter1 = _mm_srai_epi16(filter2filter1, 3); /* >> 3 */
+
+ filt = _mm_unpacklo_epi64(filter2filter1, filter2filter1);
+
+ // filt >> 1
+ filt = _mm_adds_epi16(filt, one);
+ filt = _mm_srai_epi16(filt, 1);
+ filt = _mm_andnot_si128(*hev, filt);
+
+ filter2filt = _mm_unpackhi_epi64(filter2filter1, filt);
+ filter1filt = _mm_unpacklo_epi64(filter2filter1, filt);
+
+ qs1qs0_work = _mm_subs_epi16(qs1qs0_work, filter1filt);
+ ps1ps0_work = _mm_adds_epi16(ps1ps0_work, filter2filt);
+
+ pixel_clamp(&pmin, &pmax, &qs1qs0_work);
+ pixel_clamp(&pmin, &pmax, &ps1ps0_work);
+
+ *qs1qs0 = _mm_adds_epi16(qs1qs0_work, *t80);
+ *ps1ps0 = _mm_adds_epi16(ps1ps0_work, *t80);
+}
+
+static INLINE void highbd_filter4_dual_sse2(__m128i *p, __m128i *q, __m128i *ps,
+ __m128i *qs, const __m128i *mask,
+ const __m128i *th, int bd,
+ __m128i *t80) {
+ __m128i ps0 = _mm_subs_epi16(p[0], *t80);
+ __m128i ps1 = _mm_subs_epi16(p[1], *t80);
+ __m128i qs0 = _mm_subs_epi16(q[0], *t80);
+ __m128i qs1 = _mm_subs_epi16(q[1], *t80);
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i pmax =
+ _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80);
+
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i pmin = _mm_subs_epi16(zero, *t80);
+ __m128i filter = _mm_subs_epi16(ps1, qs1);
+ pixel_clamp(&pmin, &pmax, &filter);
+
+ // hev_filter
+ __m128i hev;
+ const __m128i abs_p1p0 = abs_diff16(p[1], p[0]);
+ const __m128i abs_q1q0 = abs_diff16(q[1], q[0]);
+ __m128i h = _mm_max_epi16(abs_p1p0, abs_q1q0);
+ h = _mm_subs_epu16(h, *th);
+ const __m128i ffff = _mm_cmpeq_epi16(h, h);
+ hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
+
+ filter = _mm_and_si128(filter, hev);
+
+ const __m128i x = _mm_subs_epi16(qs0, ps0);
+ filter = _mm_adds_epi16(filter, x);
+ filter = _mm_adds_epi16(filter, x);
+ filter = _mm_adds_epi16(filter, x);
+ pixel_clamp(&pmin, &pmax, &filter);
+ filter = _mm_and_si128(filter, *mask);
+ const __m128i t3 = _mm_set1_epi16(3);
+ const __m128i t4 = _mm_set1_epi16(4);
+ __m128i filter1 = _mm_adds_epi16(filter, t4);
+ __m128i filter2 = _mm_adds_epi16(filter, t3);
+ pixel_clamp(&pmin, &pmax, &filter1);
+ pixel_clamp(&pmin, &pmax, &filter2);
+ filter1 = _mm_srai_epi16(filter1, 3);
+ filter2 = _mm_srai_epi16(filter2, 3);
+ qs0 = _mm_subs_epi16(qs0, filter1);
+ pixel_clamp(&pmin, &pmax, &qs0);
+ ps0 = _mm_adds_epi16(ps0, filter2);
+ pixel_clamp(&pmin, &pmax, &ps0);
+ qs[0] = _mm_adds_epi16(qs0, *t80);
+ ps[0] = _mm_adds_epi16(ps0, *t80);
+ filter = _mm_adds_epi16(filter1, one);
+ filter = _mm_srai_epi16(filter, 1);
+ filter = _mm_andnot_si128(hev, filter);
+ qs1 = _mm_subs_epi16(qs1, filter);
+ pixel_clamp(&pmin, &pmax, &qs1);
+ ps1 = _mm_adds_epi16(ps1, filter);
+ pixel_clamp(&pmin, &pmax, &ps1);
+ qs[1] = _mm_adds_epi16(qs1, *t80);
+ ps[1] = _mm_adds_epi16(ps1, *t80);
+}
+
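+// 14-wide loop-filter core: pack each p[i]/q[i] pair into one register, build
+// the hev/filter masks, apply filter4, and where the flat / flat2 masks are
+// set blend in the filter8 and filter14 smoothed values built from running
+// sums.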
+static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2(
+ __m128i *p, __m128i *q, __m128i *pq, const unsigned char *blt,
+ const unsigned char *lt, const unsigned char *thr, int bd) {
+ int i;
+ const __m128i zero = _mm_setzero_si128();
+ __m128i blimit, limit, thresh;
+ __m128i t80;
+ get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80);
+
+ for (i = 0; i < 7; i++) {
+ pq[i] = _mm_unpacklo_epi64(p[i], q[i]);
+ }
+ __m128i mask, hevhev;
+ __m128i p1p0, q1q0, abs_p1p0;
+
+ highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+ &thresh, &hevhev, &mask);
+
+ __m128i ps0ps1, qs0qs1;
+ // filter4
+ highbd_filter4_sse2(&p1p0, &q1q0, &hevhev, &mask, &qs0qs1, &ps0ps1, &t80, bd);
+
+ __m128i flat, flat2;
+ highbd_flat_mask4_sse2(pq, &flat, &flat2, bd);
+
+ flat = _mm_and_si128(flat, mask);
+ flat2 = _mm_and_si128(flat2, flat);
+
+ // replicate for the further "merged variables" usage
+ flat = _mm_unpacklo_epi64(flat, flat);
+ flat2 = _mm_unpacklo_epi64(flat2, flat2);
+
+ // flat and wide flat calculations
+
+  // If flat == 0 then flat2 is zero as well and none of the calculations
+  // below are needed.
+  // With SSE4.1 this test could be: if (0 == _mm_test_all_zeros(flat, ff))
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i flat_p[3], flat_q[3], flat_pq[3];
+ __m128i flat2_p[6], flat2_q[6];
+ __m128i flat2_pq[6];
+ __m128i sum_p6, sum_p3;
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i four = _mm_set1_epi16(4);
+
+ __m128i work0, work0_0, work0_1, sum_p_0;
+ __m128i sum_p = _mm_add_epi16(pq[5], _mm_add_epi16(pq[4], pq[3]));
+ __m128i sum_lp = _mm_add_epi16(pq[0], _mm_add_epi16(pq[2], pq[1]));
+ sum_p = _mm_add_epi16(sum_p, sum_lp);
+
+ __m128i sum_lq = _mm_srli_si128(sum_lp, 8);
+ __m128i sum_q = _mm_srli_si128(sum_p, 8);
+
+ sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
+ sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
+
+ flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq[3], pq[0]));
+ flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0]));
+
+ sum_p6 = _mm_add_epi16(pq[6], pq[6]);
+ sum_p3 = _mm_add_epi16(pq[3], pq[3]);
+
+ sum_q = _mm_sub_epi16(sum_p_0, pq[5]);
+ sum_p = _mm_sub_epi16(sum_p_0, q[5]);
+
+ work0_0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]);
+ work0_1 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0])));
+
+ sum_lq = _mm_sub_epi16(sum_lp, pq[2]);
+ sum_lp = _mm_sub_epi16(sum_lp, q[2]);
+
+ work0 = _mm_add_epi16(sum_p3, pq[1]);
+ flat_p[1] = _mm_add_epi16(sum_lp, work0);
+ flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
+
+ flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
+ flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
+
+ sum_lp = _mm_sub_epi16(sum_lp, q[1]);
+ sum_lq = _mm_sub_epi16(sum_lq, pq[1]);
+
+ sum_p3 = _mm_add_epi16(sum_p3, pq[3]);
+ work0 = _mm_add_epi16(sum_p3, pq[2]);
+
+ flat_p[2] = _mm_add_epi16(sum_lp, work0);
+ flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
+ flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
+
+ int flat2_mask =
+ (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero)));
+ if (flat2_mask) {
+ flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q[0]));
+ flat2_q[0] = _mm_add_epi16(
+ sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq[0]));
+
+ flat2_p[1] = _mm_add_epi16(sum_p, work0_1);
+ flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8));
+
+ flat2_pq[0] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
+ flat2_pq[1] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
+
+ sum_p = _mm_sub_epi16(sum_p, q[4]);
+ sum_q = _mm_sub_epi16(sum_q, pq[4]);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1])));
+ flat2_p[2] = _mm_add_epi16(sum_p, work0);
+ flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[2] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[3]);
+ sum_q = _mm_sub_epi16(sum_q, pq[3]);
+
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2])));
+ flat2_p[3] = _mm_add_epi16(sum_p, work0);
+ flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[3] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[2]);
+ sum_q = _mm_sub_epi16(sum_q, pq[2]);
+
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3])));
+ flat2_p[4] = _mm_add_epi16(sum_p, work0);
+ flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[4] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[1]);
+ sum_q = _mm_sub_epi16(sum_q, pq[1]);
+
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4])));
+ flat2_p[5] = _mm_add_epi16(sum_p, work0);
+ flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[5] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
+ } // flat2
+ // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // highbd_filter8
+ pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
+ pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
+
+ for (i = 0; i < 3; i++) {
+ pq[i] = _mm_andnot_si128(flat, pq[i]);
+ flat_pq[i] = _mm_and_si128(flat, flat_pq[i]);
+ pq[i] = _mm_or_si128(pq[i], flat_pq[i]);
+ }
+
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ if (flat2_mask) {
+ for (i = 0; i < 6; i++) {
+ pq[i] = _mm_andnot_si128(flat2, pq[i]);
+ flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]);
+ pq[i] = _mm_or_si128(pq[i], flat2_pq[i]); // full list of pq values
+ }
+ }
+ } else {
+ pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
+ pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
+ }
+}
+
+void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ __m128i p[7], q[7], pq[7];
+ int i;
+
+ for (i = 0; i < 7; i++) {
+ p[i] = _mm_loadl_epi64((__m128i *)(s - (i + 1) * pitch));
+ q[i] = _mm_loadl_epi64((__m128i *)(s + i * pitch));
+ }
+
+ highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd);
+
+ for (i = 0; i < 6; i++) {
+ _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), pq[i]);
+ _mm_storel_epi64((__m128i *)(s + i * pitch), _mm_srli_si128(pq[i], 8));
+ }
+}
+
+static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2(
+ __m128i *p, __m128i *q, const uint8_t *blt0, const uint8_t *lt0,
+ const uint8_t *thr0, const uint8_t *blt1, const uint8_t *lt1,
+ const uint8_t *thr1, int bd) {
+ __m128i blimit, limit, thresh, t80;
+ const __m128i zero = _mm_setzero_si128();
+
+ get_limit_dual(blt0, lt0, thr0, blt1, lt1, thr1, bd, &blimit, &limit, &thresh,
+ &t80);
+ __m128i mask;
+ highbd_filter_mask_dual(p, q, &limit, &blimit, &mask);
+ __m128i flat, flat2;
+ highbd_flat_mask4_dual_sse2(p, q, &flat, &flat2, bd);
+
+ flat = _mm_and_si128(flat, mask);
+ flat2 = _mm_and_si128(flat2, flat);
+ __m128i ps[2], qs[2];
+ highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh, bd, &t80);
+ // flat and wide flat calculations
+
+  // If flat == 0 then flat2 is zero as well and none of the calculations
+  // below are needed.
+  // With SSE4.1 this test could be: if (0 == _mm_test_all_zeros(flat, ff))
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i flat_p[3], flat_q[3];
+ __m128i flat2_p[6], flat2_q[6];
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i sum_p_0 = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3]));
+ __m128i sum_q = _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[3]));
+ __m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1]));
+ sum_p_0 = _mm_add_epi16(sum_p_0, sum_lp);
+ __m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1]));
+ sum_q = _mm_add_epi16(sum_q, sum_lq);
+ sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p_0, sum_q));
+ sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
+ flat_p[0] =
+ _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3);
+ flat_q[0] =
+ _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])), 3);
+ __m128i sum_p6 = _mm_add_epi16(p[6], p[6]);
+ __m128i sum_q6 = _mm_add_epi16(q[6], q[6]);
+ __m128i sum_p3 = _mm_add_epi16(p[3], p[3]);
+ __m128i sum_q3 = _mm_add_epi16(q[3], q[3]);
+
+ sum_q = _mm_sub_epi16(sum_p_0, p[5]);
+ __m128i sum_p = _mm_sub_epi16(sum_p_0, q[5]);
+
+ sum_lq = _mm_sub_epi16(sum_lp, p[2]);
+ sum_lp = _mm_sub_epi16(sum_lp, q[2]);
+ flat_p[1] =
+ _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3);
+ flat_q[1] =
+ _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3);
+
+ sum_lp = _mm_sub_epi16(sum_lp, q[1]);
+ sum_lq = _mm_sub_epi16(sum_lq, p[1]);
+ sum_p3 = _mm_add_epi16(sum_p3, p[3]);
+ sum_q3 = _mm_add_epi16(sum_q3, q[3]);
+ flat_p[2] =
+ _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3);
+ flat_q[2] =
+ _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3);
+
+ int flat2_mask =
+ (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero)));
+ if (flat2_mask) {
+ flat2_p[0] = _mm_srli_epi16(
+ _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(p[6], p[0]),
+ _mm_add_epi16(p[1], q[0]))),
+ 4);
+ flat2_q[0] = _mm_srli_epi16(
+ _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(q[6], q[0]),
+ _mm_add_epi16(p[0], q[1]))),
+ 4);
+
+ flat2_p[1] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))),
+ 4);
+ flat2_q[1] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))),
+ 4);
+ sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+ sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[4]);
+ sum_q = _mm_sub_epi16(sum_q, p[4]);
+ flat2_p[2] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))),
+ 4);
+ flat2_q[2] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))),
+ 4);
+ sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+ sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[3]);
+ sum_q = _mm_sub_epi16(sum_q, p[3]);
+ flat2_p[3] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))),
+ 4);
+ flat2_q[3] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))),
+ 4);
+ sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+ sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[2]);
+ sum_q = _mm_sub_epi16(sum_q, p[2]);
+ flat2_p[4] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))),
+ 4);
+ flat2_q[4] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))),
+ 4);
+ sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+ sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[1]);
+ sum_q = _mm_sub_epi16(sum_q, p[1]);
+ flat2_p[5] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))),
+ 4);
+ flat2_q[5] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))),
+ 4);
+ }
+ // highbd_filter8
+ int i;
+ for (i = 0; i < 2; i++) {
+ ps[i] = _mm_andnot_si128(flat, ps[i]);
+ flat_p[i] = _mm_and_si128(flat, flat_p[i]);
+ p[i] = _mm_or_si128(ps[i], flat_p[i]);
+ qs[i] = _mm_andnot_si128(flat, qs[i]);
+ flat_q[i] = _mm_and_si128(flat, flat_q[i]);
+ q[i] = _mm_or_si128(qs[i], flat_q[i]);
+ }
+ p[2] = _mm_andnot_si128(flat, p[2]);
+ // p2 remains unchanged if !(flat && mask)
+ flat_p[2] = _mm_and_si128(flat, flat_p[2]);
+ // when (flat && mask)
+ p[2] = _mm_or_si128(p[2], flat_p[2]); // full list of p2 values
+ q[2] = _mm_andnot_si128(flat, q[2]);
+ flat_q[2] = _mm_and_si128(flat, flat_q[2]);
+ q[2] = _mm_or_si128(q[2], flat_q[2]); // full list of q2 values
+
+ for (i = 0; i < 2; i++) {
+ ps[i] = _mm_andnot_si128(flat, ps[i]);
+ flat_p[i] = _mm_and_si128(flat, flat_p[i]);
+ p[i] = _mm_or_si128(ps[i], flat_p[i]);
+ qs[i] = _mm_andnot_si128(flat, qs[i]);
+ flat_q[i] = _mm_and_si128(flat, flat_q[i]);
+ q[i] = _mm_or_si128(qs[i], flat_q[i]);
+ }
+ // highbd_filter16
+ if (flat2_mask) {
+ for (i = 0; i < 6; i++) {
+ // p[i] remains unchanged if !(flat2 && flat && mask)
+ p[i] = _mm_andnot_si128(flat2, p[i]);
+ flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]);
+ // get values for when (flat2 && flat && mask)
+ p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values
+ q[i] = _mm_andnot_si128(flat2, q[i]);
+ flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
+ q[i] = _mm_or_si128(q[i], flat2_q[i]);
+ }
+ }
+ } else {
+ p[0] = ps[0];
+ q[0] = qs[0];
+ p[1] = ps[1];
+ q[1] = qs[1];
+ }
+}
+
+void aom_highbd_lpf_horizontal_14_dual_sse2(
+ uint16_t *s, int pitch, const uint8_t *_blimit0, const uint8_t *_limit0,
+ const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+ const uint8_t *_thresh1, int bd) {
+ __m128i p[7], q[7];
+ int i;
+ load_highbd_pixel(s, 7, pitch, p, q);
+
+ highbd_lpf_internal_14_dual_sse2(p, q, _blimit0, _limit0, _thresh0, _blimit1,
+ _limit1, _thresh1, bd);
+
+ for (i = 0; i < 6; i++) {
+ _mm_storeu_si128((__m128i *)(s - (i + 1) * pitch), p[i]);
+ _mm_storeu_si128((__m128i *)(s + i * pitch), q[i]);
+ }
+}
+
+static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2(
+ __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1,
+ __m128i *q2, __m128i *p1p0_out, __m128i *q1q0_out, const uint8_t *_blimit,
+ const uint8_t *_limit, const uint8_t *_thresh, int bd) {
+ __m128i blimit, limit, thresh;
+ __m128i mask, hev, flat;
+ __m128i pq[3];
+ __m128i p1p0, q1q0, abs_p1p0, ps1ps0, qs1qs0;
+ __m128i flat_p1p0, flat_q0q1;
+
+ pq[0] = _mm_unpacklo_epi64(*p0, *q0);
+ pq[1] = _mm_unpacklo_epi64(*p1, *q1);
+ pq[2] = _mm_unpacklo_epi64(*p2, *q2);
+
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i t80;
+ const __m128i one = _mm_set1_epi16(0x1);
+
+ get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
+
+ highbd_hev_filter_mask_x_sse2(pq, 3, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+ &thresh, &hev, &mask);
+
+ // lp filter
+ highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
+
+ // flat_mask
+ flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_p1p0);
+ flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
+
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+
+ flat = _mm_cmpeq_epi16(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+  // replicate into both halves for later use with the merged p|q variables
+ flat = _mm_unpacklo_epi64(flat, flat);
+
+  // 5-tap filter
+  // needed only if flat != 0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i workp_a, workp_b, workp_c;
+ __m128i pq0x2_pq1, pq1_pq2;
+
+ // op1
+ pq0x2_pq1 =
+ _mm_add_epi16(_mm_add_epi16(pq[0], pq[0]), pq[1]); // p0 *2 + p1
+ pq1_pq2 = _mm_add_epi16(pq[1], pq[2]); // p1 + p2
+ workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four),
+ pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4
+
+ workp_b = _mm_add_epi16(_mm_add_epi16(pq[2], pq[2]), *q0);
+ workp_b =
+ _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
+
+ // op0
+ workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1
+ workp_a = _mm_add_epi16(workp_a,
+ workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
+ workp_b = _mm_unpacklo_epi64(workp_a, workp_b);
+ flat_p1p0 = _mm_srli_epi16(workp_b, 3);
+
+ // oq0
+ workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[2]),
+ pq[1]); // p0 * 2 + p1 + q0 * 2 + q1 + 4
+ workp_b = _mm_srli_si128(pq1_pq2, 8);
+ workp_a = _mm_add_epi16(
+ workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4
+ // workp_shft0 = _mm_srli_epi16(workp_a, 3);
+
+ // oq1
+ workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[1]),
+ pq[0]); // p0 + q0 * 2 + q1 * 2 + q2 + 4
+ workp_b = _mm_add_epi16(*q2, *q2);
+ workp_b =
+ _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4
+
+ workp_a = _mm_unpacklo_epi64(workp_a, workp_b);
+ flat_q0q1 = _mm_srli_epi16(workp_a, 3);
+
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
+ q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
+
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
+ p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+ }
+}
+
+static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2(
+ __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1,
+ __m128i *q2, const unsigned char *_blimit0, const unsigned char *_limit0,
+ const unsigned char *_thresh0, const unsigned char *_blimit1,
+ const unsigned char *_limit1, const unsigned char *_thresh1, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i blimit0, limit0, thresh0;
+ __m128i t80;
+ __m128i mask, flat, work;
+ __m128i abs_p1q1, abs_p0q0, abs_p1p0, abs_p2p1, abs_q1q0, abs_q2q1;
+ __m128i op1, op0, oq0, oq1;
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i one = _mm_set1_epi16(0x1);
+ const __m128i ffff = _mm_cmpeq_epi16(one, one);
+
+ get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
+ &blimit0, &limit0, &thresh0, &t80);
+
+ abs_p2p1 = abs_diff16(*p2, *p1);
+ abs_p1p0 = abs_diff16(*p1, *p0);
+ abs_q1q0 = abs_diff16(*q1, *q0);
+ abs_q2q1 = abs_diff16(*q2, *q1);
+
+ abs_p0q0 = abs_diff16(*p0, *q0);
+ abs_p1q1 = abs_diff16(*p1, *q1);
+
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+ mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
+ mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+ // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1;
+ // So taking maximums continues to work:
+ mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
+
+ mask = _mm_max_epi16(abs_q2q1, mask);
+ work = _mm_max_epi16(abs_p1p0, abs_q1q0);
+ mask = _mm_max_epi16(work, mask);
+ mask = _mm_max_epi16(mask, abs_p2p1);
+ mask = _mm_subs_epu16(mask, limit0);
+ mask = _mm_cmpeq_epi16(mask, zero);
+
+ // lp filter
+ __m128i ps[2], qs[2], p[2], q[2];
+ {
+ p[0] = *p0;
+ p[1] = *p1;
+ q[0] = *q0;
+ q[1] = *q1;
+ // filter_mask and hev_mask
+ highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+ }
+
+ // flat_mask
+ flat = _mm_max_epi16(abs_diff16(*q2, *q0), abs_diff16(*p2, *p0));
+ flat = _mm_max_epi16(flat, work);
+
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+
+ flat = _mm_cmpeq_epi16(flat, zero);
+ flat = _mm_and_si128(flat, mask); // flat & mask
+
+  // 5-tap filter
+  // needed only if flat != 0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+
+ // op1
+ workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0),
+ _mm_add_epi16(*p1, *p1)); // *p0 *2 + *p1 * 2
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
+ *p2); // *p2 + *p0 * 2 + *p1 * 2 + 4
+
+ workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0);
+ workp_shft0 = _mm_add_epi16(
+ workp_a, workp_b); // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4
+ op1 = _mm_srli_epi16(workp_shft0, 3);
+
+ // op0
+ workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1); // *q0 * 2 + *q1
+ workp_a =
+ _mm_add_epi16(workp_a,
+ workp_b); // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4
+ op0 = _mm_srli_epi16(workp_a, 3);
+
+ // oq0
+ workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2),
+ *p1); // *p0 * 2 + *p1 + *q0 * 2 + *q1 + 4
+ workp_b = _mm_add_epi16(*q1, *q2);
+ workp_shft0 = _mm_add_epi16(
+ workp_a, workp_b); // *p0 * 2 + *p1 + *q0 * 2 + *q1 * 2 + *q2 + 4
+ oq0 = _mm_srli_epi16(workp_shft0, 3);
+
+ // oq1
+ workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1),
+ *p0); // *p0 + *q0 * 2 + *q1 * 2 + *q2 + 4
+ workp_b = _mm_add_epi16(*q2, *q2);
+ workp_shft1 = _mm_add_epi16(
+ workp_a, workp_b); // *p0 + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4
+ oq1 = _mm_srli_epi16(workp_shft1, 3);
+
+ qs[0] = _mm_andnot_si128(flat, qs[0]);
+ oq0 = _mm_and_si128(flat, oq0);
+ *q0 = _mm_or_si128(qs[0], oq0);
+
+ qs[1] = _mm_andnot_si128(flat, qs[1]);
+ oq1 = _mm_and_si128(flat, oq1);
+ *q1 = _mm_or_si128(qs[1], oq1);
+
+ ps[0] = _mm_andnot_si128(flat, ps[0]);
+ op0 = _mm_and_si128(flat, op0);
+ *p0 = _mm_or_si128(ps[0], op0);
+
+ ps[1] = _mm_andnot_si128(flat, ps[1]);
+ op1 = _mm_and_si128(flat, op1);
+ *p1 = _mm_or_si128(ps[1], op1);
+ } else {
+ *q0 = qs[0];
+ *q1 = qs[1];
+ *p0 = ps[0];
+ *p1 = ps[1];
+ }
+}
+
+void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int p,
+ const uint8_t *_blimit,
+ const uint8_t *_limit,
+ const uint8_t *_thresh, int bd) {
+ __m128i p2, p1, p0, q0, q1, q2, p1p0_out, q1q0_out;
+
+ p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+ p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+ p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+ q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
+ q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+ q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+
+ highbd_lpf_internal_6_sse2(&p2, &p1, &p0, &q0, &q1, &q2, &p1p0_out, &q1q0_out,
+ _blimit, _limit, _thresh, bd);
+
+ _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0_out, 8));
+ _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0_out);
+ _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0_out);
+ _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0_out, 8));
+}
+
+void aom_highbd_lpf_horizontal_6_dual_sse2(
+ uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+ const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+ const uint8_t *_thresh1, int bd) {
+ __m128i p2, p1, p0, q0, q1, q2;
+
+ p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+ q0 = _mm_loadu_si128((__m128i *)(s + 0 * p));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+ q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+
+ highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0,
+ _limit0, _thresh0, _blimit1, _limit1,
+ _thresh1, bd);
+
+ _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+ _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+}
+
+static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2(
+ __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
+ __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
+ const unsigned char *_blimit, const unsigned char *_limit,
+ const unsigned char *_thresh, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i blimit, limit, thresh;
+ __m128i mask, hev, flat;
+ __m128i pq[4];
+ __m128i p1p0, q1q0, ps1ps0, qs1qs0;
+ __m128i work_a, opq2, flat_p1p0, flat_q0q1;
+
+ pq[0] = _mm_unpacklo_epi64(*p0, *q0);
+ pq[1] = _mm_unpacklo_epi64(*p1, *q1);
+ pq[2] = _mm_unpacklo_epi64(*p2, *q2);
+ pq[3] = _mm_unpacklo_epi64(*p3, *q3);
+
+ __m128i abs_p1p0;
+
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i t80;
+ const __m128i one = _mm_set1_epi16(0x1);
+
+ get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
+
+ highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+ &thresh, &hev, &mask);
+
+ // lp filter
+ highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
+
+ // flat_mask4
+ flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_diff16(pq[3], pq[0]));
+ flat = _mm_max_epi16(abs_p1p0, flat);
+ flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
+
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+
+ flat = _mm_cmpeq_epi16(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+  // replicate into both halves for later use with the merged p|q variables
+ flat = _mm_unpacklo_epi64(flat, flat);
+
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i workp_a, workp_b, workp_c, workp_shft0, workp_shft1;
+    // The constant 'four' is added before the shift to supply the rounding
+    // term of ROUND_POWER_OF_TWO
+
+ // o*p2
+ workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1));
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0);
+ workp_c = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
+ workp_c = _mm_add_epi16(workp_a, workp_c);
+
+ // o*p1
+ workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1);
+ workp_shft0 = _mm_add_epi16(workp_a, workp_b);
+
+ // o*p0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0);
+ workp_shft1 = _mm_add_epi16(workp_a, workp_b);
+
+ flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft1, workp_shft0), 3);
+
+ // oq0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0);
+ workp_shft0 = _mm_add_epi16(workp_a, workp_b);
+
+ // oq1
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1);
+ workp_shft1 = _mm_add_epi16(workp_a, workp_b);
+
+ flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3);
+
+ // oq2
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
+ workp_a = _mm_add_epi16(workp_a, workp_b);
+ opq2 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_c, workp_a), 3);
+
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
+ q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
+
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
+ p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+
+ work_a = _mm_andnot_si128(flat, pq[2]);
+ *p2 = _mm_and_si128(flat, opq2);
+ *p2 = _mm_or_si128(work_a, *p2);
+ *q2 = _mm_srli_si128(*p2, 8);
+ }
+}
+
+static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2(
+ __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
+ __m128i *q1, __m128i *p0, __m128i *q0, const unsigned char *_blimit0,
+ const unsigned char *_limit0, const unsigned char *_thresh0,
+ const unsigned char *_blimit1, const unsigned char *_limit1,
+ const unsigned char *_thresh1, int bd) {
+ __m128i blimit0, limit0, thresh0;
+ __m128i t80;
+ __m128i mask, flat;
+ __m128i work_a, op2, oq2, op1, op0, oq0, oq1;
+ __m128i abs_p1q1, abs_p0q0, work0, work1, work2;
+
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i one = _mm_set1_epi16(0x1);
+ const __m128i ffff = _mm_cmpeq_epi16(one, one);
+
+ get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
+ &blimit0, &limit0, &thresh0, &t80);
+
+ abs_p0q0 = abs_diff16(*p0, *q0);
+ abs_p1q1 = abs_diff16(*p1, *q1);
+
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+ mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
+ mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+ // mask |= (abs(*p0 - q0) * 2 + abs(*p1 - q1) / 2 > blimit) * -1;
+
+ // So taking maximums continues to work:
+ mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
+
+ work0 = _mm_max_epi16(abs_diff16(*p3, *p2), abs_diff16(*p2, *p1));
+ work1 =
+      _mm_max_epi16(abs_diff16(*p1, *p0), abs_diff16(*q1, *q0));  // reused below for flat
+ work0 = _mm_max_epi16(work0, work1);
+ work2 = _mm_max_epi16(abs_diff16(*q2, *q1), abs_diff16(*q2, *q3));
+ work2 = _mm_max_epi16(work2, work0);
+ mask = _mm_max_epi16(work2, mask);
+
+ mask = _mm_subs_epu16(mask, limit0);
+ mask = _mm_cmpeq_epi16(mask, zero);
+
+ // lp filter
+ __m128i ps[2], qs[2], p[2], q[2];
+ {
+ p[0] = *p0;
+ p[1] = *p1;
+ q[0] = *q0;
+ q[1] = *q1;
+ // filter_mask and hev_mask
+ highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+ }
+
+ flat = _mm_max_epi16(abs_diff16(*p2, *p0), abs_diff16(*q2, *q0));
+ flat = _mm_max_epi16(work1, flat);
+ work0 = _mm_max_epi16(abs_diff16(*p3, *p0), abs_diff16(*q3, *q0));
+ flat = _mm_max_epi16(work0, flat);
+
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+ flat = _mm_cmpeq_epi16(flat, zero);
+ flat = _mm_and_si128(flat, mask); // flat & mask
+
+  // filter8: needed only if flat != 0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i workp_a, workp_b;
+    // The constant 'four' is added before the shift to supply the rounding
+    // term of ROUND_POWER_OF_TWO
+
+ // o*p2
+ workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1));
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0);
+ workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
+ op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // o*p1
+ workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1);
+ op1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // o*p0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0);
+ op0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // oq0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0);
+ oq0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // oq1
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1);
+ oq1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // oq2
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
+ oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ qs[0] = _mm_andnot_si128(flat, qs[0]);
+ oq0 = _mm_and_si128(flat, oq0);
+ *q0 = _mm_or_si128(qs[0], oq0);
+
+ qs[1] = _mm_andnot_si128(flat, qs[1]);
+ oq1 = _mm_and_si128(flat, oq1);
+ *q1 = _mm_or_si128(qs[1], oq1);
+
+ ps[0] = _mm_andnot_si128(flat, ps[0]);
+ op0 = _mm_and_si128(flat, op0);
+ *p0 = _mm_or_si128(ps[0], op0);
+
+ ps[1] = _mm_andnot_si128(flat, ps[1]);
+ op1 = _mm_and_si128(flat, op1);
+ *p1 = _mm_or_si128(ps[1], op1);
+
+ work_a = _mm_andnot_si128(flat, *q2);
+ *q2 = _mm_and_si128(flat, oq2);
+ *q2 = _mm_or_si128(work_a, *q2);
+
+ work_a = _mm_andnot_si128(flat, *p2);
+ *p2 = _mm_and_si128(flat, op2);
+ *p2 = _mm_or_si128(work_a, *p2);
+ } else {
+ *q0 = qs[0];
+ *q1 = qs[1];
+ *p0 = ps[0];
+ *p1 = ps[1];
+ }
+}
+
+void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
+ const uint8_t *_blimit,
+ const uint8_t *_limit,
+ const uint8_t *_thresh, int bd) {
+ __m128i p2, p1, p0, q0, q1, q2, p3, q3;
+ __m128i q1q0, p1p0;
+
+ p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
+ q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
+ p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+ q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+ p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+ q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+ p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+ q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
+
+ highbd_lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0,
+ &p1p0, _blimit, _limit, _thresh, bd);
+
+ _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
+ _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
+ _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
+ _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
+ _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
+ _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
+}
+
+void aom_highbd_lpf_horizontal_8_dual_sse2(
+ uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+ const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+ const uint8_t *_thresh1, int bd) {
+ __m128i p2, p1, p0, q0, q1, q2, p3, q3;
+
+ p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+ q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+ p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+ q0 = _mm_loadu_si128((__m128i *)(s + 0 * p));
+
+ highbd_lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0,
+ _blimit0, _limit0, _thresh0, _blimit1,
+ _limit1, _thresh1, bd);
+
+ _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+ _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+ _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+}
+
+static AOM_FORCE_INLINE void highbd_lpf_internal_4_sse2(
+ __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *q1q0_out,
+ __m128i *p1p0_out, const uint8_t *_blimit, const uint8_t *_limit,
+ const uint8_t *_thresh, int bd) {
+ __m128i blimit, limit, thresh;
+ __m128i mask, hev;
+ __m128i p1p0, q1q0;
+ __m128i pq[2];
+
+ __m128i abs_p1p0;
+
+ __m128i t80;
+ get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
+
+ pq[0] = _mm_unpacklo_epi64(*p0, *q0);
+ pq[1] = _mm_unpacklo_epi64(*p1, *q1);
+
+ highbd_hev_filter_mask_x_sse2(pq, 2, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+ &thresh, &hev, &mask);
+
+ highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
+}
+
+static AOM_FORCE_INLINE void highbd_lpf_internal_4_dual_sse2(
+ __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *ps,
+ __m128i *qs, const uint8_t *_blimit0, const uint8_t *_limit0,
+ const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+ const uint8_t *_thresh1, int bd) {
+ __m128i blimit0, limit0, thresh0;
+ __m128i mask, flat;
+ __m128i p[2], q[2];
+
+ const __m128i zero = _mm_setzero_si128();
+ __m128i abs_p0q0 = abs_diff16(*q0, *p0);
+ __m128i abs_p1q1 = abs_diff16(*q1, *p1);
+
+ __m128i abs_p1p0 = abs_diff16(*p1, *p0);
+ __m128i abs_q1q0 = abs_diff16(*q1, *q0);
+
+ const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
+ const __m128i one = _mm_set1_epi16(1);
+
+ __m128i t80;
+
+ get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
+ &blimit0, &limit0, &thresh0, &t80);
+
+ // filter_mask and hev_mask
+ flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
+
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+
+ mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
+ mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+ // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1;
+ // So taking maximums continues to work:
+ mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
+ mask = _mm_max_epi16(flat, mask);
+
+ mask = _mm_subs_epu16(mask, limit0);
+ mask = _mm_cmpeq_epi16(mask, zero);
+
+ p[0] = *p0;
+ p[1] = *p1;
+ q[0] = *q0;
+ q[1] = *q1;
+
+ highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+}
+
+void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
+ const uint8_t *_blimit,
+ const uint8_t *_limit,
+ const uint8_t *_thresh, int bd) {
+ __m128i p1p0, q1q0;
+ __m128i p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+ __m128i p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+ __m128i q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
+ __m128i q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+
+ highbd_lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &q1q0, &p1p0, _blimit, _limit,
+ _thresh, bd);
+
+ _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
+ _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
+ _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
+ _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
+}
+
+void aom_highbd_lpf_horizontal_4_dual_sse2(
+ uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+ const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+ const uint8_t *_thresh1, int bd) {
+ __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+ __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+ __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+ __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+ __m128i ps[2], qs[2];
+
+ highbd_lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, ps, qs, _blimit0, _limit0,
+ _thresh0, _blimit1, _limit1, _thresh1, bd);
+
+ _mm_storeu_si128((__m128i *)(s - 2 * p), ps[1]);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), ps[0]);
+ _mm_storeu_si128((__m128i *)(s + 0 * p), qs[0]);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), qs[1]);
+}
+
+void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ __m128i x0, x1, x2, x3, d0, d1, d2, d3;
+ __m128i p1p0, q1q0;
+ __m128i p1, q1;
+
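+  // Each 64-bit load grabs p1 p0 q0 q1 for one row; the transpose below turns
+  // them into column vectors so the horizontal filter kernel can be reused.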
+ x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
+ x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
+ x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
+ x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
+
+ highbd_transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &d0, &d1, &d2, &d3);
+
+ highbd_lpf_internal_4_sse2(&d0, &d1, &d2, &d3, &q1q0, &p1p0, blimit, limit,
+ thresh, bd);
+
+ p1 = _mm_srli_si128(p1p0, 8);
+ q1 = _mm_srli_si128(q1q0, 8);
+
+ // transpose from 8x4 to 4x8
+ highbd_transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3);
+
+ _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
+ _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
+ _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
+ _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
+}
+
+void aom_highbd_lpf_vertical_4_dual_sse2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+ __m128i ps[2], qs[2];
+
+ x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
+ x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
+ x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
+ x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
+ x4 = _mm_loadl_epi64((__m128i *)(s - 2 + 4 * p));
+ x5 = _mm_loadl_epi64((__m128i *)(s - 2 + 5 * p));
+ x6 = _mm_loadl_epi64((__m128i *)(s - 2 + 6 * p));
+ x7 = _mm_loadl_epi64((__m128i *)(s - 2 + 7 * p));
+
+ highbd_transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1,
+ &d2, &d3);
+
+ highbd_lpf_internal_4_dual_sse2(&d0, &d1, &d2, &d3, ps, qs, blimit0, limit0,
+ thresh0, blimit1, limit1, thresh1, bd);
+
+ highbd_transpose4x8_8x4_sse2(&ps[1], &ps[0], &qs[0], &qs[1], &d0, &d1, &d2,
+ &d3, &d4, &d5, &d6, &d7);
+
+ _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
+ _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
+ _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
+ _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
+ _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4);
+ _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5);
+ _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6);
+ _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7);
+}
+
+void aom_highbd_lpf_vertical_6_sse2(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+ __m128i x3, x2, x1, x0, p0, q0;
+ __m128i p1p0, q1q0;
+
+ x3 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p));
+ x2 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p));
+ x1 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p));
+ x0 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p));
+
+ highbd_transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5,
+ &d6, &d7);
+
+ highbd_lpf_internal_6_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &p1p0, &q1q0, blimit,
+ limit, thresh, bd);
+
+ p0 = _mm_srli_si128(p1p0, 8);
+ q0 = _mm_srli_si128(q1q0, 8);
+
+ highbd_transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3);
+
+ _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
+ _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
+ _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
+ _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
+}
+
+void aom_highbd_lpf_vertical_6_dual_sse2(
+ uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+ const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+ const uint8_t *_thresh1, int bd) {
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128i p0, q0, p1, q1, p2, q2;
+
+ x0 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p));
+ x1 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p));
+ x2 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p));
+ x3 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p));
+ x4 = _mm_loadu_si128((__m128i *)((s - 3) + 4 * p));
+ x5 = _mm_loadu_si128((__m128i *)((s - 3) + 5 * p));
+ x6 = _mm_loadu_si128((__m128i *)((s - 3) + 6 * p));
+ x7 = _mm_loadu_si128((__m128i *)((s - 3) + 7 * p));
+
+ highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p2, &p1,
+ &p0, &q0, &q1, &q2, &d6, &d7);
+
+ highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0,
+ _limit0, _thresh0, _blimit1, _limit1,
+ _thresh1, bd);
+
+ highbd_transpose4x8_8x4_sse2(&p1, &p0, &q0, &q1, &d0, &d1, &d2, &d3, &d4, &d5,
+ &d6, &d7);
+
+ _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
+ _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
+ _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
+ _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
+ _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4);
+ _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5);
+ _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6);
+ _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7);
+}
+
+void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+ __m128i p2, p1, p0, p3, q0;
+ __m128i q1q0, p1p0;
+
+ p3 = _mm_loadu_si128((__m128i *)((s - 4) + 0 * p));
+ p2 = _mm_loadu_si128((__m128i *)((s - 4) + 1 * p));
+ p1 = _mm_loadu_si128((__m128i *)((s - 4) + 2 * p));
+ p0 = _mm_loadu_si128((__m128i *)((s - 4) + 3 * p));
+
+ highbd_transpose4x8_8x4_sse2(&p3, &p2, &p1, &p0, &d0, &d1, &d2, &d3, &d4, &d5,
+ &d6, &d7);
+
+ // Loop filtering
+ highbd_lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0,
+ &p1p0, blimit, limit, thresh, bd);
+
+ p0 = _mm_srli_si128(p1p0, 8);
+ q0 = _mm_srli_si128(q1q0, 8);
+
+ highbd_transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0,
+ &d1, &d2, &d3);
+
+ _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), d0);
+ _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), d1);
+ _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), d2);
+ _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), d3);
+}
+
+void aom_highbd_lpf_vertical_8_dual_sse2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+
+ x0 = _mm_loadu_si128((__m128i *)(s - 4 + 0 * p));
+ x1 = _mm_loadu_si128((__m128i *)(s - 4 + 1 * p));
+ x2 = _mm_loadu_si128((__m128i *)(s - 4 + 2 * p));
+ x3 = _mm_loadu_si128((__m128i *)(s - 4 + 3 * p));
+ x4 = _mm_loadu_si128((__m128i *)(s - 4 + 4 * p));
+ x5 = _mm_loadu_si128((__m128i *)(s - 4 + 5 * p));
+ x6 = _mm_loadu_si128((__m128i *)(s - 4 + 6 * p));
+ x7 = _mm_loadu_si128((__m128i *)(s - 4 + 7 * p));
+
+ highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1,
+ &d2, &d3, &d4, &d5, &d6, &d7);
+
+ highbd_lpf_internal_8_dual_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4,
+ blimit0, limit0, thresh0, blimit1, limit1,
+ thresh1, bd);
+
+ highbd_transpose8x8_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &x0, &x1,
+ &x2, &x3, &x4, &x5, &x6, &x7);
+
+ _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), x0);
+ _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), x1);
+ _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), x2);
+ _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), x3);
+ _mm_storeu_si128((__m128i *)(s - 4 + 4 * p), x4);
+ _mm_storeu_si128((__m128i *)(s - 4 + 5 * p), x5);
+ _mm_storeu_si128((__m128i *)(s - 4 + 6 * p), x6);
+ _mm_storeu_si128((__m128i *)(s - 4 + 7 * p), x7);
+}
+
+void aom_highbd_lpf_vertical_14_sse2(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ __m128i q[7], p[7], pq[7];
+ __m128i p6, p5, p4, p3;
+ __m128i p6_2, p5_2, p4_2, p3_2;
+ __m128i d0, d1, d2, d3;
+ __m128i d0_2, d1_2, d2_2, d3_2, d7_2;
+
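+  // Transpose 4 rows from each side of the vertical edge into p[]/q[] column
+  // vectors, run the horizontal filter kernel, then transpose the results
+  // back before storing.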
+ p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch));
+ p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch));
+ p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch));
+ p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch));
+
+ highbd_transpose4x8_8x4_sse2(&p6, &p5, &p4, &p3, &d0, &p[6], &p[5], &p[4],
+ &p[3], &p[2], &p[1], &p[0]);
+
+ p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch));
+ p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+ p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+ p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
+
+ highbd_transpose4x8_8x4_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &q[0], &q[1], &q[2],
+ &q[3], &q[4], &q[5], &q[6], &d7_2);
+
+ highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd);
+
+ highbd_transpose8x8_low_sse2(&d0, &p[6], &pq[5], &pq[4], &pq[3], &pq[2],
+ &pq[1], &pq[0], &d0, &d1, &d2, &d3);
+
+ q[0] = _mm_srli_si128(pq[0], 8);
+ q[1] = _mm_srli_si128(pq[1], 8);
+ q[2] = _mm_srli_si128(pq[2], 8);
+ q[3] = _mm_srli_si128(pq[3], 8);
+ q[4] = _mm_srli_si128(pq[4], 8);
+ q[5] = _mm_srli_si128(pq[5], 8);
+
+ highbd_transpose8x8_low_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6],
+ &d7_2, &d0_2, &d1_2, &d2_2, &d3_2);
+
+ _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0);
+ _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_2);
+
+ _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1);
+ _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_2);
+
+ _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2);
+ _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_2);
+
+ _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3);
+ _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_2);
+}
+
+void aom_highbd_lpf_vertical_14_dual_sse2(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ __m128i q[7], p[7];
+ __m128i p6, p5, p4, p3, p2, p1, p0, q0;
+ __m128i p6_2, p5_2, p4_2, p3_2, p2_2, p1_2, q0_2, p0_2;
+ __m128i d0, d7;
+ __m128i d0_out, d1_out, d2_out, d3_out, d4_out, d5_out, d6_out, d7_out;
+
+ p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch));
+ p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch));
+ p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch));
+ p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch));
+ p2 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * pitch));
+ p1 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * pitch));
+ p0 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * pitch));
+ q0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * pitch));
+
+ highbd_transpose8x8_sse2(&p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &d0, &p[6],
+ &p[5], &p[4], &p[3], &p[2], &p[1], &p[0]);
+
+ p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch));
+ p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+ p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+ p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
+ p2_2 = _mm_loadu_si128((__m128i *)(s + 4 * pitch));
+ p1_2 = _mm_loadu_si128((__m128i *)(s + 5 * pitch));
+ p0_2 = _mm_loadu_si128((__m128i *)(s + 6 * pitch));
+ q0_2 = _mm_loadu_si128((__m128i *)(s + 7 * pitch));
+
+ highbd_transpose8x8_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &p2_2, &p1_2, &p0_2,
+ &q0_2, &q[0], &q[1], &q[2], &q[3], &q[4], &q[5],
+ &q[6], &d7);
+
+ highbd_lpf_internal_14_dual_sse2(p, q, blimit0, limit0, thresh0, blimit1,
+ limit1, thresh1, bd);
+
+ highbd_transpose8x8_sse2(&d0, &p[6], &p[5], &p[4], &p[3], &p[2], &p[1], &p[0],
+ &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out,
+ &d6_out, &d7_out);
+
+ _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 4 * pitch), d4_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 5 * pitch), d5_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 6 * pitch), d6_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 7 * pitch), d7_out);
+
+ highbd_transpose8x8_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6], &d7,
+ &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out,
+ &d6_out, &d7_out);
+
+ _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_out);
+ _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_out);
+ _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_out);
+ _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_out);
+ _mm_storeu_si128((__m128i *)(s + 4 * pitch), d4_out);
+ _mm_storeu_si128((__m128i *)(s + 5 * pitch), d5_out);
+ _mm_storeu_si128((__m128i *)(s + 6 * pitch), d6_out);
+ _mm_storeu_si128((__m128i *)(s + 7 * pitch), d7_out);
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
new file mode 100644
index 0000000000..950465cf46
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+
+static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+ const __m128i sign = _mm_srai_epi16(*p, 15);
+ const __m128i dc = _mm_unpacklo_epi16(*p, sign);
+ const __m128i ac = _mm_unpackhi_epi16(*p, sign);
+ *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
+}
+
+static INLINE void update_qp(__m256i *qp) {
+ int i;
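+  // Duplicate the upper 128 bits (AC entries) of each parameter register into
+  // both halves; after the first group of 8, only the AC values are needed.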
+ for (i = 0; i < 5; ++i) {
+ qp[i] = _mm256_permute2x128_si256(qp[i], qp[i], 0x11);
+ }
+}
+
+static INLINE void init_qp(const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr, const int16_t *dequant_ptr,
+ const int16_t *quant_shift_ptr, __m256i *qp,
+ int log_scale) {
+ const __m128i zbin = _mm_loadu_si128((const __m128i *)zbin_ptr);
+ const __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
+ const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
+ const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+ const __m128i quant_shift = _mm_loadu_si128((const __m128i *)quant_shift_ptr);
+ init_one_qp(&zbin, &qp[0]);
+ init_one_qp(&round, &qp[1]);
+ init_one_qp(&quant, &qp[2]);
+ init_one_qp(&dequant, &qp[3]);
+ init_one_qp(&quant_shift, &qp[4]);
+ if (log_scale > 0) {
+ const __m256i rnd = _mm256_set1_epi32((int16_t)(1 << (log_scale - 1)));
+ qp[0] = _mm256_add_epi32(qp[0], rnd);
+ qp[0] = _mm256_srai_epi32(qp[0], log_scale);
+
+ qp[1] = _mm256_add_epi32(qp[1], rnd);
+ qp[1] = _mm256_srai_epi32(qp[1], log_scale);
+ }
+ // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when
+ // calculating the zbin mask.
+ qp[0] = _mm256_sub_epi32(qp[0], _mm256_set1_epi32(1));
+}
+
+// Note:
+// Each 32-bit lane of *x is multiplied by the corresponding lane of *y
+// (8 parallel int32 multiplies); each 64-bit product is shifted right by 16
+// and the low 32 bits of the result are returned.
+static INLINE __m256i mm256_mul_shift_epi32(const __m256i *x,
+ const __m256i *y) {
+ __m256i prod_lo = _mm256_mul_epi32(*x, *y);
+ __m256i prod_hi = _mm256_srli_epi64(*x, 32);
+ const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
+ prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
+
+ prod_lo = _mm256_srli_epi64(prod_lo, 16);
+ const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+ prod_lo = _mm256_and_si256(prod_lo, mask);
+ prod_hi = _mm256_srli_epi64(prod_hi, 16);
+
+ prod_hi = _mm256_slli_epi64(prod_hi, 32);
+ return _mm256_or_si256(prod_lo, prod_hi);
+}
+
+static AOM_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan_ptr,
+ __m256i eobmax,
+ __m256i nz_mask) {
+ const __m256i packed_nz_mask = _mm256_packs_epi32(nz_mask, nz_mask);
+ const __m256i packed_nz_mask_perm =
+ _mm256_permute4x64_epi64(packed_nz_mask, 0xD8);
+ const __m256i iscan =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr));
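+  // packed_nz_mask_perm lanes are 0 or -1; subtracting -1 adds 1 to iscan for
+  // nonzero coefficients, so the final eob counts one past the last nonzero
+  // position (0 means all coefficients are zero).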
+ const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, packed_nz_mask_perm);
+ const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, packed_nz_mask_perm);
+ return _mm256_max_epi16(eobmax, nz_iscan);
+}
+
+// Get the max eob from the lower 128 bits.
+static AOM_FORCE_INLINE uint16_t get_max_eob(__m256i eob) {
+ __m256i eob_s;
+ eob_s = _mm256_shuffle_epi32(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 1);
+ eob = _mm256_max_epi16(eob, eob_s);
+ return (uint16_t)_mm256_extract_epi16(eob, 0);
+}
+
+static AOM_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x,
+ const __m256i *y,
+ int log_scale) {
+ __m256i prod_lo = _mm256_mul_epi32(*x, *y);
+ __m256i prod_hi = _mm256_srli_epi64(*x, 32);
+ const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
+ prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
+ prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale);
+ const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+ prod_lo = _mm256_and_si256(prod_lo, mask);
+ prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale);
+ prod_hi = _mm256_slli_epi64(prod_hi, 32);
+ return _mm256_or_si256(prod_lo, prod_hi);
+}
+
+static AOM_FORCE_INLINE void quantize_logscale(
+ const __m256i *qp, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob, int log_scale) {
+ const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+ const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]);
+
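+  // Early out: if no coefficient in this group of 8 lies outside the zero
+  // bin, write zeros and leave eob unchanged.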
+ if (UNLIKELY(_mm256_movemask_epi8(zbin_mask) == 0)) {
+ const __m256i zero = _mm256_setzero_si256();
+ _mm256_storeu_si256((__m256i *)qcoeff, zero);
+ _mm256_storeu_si256((__m256i *)dqcoeff, zero);
+ return;
+ }
+
+ const __m256i tmp_rnd =
+ _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask);
+ // const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
+ const __m256i tmp = mm256_mul_shift_epi32_logscale(&tmp_rnd, &qp[2], 0);
+ const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd);
+ // const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >>
+ // (16 - log_scale + AOM_QM_BITS));
+ const __m256i abs_q =
+ mm256_mul_shift_epi32_logscale(&tmp2, &qp[4], log_scale);
+ const __m256i abs_dq =
+ _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, qp[3]), log_scale);
+ const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+ const __m256i q = _mm256_sign_epi32(abs_q, coeff);
+ const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+
+ _mm256_storeu_si256((__m256i *)qcoeff, q);
+ _mm256_storeu_si256((__m256i *)dqcoeff, dq);
+
+ *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+}
+
+static AOM_FORCE_INLINE void quantize(const __m256i *qp,
+ const tran_low_t *coeff_ptr,
+ const int16_t *iscan_ptr,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ __m256i *eob) {
+ const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+ const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]);
+
+ if (UNLIKELY(_mm256_movemask_epi8(zbin_mask) == 0)) {
+ const __m256i zero = _mm256_setzero_si256();
+ _mm256_storeu_si256((__m256i *)qcoeff, zero);
+ _mm256_storeu_si256((__m256i *)dqcoeff, zero);
+ return;
+ }
+
+ const __m256i tmp_rnd =
+ _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask);
+ const __m256i tmp = mm256_mul_shift_epi32(&tmp_rnd, &qp[2]);
+ const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd);
+ const __m256i abs_q = mm256_mul_shift_epi32(&tmp2, &qp[4]);
+ const __m256i abs_dq = _mm256_mullo_epi32(abs_q, qp[3]);
+ const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+ const __m256i q = _mm256_sign_epi32(abs_q, coeff);
+ const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+
+ _mm256_storeu_si256((__m256i *)qcoeff, q);
+ _mm256_storeu_si256((__m256i *)dqcoeff, dq);
+
+ *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+}
+
+void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ (void)scan;
+ const int step = 8;
+
+ __m256i eob = _mm256_setzero_si256();
+ __m256i qp[5];
+
+ init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 0);
+
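+  // The first group of 8 is quantized with the DC entry still in lane 0;
+  // update_qp() then broadcasts the AC entries for the remaining groups.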
+ quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+
+ update_qp(qp);
+
+ while (n_coeffs > 0) {
+ quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+ }
+
+ *eob_ptr = get_max_eob(eob);
+}
+
+void aom_highbd_quantize_b_32x32_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ (void)scan;
+ const unsigned int step = 8;
+
+ __m256i eob = _mm256_setzero_si256();
+ __m256i qp[5];
+ init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 1);
+
+ quantize_logscale(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob, 1);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+
+ update_qp(qp);
+
+ while (n_coeffs > 0) {
+ quantize_logscale(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob, 1);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+ }
+
+ *eob_ptr = get_max_eob(eob);
+}
+
+void aom_highbd_quantize_b_64x64_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ (void)scan;
+ const int step = 8;
+
+ __m256i eob = _mm256_setzero_si256();
+ __m256i qp[5];
+ init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 2);
+
+ quantize_logscale(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob, 2);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+
+ update_qp(qp);
+
+ while (n_coeffs > 0) {
+ quantize_logscale(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob, 2);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+ }
+
+ *eob_ptr = get_max_eob(eob);
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
new file mode 100644
index 0000000000..3b0c42c4f5
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "config/aom_dsp_rtcd.h"
+
+void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
+ __m128i zbins[2];
+ __m128i nzbins[2];
+
+ zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1],
+ (int)zbin_ptr[0]);
+ zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);
+
+ nzbins[0] = _mm_setzero_si128();
+ nzbins[1] = _mm_setzero_si128();
+ nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
+ nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+ (void)scan;
+
+ memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
+
+ // Pre-scan pass
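+  // Scan groups of 4 coefficients from the end; trailing groups that lie
+  // entirely inside the zero bin cannot produce a nonzero quantized value and
+  // are skipped in the quantization pass.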
+ for (i = ((int)count / 4) - 1; i >= 0; i--) {
+ __m128i coeffs, cmp1, cmp2;
+ int test;
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+ cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+ cmp1 = _mm_and_si128(cmp1, cmp2);
+ test = _mm_movemask_epi8(cmp1);
+ if (test == 0xffff)
+ non_zero_regs--;
+ else
+ break;
+ }
+
+ // Quantization pass:
+ for (i = 0; i < non_zero_regs; i++) {
+ __m128i coeffs, coeffs_sign, tmp1, tmp2;
+ int test;
+ int abs_coeff[4];
+ int coeff_sign[4];
+
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ coeffs_sign = _mm_srai_epi32(coeffs, 31);
+ coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
+ tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
+ tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
+ tmp1 = _mm_or_si128(tmp1, tmp2);
+ test = _mm_movemask_epi8(tmp1);
+ _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
+ _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
+
+ for (j = 0; j < 4; j++) {
+ if (test & (1 << (4 * j))) {
+ int k = 4 * i + j;
+ const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
+ const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
+ qcoeff_ptr[k] =
+ (int)(abs_qcoeff ^ (uint32_t)coeff_sign[j]) - coeff_sign[j];
+ dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
+ if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
+ }
+ }
+ }
+ *eob_ptr = eob_i + 1;
+}
+
+void aom_highbd_quantize_b_32x32_sse2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ __m128i zbins[2];
+ __m128i nzbins[2];
+ int idx = 0;
+ int idx_arr[1024];
+ int i, eob = -1;
+ const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
+ const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
+ (void)scan;
+ zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
+ zbins[1] = _mm_set1_epi32(zbin1_tmp);
+
+ nzbins[0] = _mm_setzero_si128();
+ nzbins[1] = _mm_setzero_si128();
+ nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
+ nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ // Pre-scan pass
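+  // Collect the indices of coefficients that are not strictly inside the zero
+  // bin; only these are quantized in the pass below.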
+ for (i = 0; i < n_coeffs / 4; i++) {
+ __m128i coeffs, cmp1, cmp2;
+ int test;
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+ cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+ cmp1 = _mm_and_si128(cmp1, cmp2);
+ test = _mm_movemask_epi8(cmp1);
+ if (!(test & 0xf)) idx_arr[idx++] = i * 4;
+ if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
+ if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
+ if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
+ }
+
+ // Quantization pass: only process the coefficients selected in
+ // pre-scan pass. Note: idx can be zero.
+ for (i = 0; i < idx; i++) {
+ const int rc = idx_arr[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+ const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+ qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+ if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
+ }
+ *eob_ptr = eob + 1;
+}
+
+void aom_highbd_quantize_b_64x64_sse2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ __m128i zbins[2];
+ __m128i nzbins[2];
+ int idx = 0;
+ int idx_arr[1024];
+ int i, eob = -1;
+ const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 2);
+ const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 2);
+ (void)scan;
+ zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
+ zbins[1] = _mm_set1_epi32(zbin1_tmp);
+
+ nzbins[0] = _mm_setzero_si128();
+ nzbins[1] = _mm_setzero_si128();
+ nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
+ nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ // Pre-scan pass
+ for (i = 0; i < n_coeffs / 4; i++) {
+ __m128i coeffs, cmp1, cmp2;
+ int test;
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+ cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+ cmp1 = _mm_and_si128(cmp1, cmp2);
+ test = _mm_movemask_epi8(cmp1);
+ if (!(test & 0xf)) idx_arr[idx++] = i * 4;
+ if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
+ if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
+ if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
+ }
+
+ // Quantization pass: only process the coefficients selected in
+ // pre-scan pass. Note: idx can be zero.
+ for (i = 0; i < idx; i++) {
+ const int rc = idx_arr[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2);
+ const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 14);
+ qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 4;
+ if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
+ }
+ *eob_ptr = eob + 1;
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm
new file mode 100644
index 0000000000..03839b493c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm
@@ -0,0 +1,344 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
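+; |src - ref| is computed as (src -us ref) | (ref -us src) using unsigned
+; saturating subtracts; pmaddwd against m1 (a vector of word 1s) then sums
+; adjacent absolute differences into dwords.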
+%macro HIGH_PROCESS_4x2x4 5-6 0
+ movh m0, [srcq +%2*2]
+%if %1 == 1
+ movu m4, [ref1q+%3*2]
+ movu m5, [ref2q+%3*2]
+ movu m6, [ref3q+%3*2]
+ movu m7, [ref4q+%3*2]
+ movhps m0, [srcq +%4*2]
+ movhps m4, [ref1q+%5*2]
+ movhps m5, [ref2q+%5*2]
+ movhps m6, [ref3q+%5*2]
+ movhps m7, [ref4q+%5*2]
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m4
+ psubusw m2, m5
+ psubusw m4, m0
+ psubusw m5, m0
+ por m4, m3
+ por m5, m2
+ pmaddwd m4, m1
+ pmaddwd m5, m1
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m6
+ psubusw m2, m7
+ psubusw m6, m0
+ psubusw m7, m0
+ por m6, m3
+ por m7, m2
+ pmaddwd m6, m1
+ pmaddwd m7, m1
+%else
+ movu m2, [ref1q+%3*2]
+ movhps m0, [srcq +%4*2]
+ movhps m2, [ref1q+%5*2]
+ mova m3, m0
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m4, m2
+
+ movu m2, [ref2q+%3*2]
+ mova m3, m0
+ movhps m2, [ref2q+%5*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m5, m2
+
+ movu m2, [ref3q+%3*2]
+ mova m3, m0
+ movhps m2, [ref3q+%5*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m6, m2
+
+ movu m2, [ref4q+%3*2]
+ mova m3, m0
+ movhps m2, [ref4q+%5*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m7, m2
+%endif
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*4]
+ lea ref1q, [ref1q+ref_strideq*4]
+ lea ref2q, [ref2q+ref_strideq*4]
+ lea ref3q, [ref3q+ref_strideq*4]
+ lea ref4q, [ref4q+ref_strideq*4]
+%endif
+%endmacro
+
+; HIGH_PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_8x2x4 5-6 0
+ ; 1st 8 px
+ mova m0, [srcq +%2*2]
+%if %1 == 1
+ movu m4, [ref1q+%3*2]
+ movu m5, [ref2q+%3*2]
+ movu m6, [ref3q+%3*2]
+ movu m7, [ref4q+%3*2]
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m4
+ psubusw m2, m5
+ psubusw m4, m0
+ psubusw m5, m0
+ por m4, m3
+ por m5, m2
+ pmaddwd m4, m1
+ pmaddwd m5, m1
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m6
+ psubusw m2, m7
+ psubusw m6, m0
+ psubusw m7, m0
+ por m6, m3
+ por m7, m2
+ pmaddwd m6, m1
+ pmaddwd m7, m1
+%else
+ mova m3, m0
+ movu m2, [ref1q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m4, m2
+ movu m2, [ref2q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m5, m2
+ movu m2, [ref3q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m6, m2
+ movu m2, [ref4q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m7, m2
+%endif
+
+ ; 2nd 8 px
+ mova m0, [srcq +(%4)*2]
+ mova m3, m0
+ movu m2, [ref1q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m4, m2
+ movu m2, [ref2q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m5, m2
+ movu m2, [ref3q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m6, m2
+ movu m2, [ref4q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*4]
+ lea ref1q, [ref1q+ref_strideq*4]
+ lea ref2q, [ref2q+ref_strideq*4]
+ lea ref3q, [ref3q+ref_strideq*4]
+ lea ref4q, [ref4q+ref_strideq*4]
+%endif
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m7, m2
+%endmacro
+
+; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_16x2x4 5-6 0
+ HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8)
+ HIGH_PROCESS_8x2x4 0, %4, %5, (%4 + 8), (%5 + 8), %6
+%endmacro
+
+; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_32x2x4 5-6 0
+ HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16)
+ HIGH_PROCESS_16x2x4 0, %4, %5, (%4 + 16), (%5 + 16), %6
+%endmacro
+
+; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_64x2x4 5-6 0
+ HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32)
+ HIGH_PROCESS_32x2x4 0, %4, %5, (%4 + 32), (%5 + 32), %6
+%endmacro
+
+; void aom_highbd_sadNxNx4d_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref[4], int ref_stride,
+; uint32_t res[4]);
+; Macro Arguments:
+; 1: Width
+; 2: Height
+; 3: If 0, then normal sad; if 2, then skip every other row
+%macro HIGH_SADNXN4D 2-3 0
+%if %3 == 0 ; normal sad
+%if AOM_ARCH_X86_64
+cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+ res, ref2, ref3, ref4
+%else
+cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+ ref2, ref3, ref4
+%endif ; AOM_ARCH_X86_64
+%else ; %3 == 2, downsample
+%if AOM_ARCH_X86_64
+cglobal highbd_sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+ res, ref2, ref3, ref4
+%else
+cglobal highbd_sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+ ref2, ref3, ref4
+%endif ; AOM_ARCH_X86_64
+%endif ; sad/avg/skip
+
+; set m1 to a vector of word 1s (pmaddwd operand used by the HIGH_PROCESS macros)
+ push srcq
+ mov srcd, 0x00010001
+ movd m1, srcd
+ pshufd m1, m1, 0x0
+ pop srcq
+
+%if %3 == 2 ; skip rows
+ lea src_strided, [2*src_strided]
+ lea ref_strided, [2*ref_strided]
+%endif ; skip rows
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+ mov ref2q, [ref1q+gprsize*1]
+ mov ref3q, [ref1q+gprsize*2]
+ mov ref4q, [ref1q+gprsize*3]
+ mov ref1q, [ref1q+gprsize*0]
+
+; convert byte pointers to short pointers
+ shl srcq, 1
+ shl ref2q, 1
+ shl ref3q, 1
+ shl ref4q, 1
+ shl ref1q, 1
+
+ HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
+%if %3 == 2 ; Downsampling by two
+%define num_rep (%2-8)/4
+%else
+%define num_rep (%2-4)/2
+%endif
+%rep num_rep
+ HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
+%endrep
+%undef num_rep
+ HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
+ ; N.B. HIGH_PROCESS outputs dwords (32 bits)
+ ; so in high bit depth even the smallest width (4) needs 128 bits, i.e. an XMM reg
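+ ; Below, the four accumulators m4..m7 (one per ref) are each reduced to a
+ ; single dword and packed into m4 as {ref1, ref2, ref3, ref4} before the store.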
+ movhlps m0, m4
+ movhlps m1, m5
+ movhlps m2, m6
+ movhlps m3, m7
+ paddd m4, m0
+ paddd m5, m1
+ paddd m6, m2
+ paddd m7, m3
+ punpckldq m4, m5
+ punpckldq m6, m7
+ movhlps m0, m4
+ movhlps m1, m6
+ paddd m4, m0
+ paddd m6, m1
+ punpcklqdq m4, m6
+%if %3 == 2 ; skip rows
+ pslld m4, 1
+%endif
+ movifnidn r4, r4mp
+ movu [r4], m4
+ RET
+%endmacro
+
+
+INIT_XMM sse2
+HIGH_SADNXN4D 64, 64
+HIGH_SADNXN4D 64, 32
+HIGH_SADNXN4D 32, 64
+HIGH_SADNXN4D 32, 32
+HIGH_SADNXN4D 32, 16
+HIGH_SADNXN4D 16, 32
+HIGH_SADNXN4D 16, 16
+HIGH_SADNXN4D 16, 8
+HIGH_SADNXN4D 8, 16
+HIGH_SADNXN4D 8, 8
+HIGH_SADNXN4D 8, 4
+HIGH_SADNXN4D 4, 8
+HIGH_SADNXN4D 4, 4
+HIGH_SADNXN4D 4, 16
+HIGH_SADNXN4D 16, 4
+HIGH_SADNXN4D 8, 32
+HIGH_SADNXN4D 32, 8
+HIGH_SADNXN4D 16, 64
+HIGH_SADNXN4D 64, 16
+
+HIGH_SADNXN4D 64, 64, 2
+HIGH_SADNXN4D 64, 32, 2
+HIGH_SADNXN4D 32, 64, 2
+HIGH_SADNXN4D 32, 32, 2
+HIGH_SADNXN4D 32, 16, 2
+HIGH_SADNXN4D 16, 32, 2
+HIGH_SADNXN4D 16, 16, 2
+HIGH_SADNXN4D 16, 8, 2
+HIGH_SADNXN4D 8, 16, 2
+HIGH_SADNXN4D 8, 8, 2
+HIGH_SADNXN4D 4, 8, 2
+HIGH_SADNXN4D 4, 16, 2
+HIGH_SADNXN4D 8, 32, 2
+HIGH_SADNXN4D 32, 8, 2
+HIGH_SADNXN4D 16, 64, 2
+HIGH_SADNXN4D 64, 16, 2
+
+; Current code cannot handle the case when the height is downsampled to 2
+; HIGH_SADNXN4D 16, 4, 2
+; HIGH_SADNXN4D 8, 4, 2
+; HIGH_SADNXN4D 4, 4, 2
diff --git a/third_party/aom/aom_dsp/x86/highbd_sad_avx2.c b/third_party/aom/aom_dsp/x86/highbd_sad_avx2.c
new file mode 100644
index 0000000000..6c78eeeefb
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_sad_avx2.c
@@ -0,0 +1,720 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_ports/mem.h"
+
+// SAD
+static INLINE unsigned int get_sad_from_mm256_epi32(const __m256i *v) {
+ // input 8 32-bit summation
+ __m128i lo128, hi128;
+ __m256i u = _mm256_srli_si256(*v, 8);
+ u = _mm256_add_epi32(u, *v);
+
+ // 4 32-bit summation
+ hi128 = _mm256_extracti128_si256(u, 1);
+ lo128 = _mm256_castsi256_si128(u);
+ lo128 = _mm_add_epi32(hi128, lo128);
+
+ // 2 32-bit summation
+ hi128 = _mm_srli_si128(lo128, 4);
+ lo128 = _mm_add_epi32(lo128, hi128);
+
+ return (unsigned int)_mm_cvtsi128_si32(lo128);
+}
+
+static INLINE void highbd_sad16x4_core_avx2(__m256i *s, __m256i *r,
+ __m256i *sad_acc) {
+ const __m256i zero = _mm256_setzero_si256();
+ int i;
+ for (i = 0; i < 4; i++) {
+ s[i] = _mm256_sub_epi16(s[i], r[i]);
+ s[i] = _mm256_abs_epi16(s[i]);
+ }
+
+ s[0] = _mm256_add_epi16(s[0], s[1]);
+ s[0] = _mm256_add_epi16(s[0], s[2]);
+ s[0] = _mm256_add_epi16(s[0], s[3]);
+
+ r[0] = _mm256_unpacklo_epi16(s[0], zero);
+ r[1] = _mm256_unpackhi_epi16(s[0], zero);
+
+ r[0] = _mm256_add_epi32(r[0], r[1]);
+ *sad_acc = _mm256_add_epi32(*sad_acc, r[0]);
+}
+
+// If sec_ptr is NULL, calculate the regular SAD. Otherwise, calculate the SAD
+// against the rounding average of ref_ptr and sec_ptr.
+static INLINE void sad16x4(const uint16_t *src_ptr, int src_stride,
+ const uint16_t *ref_ptr, int ref_stride,
+ const uint16_t *sec_ptr, __m256i *sad_acc) {
+ __m256i s[4], r[4];
+ s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
+ s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
+ s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
+ s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
+
+ r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
+ r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
+ r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
+ r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
+
+ if (sec_ptr) {
+ r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
+ r[1] = _mm256_avg_epu16(
+ r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
+ r[2] = _mm256_avg_epu16(
+ r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
+ r[3] = _mm256_avg_epu16(
+ r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
+ }
+ highbd_sad16x4_core_avx2(s, r, sad_acc);
+}
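+
+// A scalar sketch of the quantity sad16x4 adds to *sad_acc (illustrative
+// only; the helper name is hypothetical and the block is compiled out).
+// With a non-NULL sec_ptr the reference is first replaced by the rounding
+// average of ref and sec, matching the _mm256_avg_epu16 path above.
+#if 0
+static unsigned int sad16x4_scalar_sketch(const uint16_t *src, int src_stride,
+                                          const uint16_t *ref, int ref_stride,
+                                          const uint16_t *sec) {
+  unsigned int sad = 0;
+  int r, c;
+  for (r = 0; r < 4; ++r) {
+    for (c = 0; c < 16; ++c) {
+      const int s = src[r * src_stride + c];
+      const int p = sec ? (ref[r * ref_stride + c] + sec[r * 16 + c] + 1) >> 1
+                        : ref[r * ref_stride + c];
+      sad += (s > p) ? (s - p) : (p - s);
+    }
+  }
+  return sad;
+}
+#endif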
+
+static AOM_FORCE_INLINE unsigned int aom_highbd_sad16xN_avx2(int N,
+ const uint8_t *src,
+ int src_stride,
+ const uint8_t *ref,
+ int ref_stride) {
+ const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);
+ const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+ int i;
+ __m256i sad = _mm256_setzero_si256();
+ for (i = 0; i < N; i += 4) {
+ sad16x4(src_ptr, src_stride, ref_ptr, ref_stride, NULL, &sad);
+ src_ptr += src_stride << 2;
+ ref_ptr += ref_stride << 2;
+ }
+ return (unsigned int)get_sad_from_mm256_epi32(&sad);
+}
+
+static void sad32x4(const uint16_t *src_ptr, int src_stride,
+ const uint16_t *ref_ptr, int ref_stride,
+ const uint16_t *sec_ptr, __m256i *sad_acc) {
+ __m256i s[4], r[4];
+ int row_sections = 0;
+
+ while (row_sections < 2) {
+ s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
+ s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
+ s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
+ s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16));
+
+ r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
+ r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
+ r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
+ r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16));
+
+ if (sec_ptr) {
+ r[0] =
+ _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
+ r[1] = _mm256_avg_epu16(
+ r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
+ r[2] = _mm256_avg_epu16(
+ r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
+ r[3] = _mm256_avg_epu16(
+ r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
+ sec_ptr += 32 << 1;
+ }
+ highbd_sad16x4_core_avx2(s, r, sad_acc);
+
+ row_sections += 1;
+ src_ptr += src_stride << 1;
+ ref_ptr += ref_stride << 1;
+ }
+}
+
+static AOM_FORCE_INLINE unsigned int aom_highbd_sad32xN_avx2(int N,
+ const uint8_t *src,
+ int src_stride,
+ const uint8_t *ref,
+ int ref_stride) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ const int left_shift = 2;
+ int i;
+
+ for (i = 0; i < N; i += 4) {
+ sad32x4(srcp, src_stride, refp, ref_stride, NULL, &sad);
+ srcp += src_stride << left_shift;
+ refp += ref_stride << left_shift;
+ }
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+static void sad64x2(const uint16_t *src_ptr, int src_stride,
+ const uint16_t *ref_ptr, int ref_stride,
+ const uint16_t *sec_ptr, __m256i *sad_acc) {
+ __m256i s[4], r[4];
+ int i;
+ for (i = 0; i < 2; i++) {
+ s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
+ s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
+ s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32));
+ s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48));
+
+ r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
+ r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
+ r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32));
+ r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48));
+ if (sec_ptr) {
+ r[0] =
+ _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
+ r[1] = _mm256_avg_epu16(
+ r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
+ r[2] = _mm256_avg_epu16(
+ r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
+ r[3] = _mm256_avg_epu16(
+ r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
+ sec_ptr += 64;
+ }
+ highbd_sad16x4_core_avx2(s, r, sad_acc);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ }
+}
+
+static AOM_FORCE_INLINE unsigned int aom_highbd_sad64xN_avx2(int N,
+ const uint8_t *src,
+ int src_stride,
+ const uint8_t *ref,
+ int ref_stride) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ const int left_shift = 1;
+ int i;
+ for (i = 0; i < N; i += 2) {
+ sad64x2(srcp, src_stride, refp, ref_stride, NULL, &sad);
+ srcp += src_stride << left_shift;
+ refp += ref_stride << left_shift;
+ }
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+static void sad128x1(const uint16_t *src_ptr, const uint16_t *ref_ptr,
+ const uint16_t *sec_ptr, __m256i *sad_acc) {
+ __m256i s[4], r[4];
+ int i;
+ for (i = 0; i < 2; i++) {
+ s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
+ s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
+ s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32));
+ s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48));
+ r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
+ r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
+ r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32));
+ r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48));
+ if (sec_ptr) {
+ r[0] =
+ _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
+ r[1] = _mm256_avg_epu16(
+ r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
+ r[2] = _mm256_avg_epu16(
+ r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
+ r[3] = _mm256_avg_epu16(
+ r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
+ sec_ptr += 64;
+ }
+ highbd_sad16x4_core_avx2(s, r, sad_acc);
+ src_ptr += 64;
+ ref_ptr += 64;
+ }
+}
+
+static AOM_FORCE_INLINE unsigned int aom_highbd_sad128xN_avx2(
+ int N, const uint8_t *src, int src_stride, const uint8_t *ref,
+ int ref_stride) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ int row = 0;
+ while (row < N) {
+ sad128x1(srcp, refp, NULL, &sad);
+ srcp += src_stride;
+ refp += ref_stride;
+ row++;
+ }
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+#define HIGHBD_SADMXN_AVX2(m, n) \
+ unsigned int aom_highbd_sad##m##x##n##_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return aom_highbd_sad##m##xN_avx2(n, src, src_stride, ref, ref_stride); \
+ }
+
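+// The *_skip_* kernels estimate the full-block SAD from every other row:
+// half the rows are processed at doubled stride and the result is doubled.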
+#define HIGHBD_SAD_SKIP_MXN_AVX2(m, n) \
+ unsigned int aom_highbd_sad_skip_##m##x##n##_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * aom_highbd_sad##m##xN_avx2((n / 2), src, 2 * src_stride, ref, \
+ 2 * ref_stride); \
+ }
+
+HIGHBD_SADMXN_AVX2(16, 4)
+HIGHBD_SADMXN_AVX2(16, 8)
+HIGHBD_SADMXN_AVX2(16, 16)
+HIGHBD_SADMXN_AVX2(16, 32)
+HIGHBD_SADMXN_AVX2(16, 64)
+
+HIGHBD_SADMXN_AVX2(32, 8)
+HIGHBD_SADMXN_AVX2(32, 16)
+HIGHBD_SADMXN_AVX2(32, 32)
+HIGHBD_SADMXN_AVX2(32, 64)
+
+HIGHBD_SADMXN_AVX2(64, 16)
+HIGHBD_SADMXN_AVX2(64, 32)
+HIGHBD_SADMXN_AVX2(64, 64)
+HIGHBD_SADMXN_AVX2(64, 128)
+
+HIGHBD_SADMXN_AVX2(128, 64)
+HIGHBD_SADMXN_AVX2(128, 128)
+
+HIGHBD_SAD_SKIP_MXN_AVX2(16, 8)
+HIGHBD_SAD_SKIP_MXN_AVX2(16, 16)
+HIGHBD_SAD_SKIP_MXN_AVX2(16, 32)
+HIGHBD_SAD_SKIP_MXN_AVX2(16, 64)
+
+HIGHBD_SAD_SKIP_MXN_AVX2(32, 8)
+HIGHBD_SAD_SKIP_MXN_AVX2(32, 16)
+HIGHBD_SAD_SKIP_MXN_AVX2(32, 32)
+HIGHBD_SAD_SKIP_MXN_AVX2(32, 64)
+
+HIGHBD_SAD_SKIP_MXN_AVX2(64, 16)
+HIGHBD_SAD_SKIP_MXN_AVX2(64, 32)
+HIGHBD_SAD_SKIP_MXN_AVX2(64, 64)
+HIGHBD_SAD_SKIP_MXN_AVX2(64, 128)
+
+HIGHBD_SAD_SKIP_MXN_AVX2(128, 64)
+HIGHBD_SAD_SKIP_MXN_AVX2(128, 128)
+
+unsigned int aom_highbd_sad16x4_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
+ sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad);
+
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+unsigned int aom_highbd_sad16x8_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
+
+ sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad);
+
+ // Next 4 rows
+ srcp += src_stride << 2;
+ refp += ref_stride << 2;
+ secp += 64;
+ sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad);
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+unsigned int aom_highbd_sad16x16_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ const int left_shift = 3;
+ uint32_t sum = aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ src += src_stride << left_shift;
+ ref += ref_stride << left_shift;
+ second_pred += 16 << left_shift;
+ sum += aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ return sum;
+}
+
+unsigned int aom_highbd_sad16x32_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ const int left_shift = 4;
+ uint32_t sum = aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ src += src_stride << left_shift;
+ ref += ref_stride << left_shift;
+ second_pred += 16 << left_shift;
+ sum += aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ return sum;
+}
+
+unsigned int aom_highbd_sad16x64_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ const int left_shift = 5;
+ uint32_t sum = aom_highbd_sad16x32_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ src += src_stride << left_shift;
+ ref += ref_stride << left_shift;
+ second_pred += 16 << left_shift;
+ sum += aom_highbd_sad16x32_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ return sum;
+}
+
+unsigned int aom_highbd_sad32x8_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
+ const int left_shift = 2;
+ int row_section = 0;
+
+ while (row_section < 2) {
+ sad32x4(srcp, src_stride, refp, ref_stride, secp, &sad);
+ srcp += src_stride << left_shift;
+ refp += ref_stride << left_shift;
+ secp += 32 << left_shift;
+ row_section += 1;
+ }
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+unsigned int aom_highbd_sad32x16_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
+ const int left_shift = 2;
+ int row_section = 0;
+
+ while (row_section < 4) {
+ sad32x4(srcp, src_stride, refp, ref_stride, secp, &sad);
+ srcp += src_stride << left_shift;
+ refp += ref_stride << left_shift;
+ secp += 32 << left_shift;
+ row_section += 1;
+ }
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+unsigned int aom_highbd_sad32x32_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ const int left_shift = 4;
+ uint32_t sum = aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ src += src_stride << left_shift;
+ ref += ref_stride << left_shift;
+ second_pred += 32 << left_shift;
+ sum += aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ return sum;
+}
+
+unsigned int aom_highbd_sad32x64_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ const int left_shift = 5;
+ uint32_t sum = aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ src += src_stride << left_shift;
+ ref += ref_stride << left_shift;
+ second_pred += 32 << left_shift;
+ sum += aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ return sum;
+}
+
+unsigned int aom_highbd_sad64x16_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
+ const int left_shift = 1;
+ int row_section = 0;
+
+ while (row_section < 8) {
+ sad64x2(srcp, src_stride, refp, ref_stride, secp, &sad);
+ srcp += src_stride << left_shift;
+ refp += ref_stride << left_shift;
+ secp += 64 << left_shift;
+ row_section += 1;
+ }
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+unsigned int aom_highbd_sad64x32_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
+ const int left_shift = 1;
+ int row_section = 0;
+
+ while (row_section < 16) {
+ sad64x2(srcp, src_stride, refp, ref_stride, secp, &sad);
+ srcp += src_stride << left_shift;
+ refp += ref_stride << left_shift;
+ secp += 64 << left_shift;
+ row_section += 1;
+ }
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+unsigned int aom_highbd_sad64x64_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ const int left_shift = 5;
+ uint32_t sum = aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ src += src_stride << left_shift;
+ ref += ref_stride << left_shift;
+ second_pred += 64 << left_shift;
+ sum += aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ return sum;
+}
+
+unsigned int aom_highbd_sad64x128_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ const int left_shift = 6;
+ uint32_t sum = aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ src += src_stride << left_shift;
+ ref += ref_stride << left_shift;
+ second_pred += 64 << left_shift;
+ sum += aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ return sum;
+}
+
+unsigned int aom_highbd_sad128x64_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
+ int row = 0;
+ while (row < 64) {
+ sad128x1(srcp, refp, secp, &sad);
+ srcp += src_stride;
+ refp += ref_stride;
+ secp += 16 << 3;
+ row += 1;
+ }
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ unsigned int sum;
+ const int left_shift = 6;
+
+ sum = aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ src += src_stride << left_shift;
+ ref += ref_stride << left_shift;
+ second_pred += 128 << left_shift;
+ sum += aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ return sum;
+}
+
+// SAD 4D
+// Combine 4 __m256i input vectors v to uint32_t result[4]
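+// (In scalar terms, res[i] is the horizontal sum of the eight 32-bit lanes of
+// v[i]; the mask/shift/unpack sequence below reduces all four accumulators at
+// once and stores them in lane order.)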
+static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v,
+ uint32_t *res) {
+ __m256i u0, u1, u2, u3;
+ const __m256i mask = yy_set1_64_from_32i(~0);
+ __m128i sad;
+
+ // 8 32-bit summation
+ u0 = _mm256_srli_si256(v[0], 4);
+ u1 = _mm256_srli_si256(v[1], 4);
+ u2 = _mm256_srli_si256(v[2], 4);
+ u3 = _mm256_srli_si256(v[3], 4);
+
+ u0 = _mm256_add_epi32(u0, v[0]);
+ u1 = _mm256_add_epi32(u1, v[1]);
+ u2 = _mm256_add_epi32(u2, v[2]);
+ u3 = _mm256_add_epi32(u3, v[3]);
+
+ u0 = _mm256_and_si256(u0, mask);
+ u1 = _mm256_and_si256(u1, mask);
+ u2 = _mm256_and_si256(u2, mask);
+ u3 = _mm256_and_si256(u3, mask);
+ // 4 32-bit summation, evenly positioned
+
+ u1 = _mm256_slli_si256(u1, 4);
+ u3 = _mm256_slli_si256(u3, 4);
+
+ u0 = _mm256_or_si256(u0, u1);
+ u2 = _mm256_or_si256(u2, u3);
+ // 8 32-bit summation, interleaved
+
+ u1 = _mm256_unpacklo_epi64(u0, u2);
+ u3 = _mm256_unpackhi_epi64(u0, u2);
+
+ u0 = _mm256_add_epi32(u1, u3);
+ sad = _mm_add_epi32(_mm256_extractf128_si256(u0, 1),
+ _mm256_castsi256_si128(u0));
+ _mm_storeu_si128((__m128i *)res, sad);
+}
+
+static void convert_pointers(const uint8_t *const ref8[],
+ const uint16_t *ref[]) {
+ ref[0] = CONVERT_TO_SHORTPTR(ref8[0]);
+ ref[1] = CONVERT_TO_SHORTPTR(ref8[1]);
+ ref[2] = CONVERT_TO_SHORTPTR(ref8[2]);
+ ref[3] = CONVERT_TO_SHORTPTR(ref8[3]);
+}
+
+static void init_sad(__m256i *s) {
+ s[0] = _mm256_setzero_si256();
+ s[1] = _mm256_setzero_si256();
+ s[2] = _mm256_setzero_si256();
+ s[3] = _mm256_setzero_si256();
+}
+
+static AOM_FORCE_INLINE void aom_highbd_sadMxNxD_avx2(
+ int M, int N, int D, const uint8_t *src, int src_stride,
+ const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]) {
+ __m256i sad_vec[4];
+ const uint16_t *refp[4];
+ const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
+ const uint16_t *srcp;
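+  // Rows consumed per inner call: 1 for M == 128 (sad128x1), 2 for M == 64
+  // (sad64x2), 4 for M == 32 or 16 (sad32x4 / sad16x4).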
+ const int shift_for_rows = (M < 128) + (M < 64);
+ const int row_units = 1 << shift_for_rows;
+ int i, r;
+
+ init_sad(sad_vec);
+ convert_pointers(ref_array, refp);
+
+ for (i = 0; i < D; ++i) {
+ srcp = keep;
+ for (r = 0; r < N; r += row_units) {
+ if (M == 128) {
+ sad128x1(srcp, refp[i], NULL, &sad_vec[i]);
+ } else if (M == 64) {
+ sad64x2(srcp, src_stride, refp[i], ref_stride, NULL, &sad_vec[i]);
+ } else if (M == 32) {
+ sad32x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
+ } else if (M == 16) {
+ sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
+ } else {
+ assert(0);
+ }
+ srcp += src_stride << shift_for_rows;
+ refp[i] += ref_stride << shift_for_rows;
+ }
+ }
+ get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
+}
+
+#define HIGHBD_SAD_MXNX4D_AVX2(m, n) \
+ void aom_highbd_sad##m##x##n##x4d_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ aom_highbd_sadMxNxD_avx2(m, n, 4, src, src_stride, ref_array, ref_stride, \
+ sad_array); \
+ }
+#define HIGHBD_SAD_SKIP_MXNX4D_AVX2(m, n) \
+ void aom_highbd_sad_skip_##m##x##n##x4d_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ aom_highbd_sadMxNxD_avx2(m, (n / 2), 4, src, 2 * src_stride, ref_array, \
+ 2 * ref_stride, sad_array); \
+ sad_array[0] <<= 1; \
+ sad_array[1] <<= 1; \
+ sad_array[2] <<= 1; \
+ sad_array[3] <<= 1; \
+ }
+#define HIGHBD_SAD_MXNX3D_AVX2(m, n) \
+ void aom_highbd_sad##m##x##n##x3d_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ aom_highbd_sadMxNxD_avx2(m, n, 3, src, src_stride, ref_array, ref_stride, \
+ sad_array); \
+ }
+
+HIGHBD_SAD_MXNX4D_AVX2(16, 4)
+HIGHBD_SAD_MXNX4D_AVX2(16, 8)
+HIGHBD_SAD_MXNX4D_AVX2(16, 16)
+HIGHBD_SAD_MXNX4D_AVX2(16, 32)
+HIGHBD_SAD_MXNX4D_AVX2(16, 64)
+
+HIGHBD_SAD_MXNX4D_AVX2(32, 8)
+HIGHBD_SAD_MXNX4D_AVX2(32, 16)
+HIGHBD_SAD_MXNX4D_AVX2(32, 32)
+HIGHBD_SAD_MXNX4D_AVX2(32, 64)
+
+HIGHBD_SAD_MXNX4D_AVX2(64, 16)
+HIGHBD_SAD_MXNX4D_AVX2(64, 32)
+HIGHBD_SAD_MXNX4D_AVX2(64, 64)
+HIGHBD_SAD_MXNX4D_AVX2(64, 128)
+
+HIGHBD_SAD_MXNX4D_AVX2(128, 64)
+HIGHBD_SAD_MXNX4D_AVX2(128, 128)
+
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 8)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 16)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 32)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 64)
+
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 8)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 16)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 32)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 64)
+
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 16)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 32)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 64)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 128)
+
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(128, 64)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(128, 128)
+
+HIGHBD_SAD_MXNX3D_AVX2(16, 4)
+HIGHBD_SAD_MXNX3D_AVX2(16, 8)
+HIGHBD_SAD_MXNX3D_AVX2(16, 16)
+HIGHBD_SAD_MXNX3D_AVX2(16, 32)
+HIGHBD_SAD_MXNX3D_AVX2(16, 64)
+
+HIGHBD_SAD_MXNX3D_AVX2(32, 8)
+HIGHBD_SAD_MXNX3D_AVX2(32, 16)
+HIGHBD_SAD_MXNX3D_AVX2(32, 32)
+HIGHBD_SAD_MXNX3D_AVX2(32, 64)
+
+HIGHBD_SAD_MXNX3D_AVX2(64, 16)
+HIGHBD_SAD_MXNX3D_AVX2(64, 32)
+HIGHBD_SAD_MXNX3D_AVX2(64, 64)
+HIGHBD_SAD_MXNX3D_AVX2(64, 128)
+
+HIGHBD_SAD_MXNX3D_AVX2(128, 64)
+HIGHBD_SAD_MXNX3D_AVX2(128, 128)
diff --git a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm
new file mode 100644
index 0000000000..3dc4e4e0a2
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm
@@ -0,0 +1,524 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; Macro Arguments
+; Arg 1: Width
+; Arg 2: Height
+; Arg 3: Number of general purpose registers: 5, or 7 when the kernel also
+;        uses src_stride3/ref_stride3
+; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
+; Arg 5: Number of xmm registers. 8xh needs 8, others only need 7
+%macro HIGH_SAD_FN 4-5 7
+%if %4 == 0
+%if %3 == 5
+cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%elif %4 == 1 ; avg
+%if %3 == 5
+cglobal highbd_sad%1x%2_avg, 5, 1 + %3, %5, src, src_stride, ref, ref_stride, \
+ second_pred, n_rows
+%else ; %3 == 7
+cglobal highbd_sad%1x%2_avg, 5, AOM_ARCH_X86_64 + %3, %5, src, src_stride, \
+ ref, ref_stride, \
+ second_pred, \
+ src_stride3, ref_stride3
+%if AOM_ARCH_X86_64
+%define n_rowsd r7d
+%else ; x86-32
+%define n_rowsd dword r0m
+%endif ; x86-32/64
+%endif ; %3 == 5/7
+%else ; %4 == 2, skip rows
+%if %3 == 5
+cglobal highbd_sad_skip_%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal highbd_sad_skip_%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%endif ; sad/avg/skip
+%if %4 == 2 ; double the stride if we are skipping rows
+ lea src_strided, [src_strided*2]
+ lea ref_strided, [ref_strided*2]
+%endif
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+%if %3 == 7
+ lea src_stride3q, [src_strideq*3]
+ lea ref_stride3q, [ref_strideq*3]
+%endif ; %3 == 7
+; convert src, ref & second_pred to short ptrs (from byte ptrs)
+ shl srcq, 1
+ shl refq, 1
+%if %4 == 1
+ shl second_predq, 1
+%endif
+%endmacro
+
+; unsigned int aom_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD64XN 1-2 0
+ HIGH_SAD_FN 64, %1, 5, %2
+%if %2 == 2 ; skip rows, so divide number of rows by 2
+ mov n_rowsd, %1/2
+%else
+ mov n_rowsd, %1
+%endif
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ ; first half of each row
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1
+ psubusw m1, [srcq]
+ por m1, m5
+ mova m5, [srcq+16]
+ psubusw m5, m2
+ psubusw m2, [srcq+16]
+ por m2, m5
+ mova m5, [srcq+32]
+ psubusw m5, m3
+ psubusw m3, [srcq+32]
+ por m3, m5
+ mova m5, [srcq+48]
+ psubusw m5, m4
+ psubusw m4, [srcq+48]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ paddd m0, m1
+ paddd m0, m3
+ ; second half of each row
+ movu m1, [refq+64]
+ movu m2, [refq+80]
+ movu m3, [refq+96]
+ movu m4, [refq+112]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq+64]
+ psubusw m5, m1
+ psubusw m1, [srcq+64]
+ por m1, m5
+ mova m5, [srcq+80]
+ psubusw m5, m2
+ psubusw m2, [srcq+80]
+ por m2, m5
+ mova m5, [srcq+96]
+ psubusw m5, m3
+ psubusw m3, [srcq+96]
+ por m3, m5
+ mova m5, [srcq+112]
+ psubusw m5, m4
+ psubusw m4, [srcq+112]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*2]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*2]
+ paddd m0, m3
+
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
+HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
+HIGH_SAD64XN 16 ; highbd_sad64x16_sse2
+HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
+HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
+HIGH_SAD64XN 16, 1 ; highbd_sad64x16_avg_sse2
+HIGH_SAD64XN 64, 2 ; highbd_sad_skip_64x64_sse2
+HIGH_SAD64XN 32, 2 ; highbd_sad_skip_64x32_sse2
+HIGH_SAD64XN 16, 2 ; highbd_sad_skip_64x16_sse2
+
+; unsigned int aom_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD32XN 1-2 0
+ HIGH_SAD_FN 32, %1, 5, %2
+%if %2 == 2 ; skip rows, so divide number of rows by 2
+ mov n_rowsd, %1/2
+%else
+ mov n_rowsd, %1
+%endif
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1
+ psubusw m1, [srcq]
+ por m1, m5
+ mova m5, [srcq+16]
+ psubusw m5, m2
+ psubusw m2, [srcq+16]
+ por m2, m5
+ mova m5, [srcq+32]
+ psubusw m5, m3
+ psubusw m3, [srcq+32]
+ por m3, m5
+ mova m5, [srcq+48]
+ psubusw m5, m4
+ psubusw m4, [srcq+48]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*2]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*2]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
+HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
+HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
+HIGH_SAD32XN 8 ; highbd_sad32x8_sse2
+HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
+HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
+HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
+HIGH_SAD32XN 8, 1 ; highbd_sad32x8_avg_sse2
+HIGH_SAD32XN 64, 2 ; highbd_sad_skip_32x64_sse2
+HIGH_SAD32XN 32, 2 ; highbd_sad_skip_32x32_sse2
+HIGH_SAD32XN 16, 2 ; highbd_sad_skip_32x16_sse2
+HIGH_SAD32XN 8, 2 ; highbd_sad_skip_32x8_sse2
+
+; unsigned int aom_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD16XN 1-2 0
+ HIGH_SAD_FN 16, %1, 5, %2
+%if %2 == 2 ; skip rows, so divide number of rows by 2
+ mov n_rowsd, %1/4
+%else
+ mov n_rowsd, %1/2
+%endif
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+ref_strideq*2]
+ movu m4, [refq+ref_strideq*2+16]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+16]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*2+16]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1
+ psubusw m1, [srcq]
+ por m1, m5
+ mova m5, [srcq+16]
+ psubusw m5, m2
+ psubusw m2, [srcq+16]
+ por m2, m5
+ mova m5, [srcq+src_strideq*2]
+ psubusw m5, m3
+ psubusw m3, [srcq+src_strideq*2]
+ por m3, m5
+ mova m5, [srcq+src_strideq*2+16]
+ psubusw m5, m4
+ psubusw m4, [srcq+src_strideq*2+16]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*4]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD16XN 64 ; highbd_sad16x64_sse2
+HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
+HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
+HIGH_SAD16XN 8 ; highbd_sad16x8_sse2
+HIGH_SAD16XN 4 ; highbd_sad16x4_sse2
+HIGH_SAD16XN 64, 1 ; highbd_sad16x64_avg_sse2
+HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
+HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
+HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2
+HIGH_SAD16XN 4, 1 ; highbd_sad16x4_avg_sse2
+HIGH_SAD16XN 64, 2 ; highbd_sad_skip_16x64_sse2
+HIGH_SAD16XN 32, 2 ; highbd_sad_skip_16x32_sse2
+HIGH_SAD16XN 16, 2 ; highbd_sad_skip_16x16_sse2
+HIGH_SAD16XN 8, 2 ; highbd_sad_skip_16x8_sse2
+; Current code fails when there are only 2 rows
+; HIGH_SAD16XN 4, 2 ; highbd_sad_skip_16x4_sse2
+
+; unsigned int aom_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD8XN 1-2 0
+ HIGH_SAD_FN 8, %1, 7, %2, 8
+%if %2 == 2 ; skip rows, so divide number of rows by 2
+ mov n_rowsd, %1/8
+%else
+ mov n_rowsd, %1/4
+%endif
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+ref_strideq*2]
+ movu m3, [refq+ref_strideq*4]
+ movu m4, [refq+ref_stride3q*2]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m7, m1
+ movu m5, [srcq]
+ psubusw m1, m5
+ psubusw m5, m7
+ por m1, m5
+
+ mova m7, m2
+ movu m5, [srcq+src_strideq*2]
+ psubusw m2, m5
+ psubusw m5, m7
+ por m2, m5
+
+ mova m7, m3
+ movu m5, [srcq+src_strideq*4]
+ psubusw m3, m5
+ psubusw m5, m7
+ por m3, m5
+
+ mova m7, m4
+ movu m5, [srcq+src_stride3q*2]
+ psubusw m4, m5
+ psubusw m5, m7
+ por m4, m5
+
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*8]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*8]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD8XN 32 ; highbd_sad8x32_sse2
+HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
+HIGH_SAD8XN 8 ; highbd_sad8x8_sse2
+HIGH_SAD8XN 4 ; highbd_sad8x4_sse2
+HIGH_SAD8XN 32, 1 ; highbd_sad8x32_avg_sse2
+HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
+HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2
+HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2
+HIGH_SAD8XN 32, 2 ; highbd_sad_skip_8x32_sse2
+HIGH_SAD8XN 16, 2 ; highbd_sad_skip_8x16_sse2
+HIGH_SAD8XN 8, 2 ; highbd_sad_skip_8x8_sse2
+; Current code fails when there are only 2 rows
+; HIGH_SAD8XN 4, 2 ; highbd_sad_skip_8x4_sse2
+
+; unsigned int aom_highbd_sad4x{4,8,16}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD4XN 1-2 0
+ HIGH_SAD_FN 4, %1, 7, %2
+%if %2 == 2 ; skip rows, so divide number of rows by 2
+ mov n_rowsd, %1/8
+%else
+ mov n_rowsd, %1/4
+%endif
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ movq m1, [refq]
+ movq m2, [refq+ref_strideq*2]
+ movq m3, [refq+ref_strideq*4]
+ movq m4, [refq+ref_stride3q*2]
+ punpcklwd m1, m3
+ punpcklwd m2, m4
+%if %2 == 1
+ movq m3, [second_predq+8*0]
+ movq m5, [second_predq+8*2]
+ punpcklwd m3, m5
+ movq m4, [second_predq+8*1]
+ movq m5, [second_predq+8*3]
+ punpcklwd m4, m5
+ lea second_predq, [second_predq+8*4]
+ pavgw m1, m3
+ pavgw m2, m4
+%endif
+ movq m5, [srcq]
+ movq m3, [srcq+src_strideq*4]
+ punpcklwd m5, m3
+ movdqa m3, m1
+ psubusw m1, m5
+ psubusw m5, m3
+ por m1, m5
+ movq m5, [srcq+src_strideq*2]
+ movq m4, [srcq+src_stride3q*2]
+ punpcklwd m5, m4
+ movdqa m4, m2
+ psubusw m2, m5
+ psubusw m5, m4
+ por m2, m5
+ paddw m1, m2
+ movdqa m2, m1
+ punpcklwd m1, m6
+ punpckhwd m2, m6
+ lea refq, [refq+ref_strideq*8]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*8]
+ paddd m0, m2
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD4XN 16 ; highbd_sad4x16_sse2
+HIGH_SAD4XN 8 ; highbd_sad4x8_sse2
+HIGH_SAD4XN 4 ; highbd_sad4x4_sse2
+HIGH_SAD4XN 16, 1 ; highbd_sad4x16_avg_sse2
+HIGH_SAD4XN 8, 1 ; highbd_sad4x8_avg_sse2
+HIGH_SAD4XN 4, 1 ; highbd_sad4x4_avg_sse2
+HIGH_SAD4XN 16, 2 ; highbd_sad_skip_4x16_sse2
+HIGH_SAD4XN 8, 2 ; highbd_sad_skip_4x8_sse2
+; Current code fails when there are only 2 rows
+; HIGH_SAD4XN 4, 2 ; highbd_sad_skip_4x4_sse2
diff --git a/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm
new file mode 100644
index 0000000000..c0ccc182b4
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm
@@ -0,0 +1,1024 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_8: times 8 dw 8
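+; Two-tap bilinear filter table: each 32-byte entry holds a pair of broadcast
+; taps (16,0), (14,2), ..., (2,14). The taps sum to 16, matching the psrlw 4
+; normalization in the filtering loops below.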
+bilin_filter_m_sse2: times 8 dw 16
+ times 8 dw 0
+ times 8 dw 14
+ times 8 dw 2
+ times 8 dw 12
+ times 8 dw 4
+ times 8 dw 10
+ times 8 dw 6
+ times 16 dw 8
+ times 8 dw 6
+ times 8 dw 10
+ times 8 dw 4
+ times 8 dw 12
+ times 8 dw 2
+ times 8 dw 14
+
+SECTION .text
+
+; int aom_highbd_sub_pixel_varianceNxh(const uint16_t *src, ptrdiff_t src_stride,
+;                                      int x_offset, int y_offset,
+;                                      const uint16_t *dst, ptrdiff_t dst_stride,
+;                                      int height, unsigned int *sse);
+;
+; This function returns the SE (sum of differences) and stores the SSE (sum of
+; squared differences) in the given pointer.
+
+%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
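+ ; Accumulates the signed sum of (src - dst) differences into %5 and the sum of
+ ; squared differences into %6, for two 8-word vectors per invocation.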
+ psubw %3, %4
+ psubw %1, %2
+ mova %4, %3 ; make copies to manipulate to calc sum
+ mova %2, %1 ; use originals for calc sse
+ pmaddwd %3, %3
+ paddw %4, %2
+ pmaddwd %1, %1
+ movhlps %2, %4
+ paddd %6, %3
+ paddw %4, %2
+ pxor %2, %2
+ pcmpgtw %2, %4 ; mask for 0 > %4 (sum)
+ punpcklwd %4, %2 ; sign-extend word to dword
+ paddd %6, %1
+ paddd %5, %4
+
+%endmacro
+
+%macro STORE_AND_RET 0
+%if mmsize == 16
+ ; if H=64 and W=16, we have 8 words, each 2(1bit)x64(6bit)x9bit=16bit
+ ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
+ ; We have to sign-extend it before adding the words within the register
+ ; and outputting to a dword.
+ movhlps m3, m7
+ movhlps m4, m6
+ paddd m7, m3
+ paddd m6, m4
+ pshufd m3, m7, 0x1
+ pshufd m4, m6, 0x1
+ paddd m7, m3
+ paddd m6, m4
+ mov r1, ssem ; r1 = unsigned int *sse
+ movd [r1], m7 ; store sse
+ movd eax, m6 ; store sum as return value
+%endif
+ RET
+%endmacro
+
+%macro INC_SRC_BY_SRC_STRIDE 0
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+ add srcq, src_stridemp
+ add srcq, src_stridemp
+%else
+ lea srcq, [srcq + src_strideq*2]
+%endif
+%endmacro
+
+%macro SUBPEL_VARIANCE 1-2 0 ; W
+%define bilin_filter_m bilin_filter_m_sse2
+%define filter_idx_shift 5
+
+
+%if AOM_ARCH_X86_64
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, height, sse
+ %define sec_str sec_strideq
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, height, sse
+ %endif
+ %define block_height heightd
+ %define bilin_filter sseq
+%else
+ %if CONFIG_PIC=1
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, height, sse
+ %define block_height dword heightm
+ %define sec_str sec_stridemp
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, height, sse
+ %define block_height heightd
+ %endif
+
+ ; reuse argument stack space
+ %define g_bilin_filterm x_offsetm
+ %define g_pw_8m y_offsetm
+
+ ; Store bilin_filter and pw_8 location in stack
+ %if GET_GOT_DEFINED == 1
+ GET_GOT eax
+ add esp, 4 ; restore esp
+ %endif
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %else
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, height, sse
+ %define block_height dword heightm
+ %define sec_str sec_stridemp
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, height, sse
+ %define block_height heightd
+ %endif
+
+ %define bilin_filter bilin_filter_m
+ %endif
+%endif
+
+ ASSERT %1 <= 16 ; m6 overflows if w > 16
+ pxor m6, m6 ; sum
+ pxor m7, m7 ; sse
+
+%if %1 < 16
+ sar block_height, 1
+%endif
+%if %2 == 1 ; avg
+ shl sec_str, 1
+%endif
+
+ ; FIXME(rbultje) replace by jumptable?
+ test x_offsetd, x_offsetd
+ jnz .x_nonzero
+ ; x_offset == 0
+ test y_offsetd, y_offsetd
+ jnz .x_zero_y_nonzero
+
+ ; x_offset == 0 && y_offset == 0
+.x_zero_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m2, [srcq + 16]
+ mova m1, [dstq]
+ mova m3, [dstq + 16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m2, [secq+16]
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq + src_strideq*2]
+ mova m1, [dstq]
+ mova m3, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m2, [secq]
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_zero_y_zero_loop
+ STORE_AND_RET
+
+.x_zero_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_zero_y_nonhalf
+
+ ; x_offset == 0 && y_offset == 0.5
+.x_zero_y_half_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m4, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*2+16]
+ mova m2, [dstq]
+ mova m3, [dstq+16]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*4]
+ mova m2, [dstq]
+ mova m3, [dstq+dst_strideq*2]
+ pavgw m0, m1
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_zero_y_half_loop
+ STORE_AND_RET
+
+.x_zero_y_nonhalf:
+ ; x_offset == 0 && y_offset == bilin interpolation
+%if AOM_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if AOM_ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+y_offsetq]
+ mova m9, [bilin_filter+y_offsetq+16]
+ mova m10, [GLOBAL(pw_8)]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32 or mmx
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+.x_zero_y_other_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq + 16]
+ movu m4, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*2+16]
+ mova m2, [dstq]
+ mova m3, [dstq+16]
+ ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
+ ; also do out=in1+((x*(in2-in1)+rnd)>>log2(num)). Total number of
+ ; instructions is the same (5), but it is 1 mul instead of 2, so might be
+ ; slightly faster because of pmullw latency. It would also cut our rodata
+ ; tables in half for this function, and save 1-2 registers on x86-64.
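+ ; (The two forms agree because (num-x)*in1 + x*in2 = num*in1 + x*(in2-in1)
+ ; and, num being a power of two, (num*in1 + r) >> log2(num) = in1 + (r >> log2(num)).
+ ; E.g. num=16, x=4, in1=10, in2=20, rnd=8: (12*10+4*20+8)>>4 = 13 and
+ ; 10 + ((4*(20-10)+8)>>4) = 13.)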
+ pmullw m1, filter_y_a
+ pmullw m5, filter_y_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m1, m5
+ paddw m0, m4
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*4]
+ mova m4, m1
+ mova m2, [dstq]
+ mova m3, [dstq+dst_strideq*2]
+ pmullw m1, filter_y_a
+ pmullw m5, filter_y_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m1, m5
+ paddw m0, m4
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_zero_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonzero:
+ cmp x_offsetd, 8
+ jne .x_nonhalf
+ ; x_offset == 0.5
+ test y_offsetd, y_offsetd
+ jnz .x_half_y_nonzero
+
+ ; x_offset == 0.5 && y_offset == 0
+.x_half_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq + 16]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + 18]
+ mova m2, [dstq]
+ mova m3, [dstq + 16]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq + src_strideq*2]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + src_strideq*2 + 2]
+ mova m2, [dstq]
+ mova m3, [dstq + dst_strideq*2]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_half_y_zero_loop
+ STORE_AND_RET
+
+.x_half_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_half_y_nonhalf
+
+ ; x_offset == 0.5 && y_offset == 0.5
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+ pavgw m1, m3
+.x_half_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq + 16]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + 18]
+ pavgw m2, m4
+ pavgw m3, m5
+ pavgw m0, m2
+ pavgw m1, m3
+ mova m4, [dstq]
+ mova m5, [dstq + 16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+ mova m0, m2
+ mova m1, m3
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+.x_half_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq + src_strideq*2]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + src_strideq*2 + 2]
+ pavgw m2, m4
+ pavgw m3, m5
+ pavgw m0, m2
+ pavgw m2, m3
+ mova m4, [dstq]
+ mova m5, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m2, [secq]
+%endif
+ SUM_SSE m0, m4, m2, m5, m6, m7
+ mova m0, m3
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_half_y_half_loop
+ STORE_AND_RET
+
+.x_half_y_nonhalf:
+ ; x_offset == 0.5 && y_offset == bilin interpolation
+%if AOM_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if AOM_ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+y_offsetq]
+ mova m9, [bilin_filter+y_offsetq+16]
+ mova m10, [GLOBAL(pw_8)]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+ pavgw m1, m3
+.x_half_y_other_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+16]
+ movu m4, [srcq+2]
+ movu m5, [srcq+18]
+ pavgw m2, m4
+ pavgw m3, m5
+ mova m4, m2
+ mova m5, m3
+ pmullw m1, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m1, filter_rnd
+ paddw m1, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ psrlw m1, 4
+ paddw m0, m2
+ mova m2, [dstq]
+ psrlw m0, 4
+ mova m3, [dstq+16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+ mova m0, m4
+ mova m1, m5
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+.x_half_y_other_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+src_strideq*2]
+ movu m4, [srcq+2]
+ movu m5, [srcq+src_strideq*2+2]
+ pavgw m2, m4
+ pavgw m3, m5
+ mova m4, m2
+ mova m5, m3
+ pmullw m4, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m4, filter_rnd
+ paddw m4, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ psrlw m4, 4
+ paddw m0, m2
+ mova m2, [dstq]
+ psrlw m0, 4
+ mova m3, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m4, [secq]
+%endif
+ SUM_SSE m0, m2, m4, m3, m6, m7
+ mova m0, m5
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_half_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf:
+ test y_offsetd, y_offsetd
+ jnz .x_nonhalf_y_nonzero
+
+ ; x_offset == bilin interpolation && y_offset == 0
+%if AOM_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if AOM_ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+.x_other_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ mova m4, [dstq]
+ mova m5, [dstq+16]
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m1, m3
+ paddw m0, m2
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m2, [srcq+2]
+ movu m3, [srcq+src_strideq*2+2]
+ mova m4, [dstq]
+ mova m5, [dstq+dst_strideq*2]
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m1, m3
+ paddw m0, m2
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+
+ lea srcq, [srcq+src_strideq*4]
+ lea dstq, [dstq+dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_other_y_zero_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_nonhalf_y_nonhalf
+
+ ; x_offset == bilin interpolation && y_offset == 0.5
+%if AOM_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if AOM_ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+ lea srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+16]
+ movu m4, [srcq+2]
+ movu m5, [srcq+18]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ mova m4, [dstq]
+ mova m5, [dstq+16]
+ psrlw m2, 4
+ psrlw m3, 4
+ pavgw m0, m2
+ pavgw m1, m3
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+ mova m0, m2
+ mova m1, m3
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m2
+ psrlw m0, 4
+ lea srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+src_strideq*2]
+ movu m4, [srcq+2]
+ movu m5, [srcq+src_strideq*2+2]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ mova m4, [dstq]
+ mova m5, [dstq+dst_strideq*2]
+ psrlw m2, 4
+ psrlw m3, 4
+ pavgw m0, m2
+ pavgw m2, m3
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m2, [secq]
+%endif
+ SUM_SSE m0, m4, m2, m5, m6, m7
+ mova m0, m3
+
+ lea srcq, [srcq+src_strideq*4]
+ lea dstq, [dstq+dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_other_y_half_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf_y_nonhalf:
+; loading the filter - this is the same as in the 8-bit depth path
+%if AOM_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5
+ shl y_offsetd, filter_idx_shift
+%if AOM_ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [bilin_filter+y_offsetq]
+ mova m11, [bilin_filter+y_offsetq+16]
+ mova m12, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_y_a m10
+%define filter_y_b m11
+%define filter_rnd m12
+%else ; x86-32
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+; In this case there is no unused register, so we reuse the src_stride
+; register; src_stride is reloaded from the stack later when it is needed.
+%define tempq src_strideq
+ mov tempq, g_bilin_filterm
+ add x_offsetq, tempq
+ add y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+ add y_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+; end of load filter
+
+ ; x_offset == bilin interpolation && y_offset == bilin interpolation
+%if %1 == 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ movu m1, [srcq+16]
+ movu m3, [srcq+18]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movu m2, [srcq]
+ movu m4, [srcq+2]
+ movu m3, [srcq+16]
+ movu m5, [srcq+18]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ psrlw m2, 4
+ psrlw m3, 4
+ mova m4, m2
+ mova m5, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, m2
+ paddw m1, filter_rnd
+ mova m2, [dstq]
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+ mova m3, [dstq+16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+ mova m0, m4
+ mova m1, m5
+
+ INC_SRC_BY_SRC_STRIDE
+ lea dstq, [dstq + dst_strideq * 2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m2
+ psrlw m0, 4
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movu m2, [srcq]
+ movu m4, [srcq+2]
+ INC_SRC_BY_SRC_STRIDE
+ movu m3, [srcq]
+ movu m5, [srcq+2]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ psrlw m2, 4
+ psrlw m3, 4
+ mova m4, m2
+ mova m5, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m4, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, m2
+ paddw m4, filter_rnd
+ mova m2, [dstq]
+ paddw m4, m3
+ psrlw m0, 4
+ psrlw m4, 4
+ mova m3, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m4, [secq]
+%endif
+ SUM_SSE m0, m2, m4, m3, m6, m7
+ mova m0, m5
+
+ INC_SRC_BY_SRC_STRIDE
+ lea dstq, [dstq + dst_strideq * 4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_other_y_other_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+%endmacro
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 8
+SUBPEL_VARIANCE 16
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
diff --git a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c
new file mode 100644
index 0000000000..3c3253bdf9
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <stddef.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+typedef void (*SubtractWxHFuncType)(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred,
+ ptrdiff_t pred_stride);
+
+static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3;
+ __m128i x0, x1, x2, x3;
+ int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride);
+
+ u0 = _mm_loadl_epi64((__m128i const *)(src + 0 * src_stride));
+ u1 = _mm_loadl_epi64((__m128i const *)(src + 1 * src_stride));
+ u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride));
+ u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride));
+
+ v0 = _mm_loadl_epi64((__m128i const *)(pred + 0 * pred_stride));
+ v1 = _mm_loadl_epi64((__m128i const *)(pred + 1 * pred_stride));
+ v2 = _mm_loadl_epi64((__m128i const *)(pred + 2 * pred_stride));
+ v3 = _mm_loadl_epi64((__m128i const *)(pred + 3 * pred_stride));
+
+ x0 = _mm_sub_epi16(u0, v0);
+ x1 = _mm_sub_epi16(u1, v1);
+ x2 = _mm_sub_epi16(u2, v2);
+ x3 = _mm_sub_epi16(u3, v3);
+
+ _mm_storel_epi64((__m128i *)store_diff, x0);
+ store_diff = (int64_t *)(diff + 1 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x1);
+ store_diff = (int64_t *)(diff + 2 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x2);
+ store_diff = (int64_t *)(diff + 3 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x3);
+}
+
+static void subtract_4x8(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride);
+
+ u0 = _mm_loadl_epi64((__m128i const *)(src + 0 * src_stride));
+ u1 = _mm_loadl_epi64((__m128i const *)(src + 1 * src_stride));
+ u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride));
+ u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride));
+ u4 = _mm_loadl_epi64((__m128i const *)(src + 4 * src_stride));
+ u5 = _mm_loadl_epi64((__m128i const *)(src + 5 * src_stride));
+ u6 = _mm_loadl_epi64((__m128i const *)(src + 6 * src_stride));
+ u7 = _mm_loadl_epi64((__m128i const *)(src + 7 * src_stride));
+
+ v0 = _mm_loadl_epi64((__m128i const *)(pred + 0 * pred_stride));
+ v1 = _mm_loadl_epi64((__m128i const *)(pred + 1 * pred_stride));
+ v2 = _mm_loadl_epi64((__m128i const *)(pred + 2 * pred_stride));
+ v3 = _mm_loadl_epi64((__m128i const *)(pred + 3 * pred_stride));
+ v4 = _mm_loadl_epi64((__m128i const *)(pred + 4 * pred_stride));
+ v5 = _mm_loadl_epi64((__m128i const *)(pred + 5 * pred_stride));
+ v6 = _mm_loadl_epi64((__m128i const *)(pred + 6 * pred_stride));
+ v7 = _mm_loadl_epi64((__m128i const *)(pred + 7 * pred_stride));
+
+ x0 = _mm_sub_epi16(u0, v0);
+ x1 = _mm_sub_epi16(u1, v1);
+ x2 = _mm_sub_epi16(u2, v2);
+ x3 = _mm_sub_epi16(u3, v3);
+ x4 = _mm_sub_epi16(u4, v4);
+ x5 = _mm_sub_epi16(u5, v5);
+ x6 = _mm_sub_epi16(u6, v6);
+ x7 = _mm_sub_epi16(u7, v7);
+
+ _mm_storel_epi64((__m128i *)store_diff, x0);
+ store_diff = (int64_t *)(diff + 1 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x1);
+ store_diff = (int64_t *)(diff + 2 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x2);
+ store_diff = (int64_t *)(diff + 3 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x3);
+ store_diff = (int64_t *)(diff + 4 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x4);
+ store_diff = (int64_t *)(diff + 5 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x5);
+ store_diff = (int64_t *)(diff + 6 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x6);
+ store_diff = (int64_t *)(diff + 7 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x7);
+}
+
+static void subtract_8x4(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3;
+ __m128i x0, x1, x2, x3;
+
+ u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
+ u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
+ u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
+ u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+
+ v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
+ v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
+ v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
+ v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
+
+ x0 = _mm_sub_epi16(u0, v0);
+ x1 = _mm_sub_epi16(u1, v1);
+ x2 = _mm_sub_epi16(u2, v2);
+ x3 = _mm_sub_epi16(u3, v3);
+
+ _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0);
+ _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1);
+ _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2);
+ _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3);
+}
+
+static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+
+ u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
+ u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
+ u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
+ u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+ u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
+ u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
+ u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
+ u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
+
+ v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
+ v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
+ v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
+ v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
+ v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride));
+ v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride));
+ v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride));
+ v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride));
+
+ x0 = _mm_sub_epi16(u0, v0);
+ x1 = _mm_sub_epi16(u1, v1);
+ x2 = _mm_sub_epi16(u2, v2);
+ x3 = _mm_sub_epi16(u3, v3);
+ x4 = _mm_sub_epi16(u4, v4);
+ x5 = _mm_sub_epi16(u5, v5);
+ x6 = _mm_sub_epi16(u6, v6);
+ x7 = _mm_sub_epi16(u7, v7);
+
+ _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0);
+ _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1);
+ _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2);
+ _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3);
+ _mm_storeu_si128((__m128i *)(diff + 4 * diff_stride), x4);
+ _mm_storeu_si128((__m128i *)(diff + 5 * diff_stride), x5);
+ _mm_storeu_si128((__m128i *)(diff + 6 * diff_stride), x6);
+ _mm_storeu_si128((__m128i *)(diff + 7 * diff_stride), x7);
+}
+
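+// Larger block sizes are composed from the kernels above: STACK_V calls the
+// given kernel twice, the second time h rows further down, and STACK_H calls
+// it twice, the second time w columns further right.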
+#define STACK_V(h, fun) \
+ do { \
+ fun(diff, diff_stride, src, src_stride, pred, pred_stride); \
+ fun(diff + diff_stride * h, diff_stride, src + src_stride * h, src_stride, \
+ pred + pred_stride * h, pred_stride); \
+ } while (0)
+
+#define STACK_H(w, fun) \
+ do { \
+ fun(diff, diff_stride, src, src_stride, pred, pred_stride); \
+ fun(diff + w, diff_stride, src + w, src_stride, pred + w, pred_stride); \
+ } while (0)
+
+#define SUBTRACT_FUN(size) \
+ static void subtract_##size(int16_t *diff, ptrdiff_t diff_stride, \
+ const uint16_t *src, ptrdiff_t src_stride, \
+ const uint16_t *pred, ptrdiff_t pred_stride)
+
+SUBTRACT_FUN(8x16) { STACK_V(8, subtract_8x8); }
+SUBTRACT_FUN(16x8) { STACK_H(8, subtract_8x8); }
+SUBTRACT_FUN(16x16) { STACK_V(8, subtract_16x8); }
+SUBTRACT_FUN(16x32) { STACK_V(16, subtract_16x16); }
+SUBTRACT_FUN(32x16) { STACK_H(16, subtract_16x16); }
+SUBTRACT_FUN(32x32) { STACK_V(16, subtract_32x16); }
+SUBTRACT_FUN(32x64) { STACK_V(32, subtract_32x32); }
+SUBTRACT_FUN(64x32) { STACK_H(32, subtract_32x32); }
+SUBTRACT_FUN(64x64) { STACK_V(32, subtract_64x32); }
+SUBTRACT_FUN(64x128) { STACK_V(64, subtract_64x64); }
+SUBTRACT_FUN(128x64) { STACK_H(64, subtract_64x64); }
+SUBTRACT_FUN(128x128) { STACK_V(64, subtract_128x64); }
+SUBTRACT_FUN(4x16) { STACK_V(8, subtract_4x8); }
+SUBTRACT_FUN(16x4) { STACK_H(8, subtract_8x4); }
+SUBTRACT_FUN(8x32) { STACK_V(16, subtract_8x16); }
+SUBTRACT_FUN(32x8) { STACK_H(16, subtract_16x8); }
+SUBTRACT_FUN(16x64) { STACK_V(32, subtract_16x32); }
+SUBTRACT_FUN(64x16) { STACK_H(32, subtract_32x16); }
+
+static SubtractWxHFuncType getSubtractFunc(int rows, int cols) {
+ if (rows == 4) {
+ if (cols == 4) return subtract_4x4;
+ if (cols == 8) return subtract_8x4;
+ if (cols == 16) return subtract_16x4;
+ }
+ if (rows == 8) {
+ if (cols == 4) return subtract_4x8;
+ if (cols == 8) return subtract_8x8;
+ if (cols == 16) return subtract_16x8;
+ if (cols == 32) return subtract_32x8;
+ }
+ if (rows == 16) {
+ if (cols == 4) return subtract_4x16;
+ if (cols == 8) return subtract_8x16;
+ if (cols == 16) return subtract_16x16;
+ if (cols == 32) return subtract_32x16;
+ if (cols == 64) return subtract_64x16;
+ }
+ if (rows == 32) {
+ if (cols == 8) return subtract_8x32;
+ if (cols == 16) return subtract_16x32;
+ if (cols == 32) return subtract_32x32;
+ if (cols == 64) return subtract_64x32;
+ }
+ if (rows == 64) {
+ if (cols == 16) return subtract_16x64;
+ if (cols == 32) return subtract_32x64;
+ if (cols == 64) return subtract_64x64;
+ if (cols == 128) return subtract_128x64;
+ }
+ if (rows == 128) {
+ if (cols == 64) return subtract_64x128;
+ if (cols == 128) return subtract_128x128;
+ }
+ assert(0);
+ return NULL;
+}
+
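+// Entry point: rows/cols select one of the kernels above; the uint8_t*
+// buffers are high-bitdepth planes and are converted back to uint16_t*
+// with CONVERT_TO_SHORTPTR before subtraction.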
+void aom_highbd_subtract_block_sse2(int rows, int cols, int16_t *diff,
+ ptrdiff_t diff_stride, const uint8_t *src8,
+ ptrdiff_t src_stride, const uint8_t *pred8,
+ ptrdiff_t pred_stride) {
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ SubtractWxHFuncType func;
+
+ func = getSubtractFunc(rows, cols);
+ func(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c
new file mode 100644
index 0000000000..b4ff91d856
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c
@@ -0,0 +1,904 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h> // AVX2
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/synonyms.h"
+
+typedef void (*high_variance_fn_t)(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
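+// Bilinear-filtered sub-pixel variance for 10-bit blocks whose width is a
+// multiple of 16. The block is walked in 16-wide column strips; xoffset and
+// yoffset values of 0 and 4 (full- and half-pel) take cheaper copy/pavg
+// paths, while the remaining offsets apply the 2-tap bilinear filter with a
+// rounding bias of 64 and a shift of 7. sum and sse are accumulated into
+// 64-bit totals, rounded back to the 10-bit domain, and the variance is
+// formed with rshift = log2(w) + log2(h).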
+static uint32_t aom_highbd_var_filter_block2d_bil_avx2(
+ const uint8_t *src_ptr8, unsigned int src_pixels_per_line, int pixel_step,
+ unsigned int output_height, unsigned int output_width,
+ const uint32_t xoffset, const uint32_t yoffset, const uint8_t *dst_ptr8,
+ int dst_stride, uint32_t *sse) {
+ const __m256i filter1 =
+ _mm256_set1_epi32((int)(bilinear_filters_2t[xoffset][1] << 16) |
+ bilinear_filters_2t[xoffset][0]);
+ const __m256i filter2 =
+ _mm256_set1_epi32((int)(bilinear_filters_2t[yoffset][1] << 16) |
+ bilinear_filters_2t[yoffset][0]);
+ const __m256i one = _mm256_set1_epi16(1);
+ const int bitshift = 0x40;
+ (void)pixel_step;
+ unsigned int i, j, prev = 0, curr = 2;
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
+ uint16_t *dst_ptr = CONVERT_TO_SHORTPTR(dst_ptr8);
+ uint16_t *src_ptr_ref = src_ptr;
+ uint16_t *dst_ptr_ref = dst_ptr;
+ int64_t sum_long = 0;
+ uint64_t sse_long = 0;
+ unsigned int rshift = 0, inc = 1;
+ __m256i rbias = _mm256_set1_epi32(bitshift);
+ __m256i opointer[8];
+ unsigned int range;
+ if (xoffset == 0) {
+ if (yoffset == 0) { // xoffset==0 && yoffset==0
+ range = output_width / 16;
+ if (output_height == 8) inc = 2;
+ if (output_height == 4) inc = 4;
+ for (j = 0; j < range * output_height * inc / 16; j++) {
+ if (j % (output_height * inc / 16) == 0) {
+ src_ptr = src_ptr_ref;
+ src_ptr_ref += 16;
+ dst_ptr = dst_ptr_ref;
+ dst_ptr_ref += 16;
+ }
+ __m256i sum1 = _mm256_setzero_si256();
+ __m256i sse1 = _mm256_setzero_si256();
+ for (i = 0; i < 16 / inc; ++i) {
+ __m256i V_S_SRC = _mm256_loadu_si256((const __m256i *)src_ptr);
+ src_ptr += src_pixels_per_line;
+ __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+ dst_ptr += dst_stride;
+
+ __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+ __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+ sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+ sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+ }
+
+ __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ sum_long += _mm_extract_epi32(v_d, 0);
+ sse_long += _mm_extract_epi32(v_d, 1);
+ }
+
+ rshift = get_msb(output_height) + get_msb(output_width);
+
+ } else if (yoffset == 4) { // xoffset==0 && yoffset==4
+ range = output_width / 16;
+ if (output_height == 8) inc = 2;
+ if (output_height == 4) inc = 4;
+ for (j = 0; j < range * output_height * inc / 16; j++) {
+ if (j % (output_height * inc / 16) == 0) {
+ src_ptr = src_ptr_ref;
+ src_ptr_ref += 16;
+ dst_ptr = dst_ptr_ref;
+ dst_ptr_ref += 16;
+
+ opointer[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
+ src_ptr += src_pixels_per_line;
+ curr = 0;
+ }
+
+ __m256i sum1 = _mm256_setzero_si256();
+ __m256i sse1 = _mm256_setzero_si256();
+
+ for (i = 0; i < 16 / inc; ++i) {
+ prev = curr;
+ curr = (curr == 0) ? 1 : 0;
+ opointer[curr] = _mm256_loadu_si256((const __m256i *)src_ptr);
+ src_ptr += src_pixels_per_line;
+
+ __m256i V_S_SRC = _mm256_avg_epu16(opointer[curr], opointer[prev]);
+
+ __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+ dst_ptr += dst_stride;
+ __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+ __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+ sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+ sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+ }
+
+ __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ sum_long += _mm_extract_epi32(v_d, 0);
+ sse_long += _mm_extract_epi32(v_d, 1);
+ }
+
+ rshift = get_msb(output_height) + get_msb(output_width);
+
+ } else { // xoffset==0 && yoffset==1,2,3,5,6,7
+ range = output_width / 16;
+ if (output_height == 8) inc = 2;
+ if (output_height == 4) inc = 4;
+ for (j = 0; j < range * output_height * inc / 16; j++) {
+ if (j % (output_height * inc / 16) == 0) {
+ src_ptr = src_ptr_ref;
+ src_ptr_ref += 16;
+ dst_ptr = dst_ptr_ref;
+ dst_ptr_ref += 16;
+
+ opointer[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
+ src_ptr += src_pixels_per_line;
+ curr = 0;
+ }
+
+ __m256i sum1 = _mm256_setzero_si256();
+ __m256i sse1 = _mm256_setzero_si256();
+
+ for (i = 0; i < 16 / inc; ++i) {
+ prev = curr;
+ curr = (curr == 0) ? 1 : 0;
+ opointer[curr] = _mm256_loadu_si256((const __m256i *)src_ptr);
+ src_ptr += src_pixels_per_line;
+
+ __m256i V_S_M1 =
+ _mm256_unpacklo_epi16(opointer[prev], opointer[curr]);
+ __m256i V_S_M2 =
+ _mm256_unpackhi_epi16(opointer[prev], opointer[curr]);
+
+ __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2);
+ __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2);
+
+ __m256i V_S_S1 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7);
+ __m256i V_S_S2 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7);
+
+ __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2);
+
+ __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+ dst_ptr += dst_stride;
+
+ __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+ __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+ sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+ sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+ }
+
+ __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ sum_long += _mm_extract_epi32(v_d, 0);
+ sse_long += _mm_extract_epi32(v_d, 1);
+ }
+
+ rshift = get_msb(output_height) + get_msb(output_width);
+ }
+ } else if (xoffset == 4) {
+ if (yoffset == 0) { // xoffset==4 && yoffset==0
+ range = output_width / 16;
+ if (output_height == 8) inc = 2;
+ if (output_height == 4) inc = 4;
+ for (j = 0; j < range * output_height * inc / 16; j++) {
+ if (j % (output_height * inc / 16) == 0) {
+ src_ptr = src_ptr_ref;
+ src_ptr_ref += 16;
+ dst_ptr = dst_ptr_ref;
+ dst_ptr_ref += 16;
+ __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+
+ opointer[0] = _mm256_avg_epu16(V_H_D1, V_H_D2);
+
+ curr = 0;
+ }
+
+ __m256i sum1 = _mm256_setzero_si256();
+ __m256i sse1 = _mm256_setzero_si256();
+
+ for (i = 0; i < 16 / inc; ++i) {
+ prev = curr;
+ curr = (curr == 0) ? 1 : 0;
+ __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+
+ opointer[curr] = _mm256_avg_epu16(V_V_D1, V_V_D2);
+
+ __m256i V_S_M1 =
+ _mm256_unpacklo_epi16(opointer[prev], opointer[curr]);
+ __m256i V_S_M2 =
+ _mm256_unpackhi_epi16(opointer[prev], opointer[curr]);
+
+ __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2);
+ __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2);
+
+ __m256i V_S_S1 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7);
+ __m256i V_S_S2 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7);
+
+ __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2);
+
+ __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+ dst_ptr += dst_stride;
+
+ __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+ __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+ sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+ sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+ }
+
+ __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ sum_long += _mm_extract_epi32(v_d, 0);
+ sse_long += _mm_extract_epi32(v_d, 1);
+ }
+
+ rshift = get_msb(output_height) + get_msb(output_width);
+
+ } else if (yoffset == 4) { // xoffset==4 && yoffset==4
+ range = output_width / 16;
+ if (output_height == 8) inc = 2;
+ if (output_height == 4) inc = 4;
+ for (j = 0; j < range * output_height * inc / 16; j++) {
+ if (j % (output_height * inc / 16) == 0) {
+ src_ptr = src_ptr_ref;
+ src_ptr_ref += 16;
+ dst_ptr = dst_ptr_ref;
+ dst_ptr_ref += 16;
+
+ __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+ opointer[0] = _mm256_avg_epu16(V_H_D1, V_H_D2);
+ curr = 0;
+ }
+
+ __m256i sum1 = _mm256_setzero_si256();
+ __m256i sse1 = _mm256_setzero_si256();
+
+ for (i = 0; i < 16 / inc; ++i) {
+ prev = curr;
+ curr = (curr == 0) ? 1 : 0;
+ __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+ opointer[curr] = _mm256_avg_epu16(V_V_D1, V_V_D2);
+ __m256i V_S_SRC = _mm256_avg_epu16(opointer[curr], opointer[prev]);
+
+ __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+ dst_ptr += dst_stride;
+ __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+ __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+ sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+ sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+ }
+
+ __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ sum_long += _mm_extract_epi32(v_d, 0);
+ sse_long += _mm_extract_epi32(v_d, 1);
+ }
+
+ rshift = get_msb(output_height) + get_msb(output_width);
+
+ } else { // xoffset==4 && yoffset==1,2,3,5,6,7
+ range = output_width / 16;
+ if (output_height == 8) inc = 2;
+ if (output_height == 4) inc = 4;
+ for (j = 0; j < range * output_height * inc / 16; j++) {
+ if (j % (output_height * inc / 16) == 0) {
+ src_ptr = src_ptr_ref;
+ src_ptr_ref += 16;
+ dst_ptr = dst_ptr_ref;
+ dst_ptr_ref += 16;
+
+ __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+ opointer[0] = _mm256_avg_epu16(V_H_D1, V_H_D2);
+ curr = 0;
+ }
+
+ __m256i sum1 = _mm256_setzero_si256();
+ __m256i sse1 = _mm256_setzero_si256();
+
+ for (i = 0; i < 16 / inc; ++i) {
+ prev = curr;
+ curr = (curr == 0) ? 1 : 0;
+ __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+ opointer[curr] = _mm256_avg_epu16(V_V_D1, V_V_D2);
+
+ __m256i V_S_M1 =
+ _mm256_unpacklo_epi16(opointer[prev], opointer[curr]);
+ __m256i V_S_M2 =
+ _mm256_unpackhi_epi16(opointer[prev], opointer[curr]);
+
+ __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2);
+ __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2);
+
+ __m256i V_S_S1 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7);
+ __m256i V_S_S2 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7);
+
+ __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2);
+
+ __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+ dst_ptr += dst_stride;
+
+ __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+ __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+ sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+ sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+ }
+
+ __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ sum_long += _mm_extract_epi32(v_d, 0);
+ sse_long += _mm_extract_epi32(v_d, 1);
+ }
+
+ rshift = get_msb(output_height) + get_msb(output_width);
+ }
+ } else if (yoffset == 0) { // xoffset==1,2,3,5,6,7 && yoffset==0
+ range = output_width / 16;
+ if (output_height == 8) inc = 2;
+ if (output_height == 4) inc = 4;
+ for (j = 0; j < range * output_height * inc / 16; j++) {
+ if (j % (output_height * inc / 16) == 0) {
+ src_ptr = src_ptr_ref;
+ src_ptr_ref += 16;
+ dst_ptr = dst_ptr_ref;
+ dst_ptr_ref += 16;
+
+ curr = 0;
+ }
+
+ __m256i sum1 = _mm256_setzero_si256();
+ __m256i sse1 = _mm256_setzero_si256();
+
+ for (i = 0; i < 16 / inc; ++i) {
+ __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+ __m256i V_V_M1 = _mm256_unpacklo_epi16(V_V_D1, V_V_D2);
+ __m256i V_V_M2 = _mm256_unpackhi_epi16(V_V_D1, V_V_D2);
+ __m256i V_V_MAD1 = _mm256_madd_epi16(V_V_M1, filter1);
+ __m256i V_V_MAD2 = _mm256_madd_epi16(V_V_M2, filter1);
+ __m256i V_V_S1 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD1, rbias), 7);
+ __m256i V_V_S2 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD2, rbias), 7);
+ opointer[curr] = _mm256_packus_epi32(V_V_S1, V_V_S2);
+
+ __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+ dst_ptr += dst_stride;
+ __m256i V_R_SUB = _mm256_sub_epi16(opointer[curr], V_D_DST);
+ __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+ sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+ sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+ }
+
+ __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ sum_long += _mm_extract_epi32(v_d, 0);
+ sse_long += _mm_extract_epi32(v_d, 1);
+ }
+
+ rshift = get_msb(output_height) + get_msb(output_width);
+
+ } else if (yoffset == 4) { // xoffset==1,2,3,5,6,7 && yoffset==4
+
+ range = output_width / 16;
+ if (output_height == 8) inc = 2;
+ if (output_height == 4) inc = 4;
+ for (j = 0; j < range * output_height * inc / 16; j++) {
+ if (j % (output_height * inc / 16) == 0) {
+ src_ptr = src_ptr_ref;
+ src_ptr_ref += 16;
+ dst_ptr = dst_ptr_ref;
+ dst_ptr_ref += 16;
+
+ __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+
+ __m256i V_H_M1 = _mm256_unpacklo_epi16(V_H_D1, V_H_D2);
+ __m256i V_H_M2 = _mm256_unpackhi_epi16(V_H_D1, V_H_D2);
+
+ __m256i V_H_MAD1 = _mm256_madd_epi16(V_H_M1, filter1);
+ __m256i V_H_MAD2 = _mm256_madd_epi16(V_H_M2, filter1);
+
+ __m256i V_H_S1 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD1, rbias), 7);
+ __m256i V_H_S2 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD2, rbias), 7);
+
+ opointer[0] = _mm256_packus_epi32(V_H_S1, V_H_S2);
+
+ curr = 0;
+ }
+
+ __m256i sum1 = _mm256_setzero_si256();
+ __m256i sse1 = _mm256_setzero_si256();
+
+ for (i = 0; i < 16 / inc; ++i) {
+ prev = curr;
+ curr = (curr == 0) ? 1 : 0;
+ __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+ __m256i V_V_M1 = _mm256_unpacklo_epi16(V_V_D1, V_V_D2);
+ __m256i V_V_M2 = _mm256_unpackhi_epi16(V_V_D1, V_V_D2);
+ __m256i V_V_MAD1 = _mm256_madd_epi16(V_V_M1, filter1);
+ __m256i V_V_MAD2 = _mm256_madd_epi16(V_V_M2, filter1);
+ __m256i V_V_S1 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD1, rbias), 7);
+ __m256i V_V_S2 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD2, rbias), 7);
+ opointer[curr] = _mm256_packus_epi32(V_V_S1, V_V_S2);
+
+ __m256i V_S_SRC = _mm256_avg_epu16(opointer[prev], opointer[curr]);
+
+ __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+ dst_ptr += dst_stride;
+
+ __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+ __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+ sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+ sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+ }
+
+ __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ sum_long += _mm_extract_epi32(v_d, 0);
+ sse_long += _mm_extract_epi32(v_d, 1);
+ }
+
+ rshift = get_msb(output_height) + get_msb(output_width);
+
+ } else { // xoffset==1,2,3,5,6,7 && yoffset==1,2,3,5,6,7
+ range = output_width / 16;
+ if (output_height == 8) inc = 2;
+ if (output_height == 4) inc = 4;
+ unsigned int nloop = 16 / inc;
+ for (j = 0; j < range * output_height * inc / 16; j++) {
+ if (j % (output_height * inc / 16) == 0) {
+ src_ptr = src_ptr_ref;
+ src_ptr_ref += 16;
+ dst_ptr = dst_ptr_ref;
+ dst_ptr_ref += 16;
+
+ __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+
+ __m256i V_H_M1 = _mm256_unpacklo_epi16(V_H_D1, V_H_D2);
+ __m256i V_H_M2 = _mm256_unpackhi_epi16(V_H_D1, V_H_D2);
+
+ __m256i V_H_MAD1 = _mm256_madd_epi16(V_H_M1, filter1);
+ __m256i V_H_MAD2 = _mm256_madd_epi16(V_H_M2, filter1);
+
+ __m256i V_H_S1 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD1, rbias), 7);
+ __m256i V_H_S2 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD2, rbias), 7);
+
+ opointer[0] = _mm256_packus_epi32(V_H_S1, V_H_S2);
+
+ curr = 0;
+ }
+
+ __m256i sum1 = _mm256_setzero_si256();
+ __m256i sse1 = _mm256_setzero_si256();
+
+ for (i = 0; i < nloop; ++i) {
+ prev = curr;
+ curr = !curr;
+ __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+ __m256i V_V_M1 = _mm256_unpacklo_epi16(V_V_D1, V_V_D2);
+ __m256i V_V_M2 = _mm256_unpackhi_epi16(V_V_D1, V_V_D2);
+ __m256i V_V_MAD1 = _mm256_madd_epi16(V_V_M1, filter1);
+ __m256i V_V_MAD2 = _mm256_madd_epi16(V_V_M2, filter1);
+ __m256i V_V_S1 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD1, rbias), 7);
+ __m256i V_V_S2 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD2, rbias), 7);
+ opointer[curr] = _mm256_packus_epi32(V_V_S1, V_V_S2);
+
+ __m256i V_S_M1 = _mm256_unpacklo_epi16(opointer[prev], opointer[curr]);
+ __m256i V_S_M2 = _mm256_unpackhi_epi16(opointer[prev], opointer[curr]);
+
+ __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2);
+ __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2);
+
+ __m256i V_S_S1 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7);
+ __m256i V_S_S2 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7);
+
+ __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2);
+
+ __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+ dst_ptr += dst_stride;
+
+ __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+ __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+ sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+ sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+ }
+
+ __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ sum_long += _mm_extract_epi32(v_d, 0);
+ sse_long += _mm_extract_epi32(v_d, 1);
+ }
+
+ rshift = get_msb(output_height) + get_msb(output_width);
+ }
+
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
+ int sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
+
+ int32_t var = *sse - (uint32_t)(((int64_t)sum * sum) >> rshift);
+
+ return (var > 0) ? var : 0;
+}
+
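+// 8x8 sum/SSE kernel: two rows (8 pixels each) are packed into one 256-bit
+// register per iteration. The final reduction interleaves the widened sum
+// and the SSE accumulators so that one horizontal add leaves the sum in
+// lane 0 and the SSE in lane 1 of v_d.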
+void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum) {
+ __m256i v_sum_d = _mm256_setzero_si256();
+ __m256i v_sse_d = _mm256_setzero_si256();
+ for (int i = 0; i < 8; i += 2) {
+ const __m128i v_p_a0 = _mm_loadu_si128((const __m128i *)src);
+ const __m128i v_p_a1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
+ const __m128i v_p_b0 = _mm_loadu_si128((const __m128i *)ref);
+ const __m128i v_p_b1 = _mm_loadu_si128((const __m128i *)(ref + ref_stride));
+ __m256i v_p_a = _mm256_castsi128_si256(v_p_a0);
+ __m256i v_p_b = _mm256_castsi128_si256(v_p_b0);
+ v_p_a = _mm256_inserti128_si256(v_p_a, v_p_a1, 1);
+ v_p_b = _mm256_inserti128_si256(v_p_b, v_p_b1, 1);
+ const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b);
+ const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff);
+ v_sum_d = _mm256_add_epi16(v_sum_d, v_diff);
+ v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff);
+ src += src_stride * 2;
+ ref += ref_stride * 2;
+ }
+ __m256i v_sum00 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_sum_d));
+ __m256i v_sum01 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(v_sum_d, 1));
+ __m256i v_sum0 = _mm256_add_epi32(v_sum00, v_sum01);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ *sum = _mm_extract_epi32(v_d, 0);
+ *sse = _mm_extract_epi32(v_d, 1);
+}
+
+void aom_highbd_calc16x16var_avx2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum) {
+ __m256i v_sum_d = _mm256_setzero_si256();
+ __m256i v_sse_d = _mm256_setzero_si256();
+ const __m256i one = _mm256_set1_epi16(1);
+ for (int i = 0; i < 16; ++i) {
+ const __m256i v_p_a = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i v_p_b = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b);
+ const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff);
+ v_sum_d = _mm256_add_epi16(v_sum_d, v_diff);
+ v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff);
+ src += src_stride;
+ ref += ref_stride;
+ }
+ __m256i v_sum0 = _mm256_madd_epi16(v_sum_d, one);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ *sum = _mm_extract_epi32(v_d, 0);
+ *sse = _mm_extract_epi32(v_d, 1);
+}
+
+static void highbd_10_variance_avx2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride, int w,
+ int h, uint32_t *sse, int *sum,
+ high_variance_fn_t var_fn, int block_size) {
+ int i, j;
+ uint64_t sse_long = 0;
+ int32_t sum_long = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+ ref_stride, &sse0, &sum0);
+ sse_long += sse0;
+ sum_long += sum0;
+ }
+ }
+ *sum = ROUND_POWER_OF_TWO(sum_long, 2);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
+}
+
+#define VAR_FN(w, h, block_size, shift) \
+ uint32_t aom_highbd_10_variance##w##x##h##_avx2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_10_variance_avx2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+VAR_FN(128, 128, 16, 14)
+VAR_FN(128, 64, 16, 13)
+VAR_FN(64, 128, 16, 13)
+VAR_FN(64, 64, 16, 12)
+VAR_FN(64, 32, 16, 11)
+VAR_FN(32, 64, 16, 11)
+VAR_FN(32, 32, 16, 10)
+VAR_FN(32, 16, 16, 9)
+VAR_FN(16, 32, 16, 9)
+VAR_FN(16, 16, 16, 8)
+VAR_FN(16, 8, 8, 7)
+VAR_FN(8, 16, 8, 7)
+VAR_FN(8, 8, 8, 6)
+
+#if !CONFIG_REALTIME_ONLY
+VAR_FN(16, 64, 16, 10)
+VAR_FN(32, 8, 8, 8)
+VAR_FN(64, 16, 16, 10)
+VAR_FN(8, 32, 8, 8)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef VAR_FN
+
+#define SSE2_HEIGHT(H) \
+ uint32_t aom_highbd_10_sub_pixel_variance8x##H##_sse2( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr);
+
+SSE2_HEIGHT(8)
+SSE2_HEIGHT(16)
+
+#undef SSE2_HEIGHT
+
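+// The AVX2 filter path above works on 16-wide strips, so the 8-wide sizes
+// (8x8 and 8x16) are forwarded to the SSE2 kernels declared above; all other
+// sizes go through aom_highbd_var_filter_block2d_bil_avx2.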
+#define HIGHBD_SUBPIX_VAR(W, H) \
+ uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_avx2( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse) { \
+ if (W == 8 && H == 16) \
+ return aom_highbd_10_sub_pixel_variance8x16_sse2( \
+ src, src_stride, xoffset, yoffset, dst, dst_stride, sse); \
+ else if (W == 8 && H == 8) \
+ return aom_highbd_10_sub_pixel_variance8x8_sse2( \
+ src, src_stride, xoffset, yoffset, dst, dst_stride, sse); \
+ else \
+ return aom_highbd_var_filter_block2d_bil_avx2( \
+ src, src_stride, 1, H, W, xoffset, yoffset, dst, dst_stride, sse); \
+ }
+
+HIGHBD_SUBPIX_VAR(128, 128)
+HIGHBD_SUBPIX_VAR(128, 64)
+HIGHBD_SUBPIX_VAR(64, 128)
+HIGHBD_SUBPIX_VAR(64, 64)
+HIGHBD_SUBPIX_VAR(64, 32)
+HIGHBD_SUBPIX_VAR(32, 64)
+HIGHBD_SUBPIX_VAR(32, 32)
+HIGHBD_SUBPIX_VAR(32, 16)
+HIGHBD_SUBPIX_VAR(16, 32)
+HIGHBD_SUBPIX_VAR(16, 16)
+HIGHBD_SUBPIX_VAR(16, 8)
+HIGHBD_SUBPIX_VAR(8, 16)
+HIGHBD_SUBPIX_VAR(8, 8)
+
+#undef HIGHBD_SUBPIX_VAR
+
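+// Sum of squared differences (no mean removal) over a 4-wide column of
+// 16-bit samples: four rows are gathered into a single 256-bit register per
+// iteration and the squared differences are accumulated in 64-bit lanes.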
+uint64_t aom_mse_4xh_16bit_highbd_avx2(uint16_t *dst, int dstride,
+ uint16_t *src, int sstride, int h) {
+ uint64_t sum = 0;
+ __m128i reg0_4x16, reg1_4x16, reg2_4x16, reg3_4x16;
+ __m256i src0_8x16, src1_8x16, src_16x16;
+ __m256i dst0_8x16, dst1_8x16, dst_16x16;
+ __m256i res0_4x64, res1_4x64, res2_4x64, res3_4x64;
+ __m256i sub_result;
+ const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+ __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+ for (int i = 0; i < h; i += 4) {
+ reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 0) * dstride]));
+ reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 1) * dstride]));
+ reg2_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 2) * dstride]));
+ reg3_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 3) * dstride]));
+ dst0_8x16 =
+ _mm256_castsi128_si256(_mm_unpacklo_epi64(reg0_4x16, reg1_4x16));
+ dst1_8x16 =
+ _mm256_castsi128_si256(_mm_unpacklo_epi64(reg2_4x16, reg3_4x16));
+ dst_16x16 = _mm256_permute2x128_si256(dst0_8x16, dst1_8x16, 0x20);
+
+ reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride]));
+ reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride]));
+ reg2_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 2) * sstride]));
+ reg3_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 3) * sstride]));
+ src0_8x16 =
+ _mm256_castsi128_si256(_mm_unpacklo_epi64(reg0_4x16, reg1_4x16));
+ src1_8x16 =
+ _mm256_castsi128_si256(_mm_unpacklo_epi64(reg2_4x16, reg3_4x16));
+ src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20);
+
+ sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16));
+
+ src_16x16 = _mm256_unpacklo_epi16(sub_result, zeros);
+ dst_16x16 = _mm256_unpackhi_epi16(sub_result, zeros);
+
+ src_16x16 = _mm256_madd_epi16(src_16x16, src_16x16);
+ dst_16x16 = _mm256_madd_epi16(dst_16x16, dst_16x16);
+
+ res0_4x64 = _mm256_unpacklo_epi32(src_16x16, zeros);
+ res1_4x64 = _mm256_unpackhi_epi32(src_16x16, zeros);
+ res2_4x64 = _mm256_unpacklo_epi32(dst_16x16, zeros);
+ res3_4x64 = _mm256_unpackhi_epi32(dst_16x16, zeros);
+
+ square_result = _mm256_add_epi64(
+ square_result,
+ _mm256_add_epi64(
+ _mm256_add_epi64(_mm256_add_epi64(res0_4x64, res1_4x64), res2_4x64),
+ res3_4x64));
+ }
+ const __m128i sum_2x64 =
+ _mm_add_epi64(_mm256_castsi256_si128(square_result),
+ _mm256_extracti128_si256(square_result, 1));
+ const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+ xx_storel_64(&sum, sum_1x64);
+ return sum;
+}
+
+uint64_t aom_mse_8xh_16bit_highbd_avx2(uint16_t *dst, int dstride,
+ uint16_t *src, int sstride, int h) {
+ uint64_t sum = 0;
+ __m256i src0_8x16, src1_8x16, src_16x16;
+ __m256i dst0_8x16, dst1_8x16, dst_16x16;
+ __m256i res0_4x64, res1_4x64, res2_4x64, res3_4x64;
+ __m256i sub_result;
+ const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+ __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+
+ for (int i = 0; i < h; i += 2) {
+ dst0_8x16 =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&dst[i * dstride]));
+ dst1_8x16 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)&dst[(i + 1) * dstride]));
+ dst_16x16 = _mm256_permute2x128_si256(dst0_8x16, dst1_8x16, 0x20);
+
+ src0_8x16 =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&src[i * sstride]));
+ src1_8x16 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)&src[(i + 1) * sstride]));
+ src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20);
+
+ sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16));
+
+ src_16x16 = _mm256_unpacklo_epi16(sub_result, zeros);
+ dst_16x16 = _mm256_unpackhi_epi16(sub_result, zeros);
+
+ src_16x16 = _mm256_madd_epi16(src_16x16, src_16x16);
+ dst_16x16 = _mm256_madd_epi16(dst_16x16, dst_16x16);
+
+ res0_4x64 = _mm256_unpacklo_epi32(src_16x16, zeros);
+ res1_4x64 = _mm256_unpackhi_epi32(src_16x16, zeros);
+ res2_4x64 = _mm256_unpacklo_epi32(dst_16x16, zeros);
+ res3_4x64 = _mm256_unpackhi_epi32(dst_16x16, zeros);
+
+ square_result = _mm256_add_epi64(
+ square_result,
+ _mm256_add_epi64(
+ _mm256_add_epi64(_mm256_add_epi64(res0_4x64, res1_4x64), res2_4x64),
+ res3_4x64));
+ }
+
+ const __m128i sum_2x64 =
+ _mm_add_epi64(_mm256_castsi256_si128(square_result),
+ _mm256_extracti128_si256(square_result, 1));
+ const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+ xx_storel_64(&sum, sum_1x64);
+ return sum;
+}
+
+uint64_t aom_mse_wxh_16bit_highbd_avx2(uint16_t *dst, int dstride,
+ uint16_t *src, int sstride, int w,
+ int h) {
+  assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
+         "w and h must each be 4 or 8");
+ switch (w) {
+ case 4: return aom_mse_4xh_16bit_highbd_avx2(dst, dstride, src, sstride, h);
+ case 8: return aom_mse_8xh_16bit_highbd_avx2(dst, dstride, src, sstride, h);
+ default: assert(0 && "unsupported width"); return -1;
+ }
+}
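For reference, the aom_mse_*_16bit_highbd kernels in this file all reduce to the same quantity: the sum of squared differences between two blocks of 16-bit samples, accumulated in 64 bits. A scalar sketch of that computation follows; the function name is illustrative only and not part of libaom.

#include <stdint.h>

// Scalar sketch of the sum of squared differences the AVX2 kernels above
// vectorize; the accumulator is 64-bit, matching the SIMD paths.
static uint64_t mse_wxh_16bit_ref(const uint16_t *dst, int dstride,
                                  const uint16_t *src, int sstride, int w,
                                  int h) {
  uint64_t sum = 0;
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      const int64_t d = (int64_t)src[i * sstride + j] - dst[i * dstride + j];
      sum += (uint64_t)(d * d);
    }
  }
  return sum;
}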
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm
new file mode 100644
index 0000000000..ec6c7e9fa7
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm
@@ -0,0 +1,318 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+%include "aom_ports/x86_abi_support.asm"
+
+SECTION .text
+
+;unsigned int aom_highbd_calc16x16var_sse2
+;(
+; unsigned char * src_ptr,
+; int source_stride,
+; unsigned char * ref_ptr,
+; int recon_stride,
+; unsigned int * SSE,
+; int * Sum
+;)
+globalsym(aom_highbd_calc16x16var_sse2)
+sym(aom_highbd_calc16x16var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[source_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+ add rax, rax ; source stride in bytes
+ add rdx, rdx ; recon stride in bytes
+
+ ; Prefetch data
+ prefetcht0 [rsi]
+ prefetcht0 [rsi+16]
+ prefetcht0 [rsi+rax]
+ prefetcht0 [rsi+rax+16]
+ lea rbx, [rsi+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rax]
+ prefetcht0 [rbx+rax+16]
+
+ prefetcht0 [rdi]
+ prefetcht0 [rdi+16]
+ prefetcht0 [rdi+rdx]
+ prefetcht0 [rdi+rdx+16]
+ lea rbx, [rdi+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rdx]
+ prefetcht0 [rbx+rdx+16]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
+ mov rcx, 16
+
+.var16loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rdi]
+
+ lea rbx, [rsi+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rax]
+ prefetcht0 [rbx+rax+16]
+ lea rbx, [rdi+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rdx]
+ prefetcht0 [rbx+rdx+16]
+
+ pxor xmm5, xmm5
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+16]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+16]
+ paddd xmm6, xmm1
+
+ psubw xmm3, xmm2
+ movdqu xmm1, XMMWORD PTR [rsi+rax]
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ movdqu xmm2, XMMWORD PTR [rdi+rdx]
+ paddd xmm6, xmm3
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+rax+16]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+rdx+16]
+ paddd xmm6, xmm1
+
+ psubw xmm3, xmm2
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ paddd xmm6, xmm3
+
+ movdqa xmm1, xmm5
+ movdqa xmm2, xmm5
+ pcmpgtw xmm1, xmm0
+ pcmpeqw xmm2, xmm0
+ por xmm1, xmm2
+ pcmpeqw xmm1, xmm0
+ movdqa xmm2, xmm5
+ punpcklwd xmm5, xmm1
+ punpckhwd xmm2, xmm1
+ paddd xmm7, xmm5
+ paddd xmm7, xmm2
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ sub rcx, 2
+ jnz .var16loop
+
+ movdqa xmm4, xmm6
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm4, xmm0
+ movdqa xmm5, xmm7
+
+ paddd xmm6, xmm4
+ punpckldq xmm7, xmm0
+
+ punpckhdq xmm5, xmm0
+ paddd xmm7, xmm5
+
+ movdqa xmm4, xmm6
+ movdqa xmm5, xmm7
+
+ psrldq xmm4, 8
+ psrldq xmm5, 8
+
+ paddd xmm6, xmm4
+ paddd xmm7, xmm5
+
+ mov rdi, arg(4) ; [SSE]
+ mov rax, arg(5) ; [Sum]
+
+ movd DWORD PTR [rdi], xmm6
+ movd DWORD PTR [rax], xmm7
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
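In scalar terms, the routine above produces two block statistics for a 16x16 block of 16-bit samples: the signed sum of differences (Sum) and the sum of squared differences (SSE). The pcmpgtw/pcmpeqw/punpcklwd sequence near the end of the loop is effectively a sign extension of the per-iteration 16-bit difference sums to 32 bits before accumulation. A hedged C sketch of the same computation (the name calc16x16var_ref is illustrative):

#include <stdint.h>

// Scalar view of what aom_highbd_calc16x16var_sse2 computes over a 16x16
// block of 16-bit samples: Sum and SSE, returned through out-parameters.
static void calc16x16var_ref(const uint16_t *src, int src_stride,
                             const uint16_t *ref, int ref_stride,
                             uint32_t *sse, int *sum) {
  uint32_t local_sse = 0;
  int local_sum = 0;
  for (int i = 0; i < 16; ++i) {
    for (int j = 0; j < 16; ++j) {
      const int diff = src[i * src_stride + j] - ref[i * ref_stride + j];
      local_sum += diff;                     // signed sum of differences
      local_sse += (uint32_t)(diff * diff);  // sum of squared differences
    }
  }
  *sse = local_sse;
  *sum = local_sum;
}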
+
+;unsigned int aom_highbd_calc8x8var_sse2
+;(
+; unsigned char * src_ptr,
+; int source_stride,
+; unsigned char * ref_ptr,
+; int recon_stride,
+; unsigned int * SSE,
+; int * Sum
+;)
+globalsym(aom_highbd_calc8x8var_sse2)
+sym(aom_highbd_calc8x8var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[source_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+ add rax, rax ; source stride in bytes
+ add rdx, rdx ; recon stride in bytes
+
+ ; Prefetch data
+ prefetcht0 [rsi]
+ prefetcht0 [rsi+rax]
+ lea rbx, [rsi+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+
+ prefetcht0 [rdi]
+ prefetcht0 [rdi+rdx]
+ lea rbx, [rdi+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
+ mov rcx, 8
+
+.var8loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rdi]
+
+ lea rbx, [rsi+rax*4]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+ lea rbx, [rbx+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+ lea rbx, [rdi+rdx*4]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+ lea rbx, [rbx+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+
+ pxor xmm5, xmm5
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+rax]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+rdx]
+ paddd xmm6, xmm1
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+
+ psubw xmm3, xmm2
+ movdqu xmm1, XMMWORD PTR [rsi]
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ movdqu xmm2, XMMWORD PTR [rdi]
+ paddd xmm6, xmm3
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+rax]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+rdx]
+ paddd xmm6, xmm1
+
+ psubw xmm3, xmm2
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ paddd xmm6, xmm3
+
+ movdqa xmm1, xmm5
+ movdqa xmm2, xmm5
+ pcmpgtw xmm1, xmm0
+ pcmpeqw xmm2, xmm0
+ por xmm1, xmm2
+ pcmpeqw xmm1, xmm0
+ movdqa xmm2, xmm5
+ punpcklwd xmm5, xmm1
+ punpckhwd xmm2, xmm1
+ paddd xmm7, xmm5
+ paddd xmm7, xmm2
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ sub rcx, 4
+ jnz .var8loop
+
+ movdqa xmm4, xmm6
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm4, xmm0
+ movdqa xmm5, xmm7
+
+ paddd xmm6, xmm4
+ punpckldq xmm7, xmm0
+
+ punpckhdq xmm5, xmm0
+ paddd xmm7, xmm5
+
+ movdqa xmm4, xmm6
+ movdqa xmm5, xmm7
+
+ psrldq xmm4, 8
+ psrldq xmm5, 8
+
+ paddd xmm6, xmm4
+ paddd xmm7, xmm5
+
+ mov rdi, arg(4) ; [SSE]
+ mov rax, arg(5) ; [Sum]
+
+ movd DWORD PTR [rdi], xmm6
+ movd DWORD PTR [rax], xmm7
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
new file mode 100644
index 0000000000..e897aab645
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
@@ -0,0 +1,735 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/filter.h"
+#include "av1/common/reconinter.h"
+
+typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+uint32_t aom_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+uint32_t aom_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride, int w,
+ int h, uint32_t *sse, int *sum,
+ high_variance_fn_t var_fn, int block_size) {
+ int i, j;
+
+ *sse = 0;
+ *sum = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+ ref_stride, &sse0, &sum0);
+ *sse += sse0;
+ *sum += sum0;
+ }
+ }
+}
+
+static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride, int w,
+ int h, uint32_t *sse, int *sum,
+ high_variance_fn_t var_fn, int block_size) {
+ int i, j;
+ uint64_t sse_long = 0;
+ int32_t sum_long = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+ ref_stride, &sse0, &sum0);
+ sse_long += sse0;
+ sum_long += sum0;
+ }
+ }
+ *sum = ROUND_POWER_OF_TWO(sum_long, 2);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
+}
+
+static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride, int w,
+ int h, uint32_t *sse, int *sum,
+ high_variance_fn_t var_fn, int block_size) {
+ int i, j;
+ uint64_t sse_long = 0;
+ int32_t sum_long = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+ ref_stride, &sse0, &sum0);
+ sse_long += sse0;
+ sum_long += sum0;
+ }
+ }
+ *sum = ROUND_POWER_OF_TWO(sum_long, 4);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
+}
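The 10- and 12-bit wrappers above fold the extra bit depth back to an 8-bit scale before the variance is formed: the sum of differences is rounded down by (bd - 8) bits and the sum of squares by 2 * (bd - 8) bits, which is why the shift pairs are 2/4 and 4/8. A minimal sketch of that normalization, using a local stand-in for the library's rounding shift:

#include <stdint.h>

// Local stand-in for libaom's rounding right shift (ROUND_POWER_OF_TWO).
static inline int64_t round_shift(int64_t v, int n) {
  return (v + (1 << (n - 1))) >> n;
}

// Fold bd-bit block statistics back to the 8-bit scale, as the 10-/12-bit
// wrappers above do: Sum by (bd - 8) bits, SSE by 2 * (bd - 8) bits.
static void normalize_highbd_stats(int bd, uint64_t sse_long, int64_t sum_long,
                                   uint32_t *sse, int *sum) {
  const int shift = bd - 8;  // 2 for 10-bit input, 4 for 12-bit input
  *sum = (int)round_shift(sum_long, shift);
  *sse = (uint32_t)round_shift((int64_t)sse_long, 2 * shift);
}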
+
+#define VAR_FN(w, h, block_size, shift) \
+ uint32_t aom_highbd_8_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_8_variance_sse2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \
+ } \
+ \
+ uint32_t aom_highbd_10_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_10_variance_sse2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t aom_highbd_12_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_12_variance_sse2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+VAR_FN(128, 128, 16, 14)
+VAR_FN(128, 64, 16, 13)
+VAR_FN(64, 128, 16, 13)
+VAR_FN(64, 64, 16, 12)
+VAR_FN(64, 32, 16, 11)
+VAR_FN(32, 64, 16, 11)
+VAR_FN(32, 32, 16, 10)
+VAR_FN(32, 16, 16, 9)
+VAR_FN(16, 32, 16, 9)
+VAR_FN(16, 16, 16, 8)
+VAR_FN(16, 8, 8, 7)
+VAR_FN(8, 16, 8, 7)
+VAR_FN(8, 8, 8, 6)
+VAR_FN(8, 32, 8, 8)
+VAR_FN(32, 8, 8, 8)
+VAR_FN(16, 64, 16, 10)
+VAR_FN(64, 16, 16, 10)
+
+#undef VAR_FN
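Each function the VAR_FN macro expands to then applies the usual variance identity, with the macro's shift argument equal to log2(w) + log2(h), i.e. log2 of the pixel count. A scalar sketch of that final step:

#include <stdint.h>

// variance = SSE - Sum^2 / N, where N = w * h = 1 << shift.
// The 10-/12-bit variants clamp a (rare) negative rounded result to 0.
static uint32_t variance_from_stats(uint32_t sse, int sum, int shift) {
  const int64_t var = (int64_t)sse - (((int64_t)sum * sum) >> shift);
  return var >= 0 ? (uint32_t)var : 0;
}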
+
+unsigned int aom_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+ aom_highbd_calc16x16var_sse2, 16);
+ return *sse;
+}
+
+unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+ aom_highbd_calc16x16var_sse2, 16);
+ return *sse;
+}
+
+unsigned int aom_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+ aom_highbd_calc16x16var_sse2, 16);
+ return *sse;
+}
+
+unsigned int aom_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
+ aom_highbd_calc8x8var_sse2, 8);
+ return *sse;
+}
+
+unsigned int aom_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
+ aom_highbd_calc8x8var_sse2, 8);
+ return *sse;
+}
+
+unsigned int aom_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
+ aom_highbd_calc8x8var_sse2, 8);
+ return *sse;
+}
+
+// The 2 unused parameters are placeholders for PIC-enabled builds.
+// These definitions are for functions defined in
+// highbd_subpel_variance_impl_sse2.asm
+#define DECL(w, opt) \
+ int aom_highbd_sub_pixel_variance##w##xh_##opt( \
+ const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+ const uint16_t *dst, ptrdiff_t dst_stride, int height, \
+ unsigned int *sse, void *unused0, void *unused);
+#define DECLS(opt) \
+ DECL(8, opt) \
+ DECL(16, opt)
+
+DECLS(sse2)
+
+#undef DECLS
+#undef DECL
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+ uint32_t aom_highbd_8_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ int se = 0; \
+ unsigned int sse = 0; \
+ unsigned int sse2; \
+ int row_rep = (w > 64) ? 2 : 1; \
+ for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \
+ src += wd_64 * 64; \
+ dst += wd_64 * 64; \
+ int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse2, \
+ NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf) { \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \
+ dst_stride, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \
+ dst_stride, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ } \
+ *sse_ptr = sse; \
+ return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \
+ } \
+ \
+ uint32_t aom_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+ int64_t var; \
+ uint32_t sse; \
+ uint64_t long_sse = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ int se = 0; \
+ int row_rep = (w > 64) ? 2 : 1; \
+ for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \
+ src += wd_64 * 64; \
+ dst += wd_64 * 64; \
+ int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \
+ NULL); \
+ se += se2; \
+ long_sse += sse; \
+ if (w > wf) { \
+ uint32_t sse2; \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \
+ dst_stride, h, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \
+ dst_stride, h, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ } \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 2); \
+ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 4); \
+ *sse_ptr = sse; \
+ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t aom_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+ int start_row; \
+ uint32_t sse; \
+ int se = 0; \
+ int64_t var; \
+ uint64_t long_sse = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ int row_rep = (w > 64) ? 2 : 1; \
+ for (start_row = 0; start_row < h; start_row += 16) { \
+ uint32_t sse2; \
+ int height = h - start_row < 16 ? h - start_row : 16; \
+ uint16_t *src_tmp = src + (start_row * src_stride); \
+ uint16_t *dst_tmp = dst + (start_row * dst_stride); \
+ for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \
+ src_tmp += wd_64 * 64; \
+ dst_tmp += wd_64 * 64; \
+ int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src_tmp, src_stride, x_offset, y_offset, dst_tmp, dst_stride, \
+ height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf) { \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src_tmp + wf, src_stride, x_offset, y_offset, dst_tmp + wf, \
+ dst_stride, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src_tmp + 2 * wf, src_stride, x_offset, y_offset, \
+ dst_tmp + 2 * wf, dst_stride, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src_tmp + 3 * wf, src_stride, x_offset, y_offset, \
+ dst_tmp + 3 * wf, dst_stride, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ } \
+ } \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 4); \
+ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
+ *sse_ptr = sse; \
+ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+#define FNS(opt) \
+ FN(128, 128, 16, 7, 7, opt, (int64_t)) \
+ FN(128, 64, 16, 7, 6, opt, (int64_t)) \
+ FN(64, 128, 16, 6, 7, opt, (int64_t)) \
+ FN(64, 64, 16, 6, 6, opt, (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt, (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt, (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt, (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt, (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt, (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt, (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt, (int64_t)) \
+ FN(8, 16, 8, 3, 4, opt, (int64_t)) \
+ FN(8, 8, 8, 3, 3, opt, (int64_t)) \
+ FN(8, 4, 8, 3, 2, opt, (int64_t)) \
+ FN(16, 4, 16, 4, 2, opt, (int64_t)) \
+ FN(8, 32, 8, 3, 5, opt, (int64_t)) \
+ FN(32, 8, 16, 5, 3, opt, (int64_t)) \
+ FN(16, 64, 16, 4, 6, opt, (int64_t)) \
+ FN(64, 16, 16, 6, 4, opt, (int64_t))
+
+FNS(sse2)
+
+#undef FNS
+#undef FN
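The FN wrappers above only have 8- and 16-wide assembly kernels to call, so wider blocks are covered by running the kernel on successive wf-wide column strips (plus a second 64-pixel pass for widths above 64), and the 12-bit variant additionally walks the block in 16-row slices so each call's 32-bit SSE stays within range before it is widened into the 64-bit accumulator. A rough structural sketch of that tiling, where kernel_wf_xh() is a hypothetical stand-in for the per-strip assembly kernel:

#include <stdint.h>

// Structural sketch only: cover a w x h block with wf-wide strips and
// row_chunk-row slices, widening each partial SSE before accumulating.
static void tile_block_sketch(int w, int h, int wf, int row_chunk,
                              int64_t *total_se, uint64_t *total_sse) {
  for (int row = 0; row < h; row += row_chunk) {
    const int rows = (h - row < row_chunk) ? h - row : row_chunk;
    for (int col = 0; col < w; col += wf) {
      unsigned int sse_part = 0;
      int se_part = 0;
      // se_part = kernel_wf_xh(... offset by (row, col), rows, &sse_part ...);
      (void)rows;
      *total_se += se_part;
      *total_sse += sse_part;  // widen to 64 bits before accumulating
    }
  }
}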
+
+// The 2 unused parameters are placeholders for PIC-enabled builds.
+#define DECL(w, opt) \
+ int aom_highbd_sub_pixel_avg_variance##w##xh_##opt( \
+ const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+ const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec, \
+ ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \
+ void *unused);
+#define DECLS(opt) \
+ DECL(16, opt) \
+ DECL(8, opt)
+
+DECLS(sse2)
+#undef DECL
+#undef DECLS
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+ uint32_t aom_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
+ NULL, NULL); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, \
+ sec + wf, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \
+ dst_stride, sec + 2 * wf, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \
+ dst_stride, sec + 3 * wf, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sse_ptr = sse; \
+ return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \
+ } \
+ \
+ uint32_t aom_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ int64_t var; \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
+ NULL, NULL); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, \
+ sec + wf, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \
+ dst_stride, sec + 2 * wf, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \
+ dst_stride, sec + 3 * wf, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 2); \
+ sse = ROUND_POWER_OF_TWO(sse, 4); \
+ *sse_ptr = sse; \
+ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t aom_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ int start_row; \
+ int64_t var; \
+ uint32_t sse; \
+ int se = 0; \
+ uint64_t long_sse = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ for (start_row = 0; start_row < h; start_row += 16) { \
+ uint32_t sse2; \
+ int height = h - start_row < 16 ? h - start_row : 16; \
+ int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + (start_row * src_stride), src_stride, x_offset, y_offset, \
+ dst + (start_row * dst_stride), dst_stride, sec + (start_row * w), \
+ w, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf) { \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + wf + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, dst + wf + (start_row * dst_stride), dst_stride, \
+ sec + wf + (start_row * w), w, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 2 * wf + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, dst + 2 * wf + (start_row * dst_stride), dst_stride, \
+ sec + 2 * wf + (start_row * w), w, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 3 * wf + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, dst + 3 * wf + (start_row * dst_stride), dst_stride, \
+ sec + 3 * wf + (start_row * w), w, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ } \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 4); \
+ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
+ *sse_ptr = sse; \
+ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+#define FNS(opt) \
+ FN(64, 64, 16, 6, 6, opt, (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt, (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt, (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt, (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt, (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt, (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt, (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt, (int64_t)) \
+ FN(8, 16, 8, 3, 4, opt, (int64_t)) \
+ FN(8, 8, 8, 3, 3, opt, (int64_t)) \
+ FN(8, 4, 8, 3, 2, opt, (int64_t)) \
+ FN(16, 4, 16, 4, 2, opt, (int64_t)) \
+ FN(8, 32, 8, 3, 5, opt, (int64_t)) \
+ FN(32, 8, 16, 5, 3, opt, (int64_t)) \
+ FN(16, 64, 16, 4, 6, opt, (int64_t)) \
+ FN(64, 16, 16, 6, 4, opt, (int64_t))
+
+FNS(sse2)
+
+#undef FNS
+#undef FN
+
+static INLINE void highbd_compute_dist_wtd_comp_avg(__m128i *p0, __m128i *p1,
+ const __m128i *w0,
+ const __m128i *w1,
+ const __m128i *r,
+ void *const result) {
+ assert(DIST_PRECISION_BITS <= 4);
+ __m128i mult0 = _mm_mullo_epi16(*p0, *w0);
+ __m128i mult1 = _mm_mullo_epi16(*p1, *w1);
+ __m128i sum = _mm_adds_epu16(mult0, mult1);
+ __m128i round = _mm_adds_epu16(sum, *r);
+ __m128i shift = _mm_srli_epi16(round, DIST_PRECISION_BITS);
+
+ xx_storeu_128(result, shift);
+}
+
+void aom_highbd_dist_wtd_comp_avg_pred_sse2(
+ uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
+ const uint8_t *ref8, int ref_stride,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ int i;
+ const int16_t wt0 = (int16_t)jcp_param->fwd_offset;
+ const int16_t wt1 = (int16_t)jcp_param->bck_offset;
+ const __m128i w0 = _mm_set1_epi16(wt0);
+ const __m128i w1 = _mm_set1_epi16(wt1);
+ const int16_t round = (int16_t)((1 << DIST_PRECISION_BITS) >> 1);
+ const __m128i r = _mm_set1_epi16(round);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+
+ if (width >= 8) {
+ // Read 8 pixels one row at a time
+ assert(!(width & 7));
+ for (i = 0; i < height; ++i) {
+ int j;
+ for (j = 0; j < width; j += 8) {
+ __m128i p0 = xx_loadu_128(ref);
+ __m128i p1 = xx_loadu_128(pred);
+
+ highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred);
+
+ comp_pred += 8;
+ pred += 8;
+ ref += 8;
+ }
+ ref += ref_stride - width;
+ }
+ } else {
+ // Read 4 pixels two rows at a time
+ assert(!(width & 3));
+ for (i = 0; i < height; i += 2) {
+ __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride);
+ __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride);
+ __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1);
+ __m128i p1 = xx_loadu_128(pred);
+
+ highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred);
+
+ comp_pred += 8;
+ pred += 8;
+ ref += 2 * ref_stride;
+ }
+ }
+}
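Per pixel, the code above forms the distance-weighted average (fwd_offset * ref + bck_offset * pred + rounding) >> DIST_PRECISION_BITS, eight 16-bit samples at a time. A scalar sketch of the same arithmetic, assuming DIST_PRECISION_BITS is 4 (consistent with the assert in the helper):

#include <stdint.h>

#define DIST_PRECISION_BITS_SKETCH 4  // assumed value, per the assert above

// Scalar per-pixel form of the distance-weighted compound average.
static uint16_t dist_wtd_avg_pixel(uint16_t ref, uint16_t pred, int wt0,
                                   int wt1) {
  const int rounding = (1 << DIST_PRECISION_BITS_SKETCH) >> 1;
  return (uint16_t)((ref * wt0 + pred * wt1 + rounding) >>
                    DIST_PRECISION_BITS_SKETCH);
}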
+
+uint64_t aom_mse_4xh_16bit_highbd_sse2(uint16_t *dst, int dstride,
+ uint16_t *src, int sstride, int h) {
+ uint64_t sum = 0;
+ __m128i reg0_4x16, reg1_4x16;
+ __m128i src_8x16;
+ __m128i dst_8x16;
+ __m128i res0_4x32, res1_4x32, res0_4x64, res1_4x64, res2_4x64, res3_4x64;
+ __m128i sub_result_8x16;
+ const __m128i zeros = _mm_setzero_si128();
+ __m128i square_result = _mm_setzero_si128();
+ for (int i = 0; i < h; i += 2) {
+ reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 0) * dstride]));
+ reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 1) * dstride]));
+ dst_8x16 = _mm_unpacklo_epi64(reg0_4x16, reg1_4x16);
+
+ reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride]));
+ reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride]));
+ src_8x16 = _mm_unpacklo_epi64(reg0_4x16, reg1_4x16);
+
+ sub_result_8x16 = _mm_sub_epi16(src_8x16, dst_8x16);
+
+ res0_4x32 = _mm_unpacklo_epi16(sub_result_8x16, zeros);
+ res1_4x32 = _mm_unpackhi_epi16(sub_result_8x16, zeros);
+
+ res0_4x32 = _mm_madd_epi16(res0_4x32, res0_4x32);
+ res1_4x32 = _mm_madd_epi16(res1_4x32, res1_4x32);
+
+ res0_4x64 = _mm_unpacklo_epi32(res0_4x32, zeros);
+ res1_4x64 = _mm_unpackhi_epi32(res0_4x32, zeros);
+ res2_4x64 = _mm_unpacklo_epi32(res1_4x32, zeros);
+ res3_4x64 = _mm_unpackhi_epi32(res1_4x32, zeros);
+
+ square_result = _mm_add_epi64(
+ square_result,
+ _mm_add_epi64(
+ _mm_add_epi64(_mm_add_epi64(res0_4x64, res1_4x64), res2_4x64),
+ res3_4x64));
+ }
+
+ const __m128i sum_1x64 =
+ _mm_add_epi64(square_result, _mm_srli_si128(square_result, 8));
+ xx_storel_64(&sum, sum_1x64);
+ return sum;
+}
+
+uint64_t aom_mse_8xh_16bit_highbd_sse2(uint16_t *dst, int dstride,
+ uint16_t *src, int sstride, int h) {
+ uint64_t sum = 0;
+ __m128i src_8x16;
+ __m128i dst_8x16;
+ __m128i res0_4x32, res1_4x32, res0_4x64, res1_4x64, res2_4x64, res3_4x64;
+ __m128i sub_result_8x16;
+ const __m128i zeros = _mm_setzero_si128();
+ __m128i square_result = _mm_setzero_si128();
+
+ for (int i = 0; i < h; i++) {
+ dst_8x16 = _mm_loadu_si128((__m128i *)&dst[i * dstride]);
+ src_8x16 = _mm_loadu_si128((__m128i *)&src[i * sstride]);
+
+ sub_result_8x16 = _mm_sub_epi16(src_8x16, dst_8x16);
+
+ res0_4x32 = _mm_unpacklo_epi16(sub_result_8x16, zeros);
+ res1_4x32 = _mm_unpackhi_epi16(sub_result_8x16, zeros);
+
+ res0_4x32 = _mm_madd_epi16(res0_4x32, res0_4x32);
+ res1_4x32 = _mm_madd_epi16(res1_4x32, res1_4x32);
+
+ res0_4x64 = _mm_unpacklo_epi32(res0_4x32, zeros);
+ res1_4x64 = _mm_unpackhi_epi32(res0_4x32, zeros);
+ res2_4x64 = _mm_unpacklo_epi32(res1_4x32, zeros);
+ res3_4x64 = _mm_unpackhi_epi32(res1_4x32, zeros);
+
+ square_result = _mm_add_epi64(
+ square_result,
+ _mm_add_epi64(
+ _mm_add_epi64(_mm_add_epi64(res0_4x64, res1_4x64), res2_4x64),
+ res3_4x64));
+ }
+
+ const __m128i sum_1x64 =
+ _mm_add_epi64(square_result, _mm_srli_si128(square_result, 8));
+ xx_storel_64(&sum, sum_1x64);
+ return sum;
+}
+
+uint64_t aom_mse_wxh_16bit_highbd_sse2(uint16_t *dst, int dstride,
+ uint16_t *src, int sstride, int w,
+ int h) {
+  assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
+         "w and h must each be 4 or 8");
+ switch (w) {
+ case 4: return aom_mse_4xh_16bit_highbd_sse2(dst, dstride, src, sstride, h);
+ case 8: return aom_mse_8xh_16bit_highbd_sse2(dst, dstride, src, sstride, h);
+ default: assert(0 && "unsupported width"); return -1;
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
new file mode 100644
index 0000000000..df5449a9df
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/variance.h"
+#include "aom_dsp/aom_filter.h"
+
+static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ uint64_t *sse, int64_t *sum) {
+ __m128i u0, u1, u2, u3;
+ __m128i s0, s1, s2, s3;
+ __m128i t0, t1, x0, y0;
+ __m128i a0, a1, a2, a3;
+ __m128i b0, b1, b2, b3;
+ __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1);
+
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+
+ a0 = _mm_loadl_epi64((__m128i const *)(a + 0 * a_stride));
+ a1 = _mm_loadl_epi64((__m128i const *)(a + 1 * a_stride));
+ a2 = _mm_loadl_epi64((__m128i const *)(a + 2 * a_stride));
+ a3 = _mm_loadl_epi64((__m128i const *)(a + 3 * a_stride));
+
+ b0 = _mm_loadl_epi64((__m128i const *)(b + 0 * b_stride));
+ b1 = _mm_loadl_epi64((__m128i const *)(b + 1 * b_stride));
+ b2 = _mm_loadl_epi64((__m128i const *)(b + 2 * b_stride));
+ b3 = _mm_loadl_epi64((__m128i const *)(b + 3 * b_stride));
+
+ u0 = _mm_unpacklo_epi16(a0, a1);
+ u1 = _mm_unpacklo_epi16(a2, a3);
+ u2 = _mm_unpacklo_epi16(b0, b1);
+ u3 = _mm_unpacklo_epi16(b2, b3);
+
+ s0 = _mm_sub_epi16(u0, u2);
+ s1 = _mm_sub_epi16(u1, u3);
+
+ t0 = _mm_madd_epi16(s0, k_one_epi16);
+ t1 = _mm_madd_epi16(s1, k_one_epi16);
+
+ s2 = _mm_hadd_epi32(t0, t1);
+ s3 = _mm_hadd_epi32(s2, s2);
+ y0 = _mm_hadd_epi32(s3, s3);
+
+ t0 = _mm_madd_epi16(s0, s0);
+ t1 = _mm_madd_epi16(s1, s1);
+
+ s2 = _mm_hadd_epi32(t0, t1);
+ s3 = _mm_hadd_epi32(s2, s2);
+ x0 = _mm_hadd_epi32(s3, s3);
+
+ *sse = (uint64_t)_mm_extract_epi32(x0, 0);
+ *sum = (int64_t)_mm_extract_epi32(y0, 0);
+}
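The kernel above derives both statistics from the same 16-bit difference vectors: pmaddwd against a vector of ones produces pairwise 32-bit sums (the Sum path), pmaddwd of the differences with themselves produces the squared terms (the SSE path), and the chain of hadd instructions collapses each to a single lane. A small illustration of that horizontal-sum idiom, assuming SSSE3/SSE4.1 is available as it is in this file:

#include <smmintrin.h>
#include <stdint.h>

// madd against a vector of ones turns eight 16-bit lanes into four pairwise
// 32-bit sums; two hadd steps then collapse them to one scalar.
static int32_t horizontal_sum_epi16(__m128i v) {
  const __m128i ones = _mm_set1_epi16(1);
  __m128i pairs = _mm_madd_epi16(v, ones);  // 8 x int16 -> 4 x int32
  pairs = _mm_hadd_epi32(pairs, pairs);     // 4 -> 2
  pairs = _mm_hadd_epi32(pairs, pairs);     // 2 -> 1 (replicated)
  return _mm_cvtsi128_si32(pairs);
}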
+
+uint32_t aom_highbd_8_variance4x4_sse4_1(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ int64_t sum, diff;
+ uint64_t local_sse;
+
+ variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+ *sse = (uint32_t)local_sse;
+
+ diff = (int64_t)*sse - ((sum * sum) >> 4);
+ return (diff >= 0) ? (uint32_t)diff : 0;
+}
+
+uint32_t aom_highbd_10_variance4x4_sse4_1(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ int64_t sum, diff;
+ uint64_t local_sse;
+
+ variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 4);
+ sum = ROUND_POWER_OF_TWO(sum, 2);
+
+ diff = (int64_t)*sse - ((sum * sum) >> 4);
+ return (diff >= 0) ? (uint32_t)diff : 0;
+}
+
+uint32_t aom_highbd_12_variance4x4_sse4_1(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ int64_t sum, diff;
+ uint64_t local_sse;
+
+ variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 8);
+ sum = ROUND_POWER_OF_TWO(sum, 4);
+
+ diff = (int64_t)*sse - ((sum * sum) >> 4);
+ return diff >= 0 ? (uint32_t)diff : 0;
+}
+
+// Sub-pixel
+uint32_t aom_highbd_8_sub_pixel_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, uint32_t *sse) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+
+ aom_highbd_var_filter_block2d_bil_first_pass(
+ src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+ aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, dst_stride,
+ sse);
+}
+
+uint32_t aom_highbd_10_sub_pixel_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, uint32_t *sse) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+
+ aom_highbd_var_filter_block2d_bil_first_pass(
+ src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+ aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
+ dst_stride, sse);
+}
+
+uint32_t aom_highbd_12_sub_pixel_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, uint32_t *sse) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+
+ aom_highbd_var_filter_block2d_bil_first_pass(
+ src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+ aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
+ dst_stride, sse);
+}
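The sub-pixel variants above run a two-pass bilinear interpolation before measuring variance: the first pass filters (4 + 1) rows horizontally with the 2-tap pair selected by xoffset, the second pass filters that intermediate buffer vertically with the pair selected by yoffset, and the resulting 4x4 block is compared against dst. A hedged scalar sketch of a single bilinear pass, assuming the 2-tap filter pairs sum to 128 (7-bit filter precision):

#include <stdint.h>

#define BIL_FILTER_BITS 7  // assumed precision of the 2-tap bilinear filters

// One bilinear pass: out[i] is the rounded weighted average of in[i] and
// in[i + step], where step selects horizontal or vertical filtering.
static void bilinear_pass_sketch(const uint16_t *in, uint16_t *out, int count,
                                 int step, int f0, int f1) {
  for (int i = 0; i < count; ++i) {
    const int v = in[i] * f0 + in[i + step] * f1;
    out[i] = (uint16_t)((v + (1 << (BIL_FILTER_BITS - 1))) >> BIL_FILTER_BITS);
  }
}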
+
+// Sub-pixel average
+
+uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, uint32_t *sse,
+ const uint8_t *second_pred) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+ DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
+
+ aom_highbd_var_filter_block2d_bil_first_pass(
+ src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+ aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
+ CONVERT_TO_BYTEPTR(temp2), 4);
+
+ return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride,
+ sse);
+}
+
+uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, uint32_t *sse,
+ const uint8_t *second_pred) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+ DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
+
+ aom_highbd_var_filter_block2d_bil_first_pass(
+ src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+ aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
+ CONVERT_TO_BYTEPTR(temp2), 4);
+
+ return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
+ dst_stride, sse);
+}
+
+uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, uint32_t *sse,
+ const uint8_t *second_pred) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+ DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
+
+ aom_highbd_var_filter_block2d_bil_first_pass(
+ src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+ aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
+ CONVERT_TO_BYTEPTR(temp2), 4);
+
+ return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
+ dst_stride, sse);
+}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_asm_sse2.asm b/third_party/aom/aom_dsp/x86/intrapred_asm_sse2.asm
new file mode 100644
index 0000000000..0eb632326b
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/intrapred_asm_sse2.asm
@@ -0,0 +1,608 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pb_1: times 16 db 1
+pw_4: times 8 dw 4
+pw_8: times 8 dw 8
+pw_16: times 8 dw 16
+pw_32: times 8 dw 32
+dc_128: times 16 db 128
+pw2_4: times 8 dw 2
+pw2_8: times 8 dw 4
+pw2_16: times 8 dw 8
+pw2_32: times 8 dw 16
+
+SECTION .text
+
+INIT_XMM sse2
+cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ movd m2, [leftq]
+ movd m0, [aboveq]
+ pxor m1, m1
+ punpckldq m0, m2
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw_4)]
+ psraw m0, 3
+ pshuflw m0, m0, 0x0
+ packuswb m0, m0
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+
+ RESTORE_GOT
+ RET
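In scalar form, the 4x4 DC predictor above averages the four above and four left neighbours with round-to-nearest (the pw_4 constant) and writes that value to every pixel of the block; the dc_top/dc_left variants average one side only, using the halved pw2_* rounding constants, and dc_128 simply fills the block with 128. A small C sketch of the full-DC case:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Scalar sketch of dc_predictor_4x4: dc = (sum(above) + sum(left) + 4) >> 3,
// i.e. the rounded mean of the eight neighbours, replicated over the block.
static void dc_predictor_4x4_sketch(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above,
                                    const uint8_t *left) {
  int sum = 0;
  for (int i = 0; i < 4; ++i) sum += above[i] + left[i];
  const uint8_t dc = (uint8_t)((sum + 4) >> 3);
  for (int r = 0; r < 4; ++r) memset(dst + r * stride, dc, 4);
}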
+
+INIT_XMM sse2
+cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset
+ movifnidn leftq, leftmp
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movd m0, [leftq]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_4)]
+ psraw m0, 2
+ pshuflw m0, m0, 0x0
+ packuswb m0, m0
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movd m0, [aboveq]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_4)]
+ psraw m0, 2
+ pshuflw m0, m0, 0x0
+ packuswb m0, m0
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movq m0, [aboveq]
+ movq m2, [leftq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw_8)]
+ psraw m0, 4
+ punpcklbw m0, m0
+ pshuflw m0, m0, 0x0
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movq m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_8)]
+ psraw m0, 3
+ punpcklbw m0, m0
+ pshuflw m0, m0, 0x0
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset
+ movifnidn leftq, leftmp
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movq m0, [leftq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_8)]
+ psraw m0, 3
+ punpcklbw m0, m0
+ pshuflw m0, m0, 0x0
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ movd m0, [GLOBAL(dc_128)]
+ movd [dstq ], m0
+ movd [dstq+strideq ], m0
+ movd [dstq+strideq*2], m0
+ movd [dstq+stride3q ], m0
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ movq m0, [GLOBAL(dc_128)]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m2, [leftq]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw_16)]
+ psraw m0, 5
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+
+INIT_XMM sse2
+cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ psadbw m0, m1
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_16)]
+ psraw m0, 4
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [leftq]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ psadbw m0, m1
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_16)]
+ psraw m0, 4
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ mova m0, [GLOBAL(dc_128)]
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+ RESTORE_GOT
+ RET
+
+
+INIT_XMM sse2
+cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m2, [aboveq+16]
+ mova m3, [leftq]
+ mova m4, [leftq+16]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ psadbw m0, m1
+ psadbw m2, m1
+ psadbw m3, m1
+ psadbw m4, m1
+ paddw m0, m2
+ paddw m0, m3
+ paddw m0, m4
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw_32)]
+ psraw m0, 6
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m2, [aboveq+16]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_32)]
+ psraw m0, 5
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [leftq]
+ mova m2, [leftq+16]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_32)]
+ psraw m0, 5
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ mova m0, [GLOBAL(dc_128)]
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
+ movd m0, [aboveq]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ RET
+
+INIT_XMM sse2
+cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
+ movq m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse2
+cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
+ mova m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3, nlines4
+ lea stride3q, [strideq*3]
+ mov nlines4d, 4
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec nlines4d
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
+ mova m0, [aboveq]
+ mova m1, [aboveq+16]
+ DEFINE_ARGS dst, stride, stride3, nlines4
+ lea stride3q, [strideq*3]
+ mov nlines4d, 8
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m1
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m1
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m1
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m1
+ lea dstq, [dstq+strideq*4]
+ dec nlines4d
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left
+ movifnidn leftq, leftmp
+ movd m0, [leftq]
+ punpcklbw m0, m0
+ punpcklbw m0, m0
+ pshufd m1, m0, 0x1
+ movd [dstq ], m0
+ movd [dstq+strideq], m1
+ pshufd m2, m0, 0x2
+ lea dstq, [dstq+strideq*2]
+ pshufd m3, m0, 0x3
+ movd [dstq ], m2
+ movd [dstq+strideq], m3
+ RET
+
+INIT_XMM sse2
+cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left
+ movifnidn leftq, leftmp
+ mov lineq, -2
+ DEFINE_ARGS dst, stride, line, left, stride3
+ lea stride3q, [strideq*3]
+ movq m0, [leftq ]
+ punpcklbw m0, m0 ; l1 l1 l2 l2 ... l8 l8
+.loop:
+ pshuflw m1, m0, 0x0 ; l1 l1 l1 l1 l1 l1 l1 l1
+ pshuflw m2, m0, 0x55 ; l2 l2 l2 l2 l2 l2 l2 l2
+ movq [dstq ], m1
+ movq [dstq+strideq], m2
+ pshuflw m1, m0, 0xaa
+ pshuflw m2, m0, 0xff
+ movq [dstq+strideq*2], m1
+ movq [dstq+stride3q ], m2
+ pshufd m0, m0, 0xe ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8
+ inc lineq
+ lea dstq, [dstq+strideq*4]
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left
+ movifnidn leftq, leftmp
+ mov lineq, -4
+ DEFINE_ARGS dst, stride, line, left, stride3
+ lea stride3q, [strideq*3]
+.loop:
+ movd m0, [leftq]
+ punpcklbw m0, m0
+ punpcklbw m0, m0 ; l1 to l4 each repeated 4 times
+ pshufd m1, m0, 0x0 ; l1 repeated 16 times
+ pshufd m2, m0, 0x55 ; l2 repeated 16 times
+ mova [dstq ], m1
+ mova [dstq+strideq ], m2
+ pshufd m1, m0, 0xaa
+ pshufd m2, m0, 0xff
+ mova [dstq+strideq*2], m1
+ mova [dstq+stride3q ], m2
+ inc lineq
+ lea leftq, [leftq+4 ]
+ lea dstq, [dstq+strideq*4]
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left
+ movifnidn leftq, leftmp
+ mov lineq, -8
+ DEFINE_ARGS dst, stride, line, left, stride3
+ lea stride3q, [strideq*3]
+.loop:
+ movd m0, [leftq]
+ punpcklbw m0, m0
+ punpcklbw m0, m0 ; l1 to l4 each repeated 4 times
+ pshufd m1, m0, 0x0 ; l1 repeated 16 times
+ pshufd m2, m0, 0x55 ; l2 repeated 16 times
+ mova [dstq ], m1
+ mova [dstq+16 ], m1
+ mova [dstq+strideq ], m2
+ mova [dstq+strideq+16 ], m2
+ pshufd m1, m0, 0xaa
+ pshufd m2, m0, 0xff
+ mova [dstq+strideq*2 ], m1
+ mova [dstq+strideq*2+16], m1
+ mova [dstq+stride3q ], m2
+ mova [dstq+stride3q+16 ], m2
+ inc lineq
+ lea leftq, [leftq+4 ]
+ lea dstq, [dstq+strideq*4]
+ jnz .loop
+ REP_RET
diff --git a/third_party/aom/aom_dsp/x86/intrapred_avx2.c b/third_party/aom/aom_dsp/x86/intrapred_avx2.c
new file mode 100644
index 0000000000..242a548df9
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/intrapred_avx2.c
@@ -0,0 +1,4707 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "aom_dsp/x86/intrapred_x86.h"
+#include "aom_dsp/x86/intrapred_utils.h"
+#include "aom_dsp/x86/lpf_common_sse2.h"
+
+static INLINE __m256i dc_sum_64(const uint8_t *ref) {
+ const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i y0 = _mm256_sad_epu8(x0, zero);
+ __m256i y1 = _mm256_sad_epu8(x1, zero);
+ y0 = _mm256_add_epi64(y0, y1);
+ __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1);
+ y0 = _mm256_add_epi64(u0, y0);
+ u0 = _mm256_unpackhi_epi64(y0, y0);
+ return _mm256_add_epi16(y0, u0);
+}
+
+static INLINE __m256i dc_sum_32(const uint8_t *ref) {
+ const __m256i x = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i y = _mm256_sad_epu8(x, zero);
+ __m256i u = _mm256_permute2x128_si256(y, y, 1);
+ y = _mm256_add_epi64(u, y);
+ u = _mm256_unpackhi_epi64(y, y);
+ return _mm256_add_epi16(y, u);
+}
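Both helpers above rely on the same reduction idiom: a SAD against zero sums each group of eight bytes into a 64-bit lane, and the subsequent lane/half swaps fold everything into the low lane. A 128-bit illustration of the idea for 16 bytes (a sketch of the idiom, not a function from this file):

#include <emmintrin.h>
#include <stdint.h>

// Sum 16 unsigned bytes via psadbw against zero, the same idiom dc_sum_32
// and dc_sum_64 use at 256-bit width above.
static int dc_sum_16_sketch(const uint8_t *ref) {
  const __m128i x = _mm_loadu_si128((const __m128i *)ref);
  const __m128i zero = _mm_setzero_si128();
  const __m128i sad = _mm_sad_epu8(x, zero);         // two 64-bit partial sums
  const __m128i hi = _mm_unpackhi_epi64(sad, sad);   // move high sum down
  return _mm_cvtsi128_si32(_mm_add_epi64(sad, hi));  // fold to one scalar
}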
+
+static INLINE void row_store_32xh(const __m256i *r, int height, uint8_t *dst,
+ ptrdiff_t stride) {
+ for (int i = 0; i < height; ++i) {
+ _mm256_storeu_si256((__m256i *)dst, *r);
+ dst += stride;
+ }
+}
+
+static INLINE void row_store_32x2xh(const __m256i *r0, const __m256i *r1,
+ int height, uint8_t *dst,
+ ptrdiff_t stride) {
+ for (int i = 0; i < height; ++i) {
+ _mm256_storeu_si256((__m256i *)dst, *r0);
+ _mm256_storeu_si256((__m256i *)(dst + 32), *r1);
+ dst += stride;
+ }
+}
+
+static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst,
+ ptrdiff_t stride) {
+ for (int i = 0; i < height; ++i) {
+ _mm256_storeu_si256((__m256i *)dst, *r);
+ _mm256_storeu_si256((__m256i *)(dst + 32), *r);
+ dst += stride;
+ }
+}
+
+static DECLARE_ALIGNED(16, uint8_t, HighbdLoadMaskx[8][16]) = {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
+ { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
+};
+
+static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx4[4][16]) = {
+ { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 },
+ { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 4, 5, 8, 9, 12, 13 },
+ { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 6, 7, 10, 11 },
+ { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 0, 1, 8, 9 }
+};
+
+static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx[8][32]) = {
+ { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29,
+ 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 },
+ { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27,
+ 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 },
+ { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25,
+ 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27 },
+ { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23,
+ 0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21,
+ 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19, 22, 23 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19,
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17, 20, 21 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17,
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15, 18, 19 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15,
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 16, 17 }
+};
+
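+// HighbdBaseMask[k] has its first k 16-bit lanes set to all-ones and the
+// remaining lanes zero, so it can select the first k valid pixels of a row.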
+static DECLARE_ALIGNED(32, uint16_t, HighbdBaseMask[17][16]) = {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0,
+ 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0,
+ 0, 0, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0, 0, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff }
+};
+
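+// Transpose the low four 16-bit elements of x[0..15] (a 16x4 tile) into four
+// rows of 16 pixels; row r is returned as the pair { d[2*r], d[2*r+1] }.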
+static INLINE void highbd_transpose16x4_8x8_sse2(__m128i *x, __m128i *d) {
+ __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+
+ r0 = _mm_unpacklo_epi16(x[0], x[1]);
+ r1 = _mm_unpacklo_epi16(x[2], x[3]);
+ r2 = _mm_unpacklo_epi16(x[4], x[5]);
+ r3 = _mm_unpacklo_epi16(x[6], x[7]);
+
+ r4 = _mm_unpacklo_epi16(x[8], x[9]);
+ r5 = _mm_unpacklo_epi16(x[10], x[11]);
+ r6 = _mm_unpacklo_epi16(x[12], x[13]);
+ r7 = _mm_unpacklo_epi16(x[14], x[15]);
+
+ r8 = _mm_unpacklo_epi32(r0, r1);
+ r9 = _mm_unpackhi_epi32(r0, r1);
+ r10 = _mm_unpacklo_epi32(r2, r3);
+ r11 = _mm_unpackhi_epi32(r2, r3);
+
+ r12 = _mm_unpacklo_epi32(r4, r5);
+ r13 = _mm_unpackhi_epi32(r4, r5);
+ r14 = _mm_unpacklo_epi32(r6, r7);
+ r15 = _mm_unpackhi_epi32(r6, r7);
+
+ r0 = _mm_unpacklo_epi64(r8, r9);
+ r1 = _mm_unpackhi_epi64(r8, r9);
+ r2 = _mm_unpacklo_epi64(r10, r11);
+ r3 = _mm_unpackhi_epi64(r10, r11);
+
+ r4 = _mm_unpacklo_epi64(r12, r13);
+ r5 = _mm_unpackhi_epi64(r12, r13);
+ r6 = _mm_unpacklo_epi64(r14, r15);
+ r7 = _mm_unpackhi_epi64(r14, r15);
+
+ d[0] = _mm_unpacklo_epi64(r0, r2);
+ d[1] = _mm_unpacklo_epi64(r4, r6);
+ d[2] = _mm_unpacklo_epi64(r1, r3);
+ d[3] = _mm_unpacklo_epi64(r5, r7);
+
+ d[4] = _mm_unpackhi_epi64(r0, r2);
+ d[5] = _mm_unpackhi_epi64(r4, r6);
+ d[6] = _mm_unpackhi_epi64(r1, r3);
+ d[7] = _mm_unpackhi_epi64(r5, r7);
+}
+
+static INLINE void highbd_transpose4x16_avx2(__m256i *x, __m256i *d) {
+ __m256i w0, w1, w2, w3, ww0, ww1;
+
+ w0 = _mm256_unpacklo_epi16(x[0], x[1]); // 00 10 01 11 02 12 03 13
+ w1 = _mm256_unpacklo_epi16(x[2], x[3]); // 20 30 21 31 22 32 23 33
+ w2 = _mm256_unpackhi_epi16(x[0], x[1]); // 40 50 41 51 42 52 43 53
+ w3 = _mm256_unpackhi_epi16(x[2], x[3]); // 60 70 61 71 62 72 63 73
+
+ ww0 = _mm256_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31
+ ww1 = _mm256_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71
+
+ d[0] = _mm256_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70
+ d[1] = _mm256_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71
+
+ ww0 = _mm256_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33
+ ww1 = _mm256_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73
+
+ d[2] = _mm256_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72
+ d[3] = _mm256_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73
+}
+
+static INLINE void highbd_transpose8x16_16x8_avx2(__m256i *x, __m256i *d) {
+ __m256i w0, w1, w2, w3, ww0, ww1;
+
+ w0 = _mm256_unpacklo_epi16(x[0], x[1]); // 00 10 01 11 02 12 03 13
+ w1 = _mm256_unpacklo_epi16(x[2], x[3]); // 20 30 21 31 22 32 23 33
+ w2 = _mm256_unpacklo_epi16(x[4], x[5]); // 40 50 41 51 42 52 43 53
+ w3 = _mm256_unpacklo_epi16(x[6], x[7]); // 60 70 61 71 62 72 63 73
+
+ ww0 = _mm256_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31
+ ww1 = _mm256_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71
+
+ d[0] = _mm256_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70
+ d[1] = _mm256_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71
+
+ ww0 = _mm256_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33
+ ww1 = _mm256_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73
+
+ d[2] = _mm256_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72
+ d[3] = _mm256_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73
+
+ w0 = _mm256_unpackhi_epi16(x[0], x[1]); // 04 14 05 15 06 16 07 17
+ w1 = _mm256_unpackhi_epi16(x[2], x[3]); // 24 34 25 35 26 36 27 37
+ w2 = _mm256_unpackhi_epi16(x[4], x[5]); // 44 54 45 55 46 56 47 57
+ w3 = _mm256_unpackhi_epi16(x[6], x[7]); // 64 74 65 75 66 76 67 77
+
+ ww0 = _mm256_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35
+ ww1 = _mm256_unpacklo_epi32(w2, w3); // 44 54 64 74 45 55 65 75
+
+ d[4] = _mm256_unpacklo_epi64(ww0, ww1); // 04 14 24 34 44 54 64 74
+ d[5] = _mm256_unpackhi_epi64(ww0, ww1); // 05 15 25 35 45 55 65 75
+
+ ww0 = _mm256_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37
+ ww1 = _mm256_unpackhi_epi32(w2, w3); // 46 56 66 76 47 57 67 77
+
+ d[6] = _mm256_unpacklo_epi64(ww0, ww1); // 06 16 26 36 46 56 66 76
+ d[7] = _mm256_unpackhi_epi64(ww0, ww1); // 07 17 27 37 47 57 67 77
+}
+
+static INLINE void highbd_transpose16x16_avx2(__m256i *x, __m256i *d) {
+ __m256i w0, w1, w2, w3, ww0, ww1;
+ __m256i dd[16];
+ w0 = _mm256_unpacklo_epi16(x[0], x[1]);
+ w1 = _mm256_unpacklo_epi16(x[2], x[3]);
+ w2 = _mm256_unpacklo_epi16(x[4], x[5]);
+ w3 = _mm256_unpacklo_epi16(x[6], x[7]);
+
+ ww0 = _mm256_unpacklo_epi32(w0, w1); //
+ ww1 = _mm256_unpacklo_epi32(w2, w3); //
+
+ dd[0] = _mm256_unpacklo_epi64(ww0, ww1);
+ dd[1] = _mm256_unpackhi_epi64(ww0, ww1);
+
+ ww0 = _mm256_unpackhi_epi32(w0, w1); //
+ ww1 = _mm256_unpackhi_epi32(w2, w3); //
+
+ dd[2] = _mm256_unpacklo_epi64(ww0, ww1);
+ dd[3] = _mm256_unpackhi_epi64(ww0, ww1);
+
+ w0 = _mm256_unpackhi_epi16(x[0], x[1]);
+ w1 = _mm256_unpackhi_epi16(x[2], x[3]);
+ w2 = _mm256_unpackhi_epi16(x[4], x[5]);
+ w3 = _mm256_unpackhi_epi16(x[6], x[7]);
+
+ ww0 = _mm256_unpacklo_epi32(w0, w1); //
+ ww1 = _mm256_unpacklo_epi32(w2, w3); //
+
+ dd[4] = _mm256_unpacklo_epi64(ww0, ww1);
+ dd[5] = _mm256_unpackhi_epi64(ww0, ww1);
+
+ ww0 = _mm256_unpackhi_epi32(w0, w1); //
+ ww1 = _mm256_unpackhi_epi32(w2, w3); //
+
+ dd[6] = _mm256_unpacklo_epi64(ww0, ww1);
+ dd[7] = _mm256_unpackhi_epi64(ww0, ww1);
+
+ w0 = _mm256_unpacklo_epi16(x[8], x[9]);
+ w1 = _mm256_unpacklo_epi16(x[10], x[11]);
+ w2 = _mm256_unpacklo_epi16(x[12], x[13]);
+ w3 = _mm256_unpacklo_epi16(x[14], x[15]);
+
+ ww0 = _mm256_unpacklo_epi32(w0, w1);
+ ww1 = _mm256_unpacklo_epi32(w2, w3);
+
+ dd[8] = _mm256_unpacklo_epi64(ww0, ww1);
+ dd[9] = _mm256_unpackhi_epi64(ww0, ww1);
+
+ ww0 = _mm256_unpackhi_epi32(w0, w1);
+ ww1 = _mm256_unpackhi_epi32(w2, w3);
+
+ dd[10] = _mm256_unpacklo_epi64(ww0, ww1);
+ dd[11] = _mm256_unpackhi_epi64(ww0, ww1);
+
+ w0 = _mm256_unpackhi_epi16(x[8], x[9]);
+ w1 = _mm256_unpackhi_epi16(x[10], x[11]);
+ w2 = _mm256_unpackhi_epi16(x[12], x[13]);
+ w3 = _mm256_unpackhi_epi16(x[14], x[15]);
+
+ ww0 = _mm256_unpacklo_epi32(w0, w1);
+ ww1 = _mm256_unpacklo_epi32(w2, w3);
+
+ dd[12] = _mm256_unpacklo_epi64(ww0, ww1);
+ dd[13] = _mm256_unpackhi_epi64(ww0, ww1);
+
+ ww0 = _mm256_unpackhi_epi32(w0, w1);
+ ww1 = _mm256_unpackhi_epi32(w2, w3);
+
+ dd[14] = _mm256_unpacklo_epi64(ww0, ww1);
+ dd[15] = _mm256_unpackhi_epi64(ww0, ww1);
+
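+  // Recombine the 128-bit halves: d[i] pairs the low halves of dd[i] and
+  // dd[i + 8], while d[i + 8] pairs their high halves.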
+ for (int i = 0; i < 8; i++) {
+ d[i] = _mm256_insertf128_si256(dd[i], _mm256_castsi256_si128(dd[i + 8]), 1);
+ d[i + 8] = _mm256_insertf128_si256(dd[i + 8],
+ _mm256_extracti128_si256(dd[i], 1), 0);
+ }
+}
+
+void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i sum_above = dc_sum_32(above);
+ __m256i sum_left = dc_sum_32(left);
+ sum_left = _mm256_add_epi16(sum_left, sum_above);
+ const __m256i thirtytwo = _mm256_set1_epi16(32);
+ sum_left = _mm256_add_epi16(sum_left, thirtytwo);
+ sum_left = _mm256_srai_epi16(sum_left, 6);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum_left, zero);
+ row_store_32xh(&row, 32, dst, stride);
+}
+
+void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_32(above);
+ (void)left;
+
+ const __m256i sixteen = _mm256_set1_epi16(16);
+ sum = _mm256_add_epi16(sum, sixteen);
+ sum = _mm256_srai_epi16(sum, 5);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_32xh(&row, 32, dst, stride);
+}
+
+void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_32(left);
+ (void)above;
+
+ const __m256i sixteen = _mm256_set1_epi16(16);
+ sum = _mm256_add_epi16(sum, sixteen);
+ sum = _mm256_srai_epi16(sum, 5);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_32xh(&row, 32, dst, stride);
+}
+
+void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m256i row = _mm256_set1_epi8((int8_t)0x80);
+ row_store_32xh(&row, 32, dst, stride);
+}
+
+void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i row = _mm256_loadu_si256((const __m256i *)above);
+ (void)left;
+ row_store_32xh(&row, 32, dst, stride);
+}
+
+// There are 32 rows in total. This function handles rows
+// 0,1,2,3 and 16,17,18,19. The next call handles rows
+// 4,5,6,7 and 20,21,22,23, so four calls cover all 32 rows.
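+// In general, call k (k = 0..3) writes row 4*k+i from the low 128-bit half
+// of the shuffled row and row 4*k+16+i from the high half, for i = 0..3.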
+static INLINE void h_predictor_32x8line(const __m256i *row, uint8_t *dst,
+ ptrdiff_t stride) {
+ __m256i t[4];
+ __m256i m = _mm256_setzero_si256();
+ const __m256i inc = _mm256_set1_epi8(4);
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ t[i] = _mm256_shuffle_epi8(*row, m);
+ __m256i r0 = _mm256_permute2x128_si256(t[i], t[i], 0);
+ __m256i r1 = _mm256_permute2x128_si256(t[i], t[i], 0x11);
+ _mm256_storeu_si256((__m256i *)dst, r0);
+ _mm256_storeu_si256((__m256i *)(dst + (stride << 4)), r1);
+ dst += stride;
+ m = _mm256_add_epi8(m, inc);
+ }
+}
+
+void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ const __m256i left_col = _mm256_loadu_si256((__m256i const *)left);
+
+ __m256i u = _mm256_unpacklo_epi8(left_col, left_col);
+
+ __m256i v = _mm256_unpacklo_epi8(u, u);
+ h_predictor_32x8line(&v, dst, stride);
+ dst += stride << 2;
+
+ v = _mm256_unpackhi_epi8(u, u);
+ h_predictor_32x8line(&v, dst, stride);
+ dst += stride << 2;
+
+ u = _mm256_unpackhi_epi8(left_col, left_col);
+
+ v = _mm256_unpacklo_epi8(u, u);
+ h_predictor_32x8line(&v, dst, stride);
+ dst += stride << 2;
+
+ v = _mm256_unpackhi_epi8(u, u);
+ h_predictor_32x8line(&v, dst, stride);
+}
+
+// -----------------------------------------------------------------------------
+// Rectangle
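+// For rectangular blocks the DC value averages width + height samples, so the
+// divisor is not a power of two; adding (width + height) / 2 before dividing
+// rounds the average to the nearest integer.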
+void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i top_sum = dc_sum_32_sse2(above);
+ __m128i left_sum = dc_sum_16_sse2(left);
+ left_sum = _mm_add_epi16(top_sum, left_sum);
+ uint16_t sum = (uint16_t)_mm_cvtsi128_si32(left_sum);
+ sum += 24;
+ sum /= 48;
+ const __m256i row = _mm256_set1_epi8((int8_t)sum);
+ row_store_32xh(&row, 16, dst, stride);
+}
+
+void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i sum_above = dc_sum_32(above);
+ __m256i sum_left = dc_sum_64(left);
+ sum_left = _mm256_add_epi16(sum_left, sum_above);
+ uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
+ sum += 48;
+ sum /= 96;
+ const __m256i row = _mm256_set1_epi8((int8_t)sum);
+ row_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i sum_above = dc_sum_64(above);
+ __m256i sum_left = dc_sum_64(left);
+ sum_left = _mm256_add_epi16(sum_left, sum_above);
+ uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
+ sum += 64;
+ sum /= 128;
+ const __m256i row = _mm256_set1_epi8((int8_t)sum);
+ row_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i sum_above = dc_sum_64(above);
+ __m256i sum_left = dc_sum_32(left);
+ sum_left = _mm256_add_epi16(sum_left, sum_above);
+ uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
+ sum += 48;
+ sum /= 96;
+ const __m256i row = _mm256_set1_epi8((int8_t)sum);
+ row_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i sum_above = dc_sum_64(above);
+ __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left));
+ sum_left = _mm256_add_epi16(sum_left, sum_above);
+ uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
+ sum += 40;
+ sum /= 80;
+ const __m256i row = _mm256_set1_epi8((int8_t)sum);
+ row_store_64xh(&row, 16, dst, stride);
+}
+
+void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_32(above);
+ (void)left;
+
+ const __m256i sixteen = _mm256_set1_epi16(16);
+ sum = _mm256_add_epi16(sum, sixteen);
+ sum = _mm256_srai_epi16(sum, 5);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_32xh(&row, 16, dst, stride);
+}
+
+void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_32(above);
+ (void)left;
+
+ const __m256i sixteen = _mm256_set1_epi16(16);
+ sum = _mm256_add_epi16(sum, sixteen);
+ sum = _mm256_srai_epi16(sum, 5);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_64(above);
+ (void)left;
+
+ const __m256i thirtytwo = _mm256_set1_epi16(32);
+ sum = _mm256_add_epi16(sum, thirtytwo);
+ sum = _mm256_srai_epi16(sum, 6);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_64(above);
+ (void)left;
+
+ const __m256i thirtytwo = _mm256_set1_epi16(32);
+ sum = _mm256_add_epi16(sum, thirtytwo);
+ sum = _mm256_srai_epi16(sum, 6);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_64(above);
+ (void)left;
+
+ const __m256i thirtytwo = _mm256_set1_epi16(32);
+ sum = _mm256_add_epi16(sum, thirtytwo);
+ sum = _mm256_srai_epi16(sum, 6);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_64xh(&row, 16, dst, stride);
+}
+
+void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i sum = dc_sum_16_sse2(left);
+ (void)above;
+
+ const __m128i eight = _mm_set1_epi16(8);
+ sum = _mm_add_epi16(sum, eight);
+ sum = _mm_srai_epi16(sum, 4);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i r = _mm_shuffle_epi8(sum, zero);
+ const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
+ row_store_32xh(&row, 16, dst, stride);
+}
+
+void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_64(left);
+ (void)above;
+
+ const __m256i thirtytwo = _mm256_set1_epi16(32);
+ sum = _mm256_add_epi16(sum, thirtytwo);
+ sum = _mm256_srai_epi16(sum, 6);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_64(left);
+ (void)above;
+
+ const __m256i thirtytwo = _mm256_set1_epi16(32);
+ sum = _mm256_add_epi16(sum, thirtytwo);
+ sum = _mm256_srai_epi16(sum, 6);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_32(left);
+ (void)above;
+
+ const __m256i sixteen = _mm256_set1_epi16(16);
+ sum = _mm256_add_epi16(sum, sixteen);
+ sum = _mm256_srai_epi16(sum, 5);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i sum = dc_sum_16_sse2(left);
+ (void)above;
+
+ const __m128i eight = _mm_set1_epi16(8);
+ sum = _mm_add_epi16(sum, eight);
+ sum = _mm_srai_epi16(sum, 4);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i r = _mm_shuffle_epi8(sum, zero);
+ const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
+ row_store_64xh(&row, 16, dst, stride);
+}
+
+void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m256i row = _mm256_set1_epi8((int8_t)0x80);
+ row_store_32xh(&row, 16, dst, stride);
+}
+
+void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m256i row = _mm256_set1_epi8((int8_t)0x80);
+ row_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m256i row = _mm256_set1_epi8((int8_t)0x80);
+ row_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m256i row = _mm256_set1_epi8((int8_t)0x80);
+ row_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m256i row = _mm256_set1_epi8((int8_t)0x80);
+ row_store_64xh(&row, 16, dst, stride);
+}
+
+void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i row = _mm256_loadu_si256((const __m256i *)above);
+ (void)left;
+ row_store_32xh(&row, 16, dst, stride);
+}
+
+void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i row = _mm256_loadu_si256((const __m256i *)above);
+ (void)left;
+ row_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
+ const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
+ (void)left;
+ row_store_32x2xh(&row0, &row1, 64, dst, stride);
+}
+
+void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
+ const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
+ (void)left;
+ row_store_32x2xh(&row0, &row1, 32, dst, stride);
+}
+
+void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
+ const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
+ (void)left;
+ row_store_32x2xh(&row0, &row1, 16, dst, stride);
+}
+
+// -----------------------------------------------------------------------------
+// PAETH_PRED
+
+// Return 16 16-bit pixels in one row (__m256i)
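+// This is a branchless SIMD form of the scalar Paeth predictor:
+//   base = left + top - topleft;
+//   return whichever of {left, top, topleft} is closest to base,
+//   breaking ties in the order left, top, topleft.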
+static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top,
+ const __m256i *topleft) {
+ const __m256i base =
+ _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft);
+
+ __m256i pl = _mm256_abs_epi16(_mm256_sub_epi16(base, *left));
+ __m256i pt = _mm256_abs_epi16(_mm256_sub_epi16(base, *top));
+ __m256i ptl = _mm256_abs_epi16(_mm256_sub_epi16(base, *topleft));
+
+ __m256i mask1 = _mm256_cmpgt_epi16(pl, pt);
+ mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl));
+ __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl);
+
+ pl = _mm256_andnot_si256(mask1, *left);
+
+ ptl = _mm256_and_si256(mask2, *topleft);
+ pt = _mm256_andnot_si256(mask2, *top);
+ pt = _mm256_or_si256(pt, ptl);
+ pt = _mm256_and_si256(mask1, pt);
+
+ return _mm256_or_si256(pt, pl);
+}
+
+// Return 16 8-bit pixels in one row (__m128i)
+static INLINE __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top,
+ const __m256i *topleft) {
+ const __m256i p0 = paeth_pred(left, top, topleft);
+ const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
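+  // p1 holds the upper 128 bits of p0, so the per-lane pack below leaves all
+  // 16 packed pixels in the low 128 bits of p.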
+ const __m256i p = _mm256_packus_epi16(p0, p1);
+ return _mm256_castsi256_si128(p);
+}
+
+static INLINE __m256i get_top_vector(const uint8_t *above) {
+ const __m128i x = _mm_load_si128((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i t0 = _mm_unpacklo_epi8(x, zero);
+ const __m128i t1 = _mm_unpackhi_epi8(x, zero);
+ return _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t1, 1);
+}
+
+void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i x = _mm_loadl_epi64((const __m128i *)left);
+ const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
+ const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
+ __m256i rep = _mm256_set1_epi16((short)0x8000);
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i top = get_top_vector(above);
+
+ int i;
+ for (i = 0; i < 8; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+}
+
+static INLINE __m256i get_left_vector(const uint8_t *left) {
+ const __m128i x = _mm_load_si128((const __m128i *)left);
+ return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
+}
+
+void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i l = get_left_vector(left);
+ const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
+ __m256i rep = _mm256_set1_epi16((short)0x8000);
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i top = get_top_vector(above);
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m256i l = get_left_vector(left);
+ const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
+ __m256i rep = _mm256_set1_epi16((short)0x8000);
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i top = get_top_vector(above);
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+
+ l = get_left_vector(left + 16);
+ rep = _mm256_set1_epi16((short)0x8000);
+ for (i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i top = get_top_vector(above);
+
+ for (int j = 0; j < 4; ++j) {
+ const __m256i l = get_left_vector(left + j * 16);
+ __m256i rep = _mm256_set1_epi16((short)0x8000);
+ for (int i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+ }
+}
+
+// Return 32 8-bit pixels in one row (__m256i)
+static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0,
+ const __m256i *top1,
+ const __m256i *topleft) {
+ __m256i p0 = paeth_pred(left, top0, topleft);
+ __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
+ const __m256i x0 = _mm256_packus_epi16(p0, p1);
+
+ p0 = paeth_pred(left, top1, topleft);
+ p1 = _mm256_permute4x64_epi64(p0, 0xe);
+ const __m256i x1 = _mm256_packus_epi16(p0, p1);
+
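+  // Concatenate the low halves: pixels 0-15 (from top0) land in the low lane
+  // and pixels 16-31 (from top1) in the high lane.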
+ return _mm256_permute2x128_si256(x0, x1, 0x20);
+}
+
+void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i l = get_left_vector(left);
+ const __m256i t0 = get_top_vector(above);
+ const __m256i t1 = get_top_vector(above + 16);
+ const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
+ __m256i rep = _mm256_set1_epi16((short)0x8000);
+ const __m256i one = _mm256_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+
+ const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl);
+
+ _mm256_storeu_si256((__m256i *)dst, r);
+
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m256i l = get_left_vector(left);
+ const __m256i t0 = get_top_vector(above);
+ const __m256i t1 = get_top_vector(above + 16);
+ const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
+ __m256i rep = _mm256_set1_epi16((short)0x8000);
+ const __m256i one = _mm256_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+
+ const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
+ const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
+
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r1);
+
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+
+ l = get_left_vector(left + 16);
+ rep = _mm256_set1_epi16((short)0x8000);
+ for (i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+
+ const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
+ const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
+
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r1);
+
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i t0 = get_top_vector(above);
+ const __m256i t1 = get_top_vector(above + 16);
+ const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
+ const __m256i one = _mm256_set1_epi16(1);
+
+ int i, j;
+ for (j = 0; j < 4; ++j) {
+ const __m256i l = get_left_vector(left + j * 16);
+ __m256i rep = _mm256_set1_epi16((short)0x8000);
+ for (i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+
+ const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
+ const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
+
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r1);
+
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+ }
+}
+
+void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i t0 = get_top_vector(above);
+ const __m256i t1 = get_top_vector(above + 16);
+ const __m256i t2 = get_top_vector(above + 32);
+ const __m256i t3 = get_top_vector(above + 48);
+ const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
+ const __m256i one = _mm256_set1_epi16(1);
+
+ int i, j;
+ for (j = 0; j < 2; ++j) {
+ const __m256i l = get_left_vector(left + j * 16);
+ __m256i rep = _mm256_set1_epi16((short)0x8000);
+ for (i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+
+ const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
+ const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
+ const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
+ const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
+
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r1);
+ _mm_store_si128((__m128i *)(dst + 32), r2);
+ _mm_store_si128((__m128i *)(dst + 48), r3);
+
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+ }
+}
+
+void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i t0 = get_top_vector(above);
+ const __m256i t1 = get_top_vector(above + 16);
+ const __m256i t2 = get_top_vector(above + 32);
+ const __m256i t3 = get_top_vector(above + 48);
+ const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
+ const __m256i one = _mm256_set1_epi16(1);
+
+ int i, j;
+ for (j = 0; j < 4; ++j) {
+ const __m256i l = get_left_vector(left + j * 16);
+ __m256i rep = _mm256_set1_epi16((short)0x8000);
+ for (i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+
+ const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
+ const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
+ const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
+ const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
+
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r1);
+ _mm_store_si128((__m128i *)(dst + 32), r2);
+ _mm_store_si128((__m128i *)(dst + 48), r3);
+
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+ }
+}
+
+void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i t0 = get_top_vector(above);
+ const __m256i t1 = get_top_vector(above + 16);
+ const __m256i t2 = get_top_vector(above + 32);
+ const __m256i t3 = get_top_vector(above + 48);
+ const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
+ const __m256i one = _mm256_set1_epi16(1);
+
+ int i;
+ const __m256i l = get_left_vector(left);
+ __m256i rep = _mm256_set1_epi16((short)0x8000);
+ for (i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+
+ const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
+ const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
+ const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
+ const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
+
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r1);
+ _mm_store_si128((__m128i *)(dst + 32), r2);
+ _mm_store_si128((__m128i *)(dst + 48), r3);
+
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+}
+
+#define PERM4x64(c0, c1, c2, c3) c0 + (c1 << 2) + (c2 << 4) + (c3 << 6)
+#define PERM2x128(c0, c1) c0 + (c1 << 4)
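+// Pack per-field selectors into the 8-bit immediates used by
+// _mm256_permute4x64_epi64 (PERM4x64) and _mm256_permute2x128_si256
+// (PERM2x128).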
+
+static AOM_FORCE_INLINE void highbd_dr_prediction_z1_4xN_internal_avx2(
+ int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
+ const int frac_bits = 6 - upsample_above;
+ const int max_base_x = ((N + 4) - 1) << upsample_above;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
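+  // Equivalently, each output pixel is the rounded 5-bit interpolation
+  //   shift = ((x << upsample_above) & 0x3f) >> 1;  // 0..31
+  //   pred = (above[base] * (32 - shift) + above[base + 1] * shift + 16) >> 5;
+  // where base = x >> frac_bits and x advances by dx per row.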
+ __m256i a0, a1, a32, a16;
+ __m256i diff, c3f;
+ __m128i a_mbase_x, max_base_x128, base_inc128, mask128;
+ __m128i a0_128, a1_128;
+ a16 = _mm256_set1_epi16(16);
+ a_mbase_x = _mm_set1_epi16(above[max_base_x]);
+ max_base_x128 = _mm_set1_epi16(max_base_x);
+ c3f = _mm256_set1_epi16(0x3f);
+
+ int x = dx;
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, shift;
+ __m128i res1;
+
+ int base = x >> frac_bits;
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ dst[i] = a_mbase_x; // save 4 values
+ }
+ return;
+ }
+
+ a0_128 = _mm_loadu_si128((__m128i *)(above + base));
+ a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1));
+
+ if (upsample_above) {
+ a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)HighbdEvenOddMaskx4[0]);
+ a1_128 = _mm_srli_si128(a0_128, 8);
+
+ base_inc128 = _mm_setr_epi16(base, base + 2, base + 4, base + 6, base + 8,
+ base + 10, base + 12, base + 14);
+ shift = _mm256_srli_epi16(
+ _mm256_and_si256(
+ _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above),
+ _mm256_set1_epi16(0x3f)),
+ 1);
+ } else {
+ base_inc128 = _mm_setr_epi16(base, base + 1, base + 2, base + 3, base + 4,
+ base + 5, base + 6, base + 7);
+ shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
+ }
+ a0 = _mm256_castsi128_si256(a0_128);
+ a1 = _mm256_castsi128_si256(a1_128);
+ diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shift);
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+ res1 = _mm256_castsi256_si128(res);
+
+ mask128 = _mm_cmpgt_epi16(max_base_x128, base_inc128);
+ dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
+ x += dx;
+ }
+}
+
+static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_4xN_internal_avx2(
+ int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
+ const int frac_bits = 6 - upsample_above;
+ const int max_base_x = ((N + 4) - 1) << upsample_above;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a1, a32, a16;
+ __m256i diff;
+ __m128i a_mbase_x, max_base_x128, base_inc128, mask128;
+
+ a16 = _mm256_set1_epi32(16);
+ a_mbase_x = _mm_set1_epi16(above[max_base_x]);
+ max_base_x128 = _mm_set1_epi32(max_base_x);
+
+ int x = dx;
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, shift;
+ __m128i res1;
+
+ int base = x >> frac_bits;
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ dst[i] = a_mbase_x; // save 4 values
+ }
+ return;
+ }
+
+ a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
+ a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
+
+ if (upsample_above) {
+ a0 = _mm256_permutevar8x32_epi32(
+ a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
+ a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
+ base_inc128 = _mm_setr_epi32(base, base + 2, base + 4, base + 6);
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(
+ _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
+ _mm256_set1_epi32(0x3f)),
+ 1);
+ } else {
+ base_inc128 = _mm_setr_epi32(base, base + 1, base + 2, base + 3);
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
+ }
+
+ diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi32(diff, shift);
+ res = _mm256_add_epi32(a32, b);
+ res = _mm256_srli_epi32(res, 5);
+
+ res1 = _mm256_castsi256_si128(res);
+ res1 = _mm_packus_epi32(res1, res1);
+
+ mask128 = _mm_cmpgt_epi32(max_base_x128, base_inc128);
+    mask128 = _mm_packs_epi32(mask128, mask128);  // narrow to 16-bit lanes
+ dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
+ x += dx;
+ }
+}
+
+static void highbd_dr_prediction_z1_4xN_avx2(int N, uint16_t *dst,
+ ptrdiff_t stride,
+ const uint16_t *above,
+ int upsample_above, int dx,
+ int bd) {
+ __m128i dstvec[16];
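+  // With 12-bit input, above[x] * 32 + 16 no longer fits in 16 bits
+  // (4095 * 32 + 16 > 65535), so the 32-bit intermediate path is used.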
+ if (bd < 12) {
+ highbd_dr_prediction_z1_4xN_internal_avx2(N, dstvec, above, upsample_above,
+ dx);
+ } else {
+ highbd_dr_prediction_32bit_z1_4xN_internal_avx2(N, dstvec, above,
+ upsample_above, dx);
+ }
+ for (int i = 0; i < N; i++) {
+ _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
+ }
+}
+
+static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_8xN_internal_avx2(
+ int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
+ const int frac_bits = 6 - upsample_above;
+ const int max_base_x = ((8 + N) - 1) << upsample_above;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a1, a0_1, a1_1, a32, a16;
+ __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
+
+ a16 = _mm256_set1_epi32(16);
+ a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
+ max_base_x256 = _mm256_set1_epi32(max_base_x);
+
+ int x = dx;
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, res1, shift;
+
+ int base = x >> frac_bits;
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ dst[i] = _mm256_castsi256_si128(a_mbase_x); // save 8 values
+ }
+ return;
+ }
+
+ a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
+ a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
+
+ if (upsample_above) {
+ a0 = _mm256_permutevar8x32_epi32(
+ a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
+ a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
+
+ a0_1 =
+ _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
+ a0_1 = _mm256_permutevar8x32_epi32(
+ a0_1, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
+ a1_1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0_1, 1));
+
+ a0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a0_1), 1);
+ a1 = _mm256_inserti128_si256(a1, _mm256_castsi256_si128(a1_1), 1);
+ base_inc256 =
+ _mm256_setr_epi32(base, base + 2, base + 4, base + 6, base + 8,
+ base + 10, base + 12, base + 14);
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(
+ _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
+ _mm256_set1_epi32(0x3f)),
+ 1);
+ } else {
+ base_inc256 = _mm256_setr_epi32(base, base + 1, base + 2, base + 3,
+ base + 4, base + 5, base + 6, base + 7);
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
+ }
+
+ diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi32(diff, shift);
+ res = _mm256_add_epi32(a32, b);
+ res = _mm256_srli_epi32(res, 5);
+
+ res1 = _mm256_packus_epi32(
+ res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
+
+ mask256 = _mm256_cmpgt_epi32(max_base_x256, base_inc256);
+    mask256 = _mm256_packs_epi32(
+        mask256, _mm256_castsi128_si256(
+                     _mm256_extracti128_si256(mask256, 1)));  // narrow to 16-bit
+ res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
+ dst[r] = _mm256_castsi256_si128(res1);
+ x += dx;
+ }
+}
+
+static AOM_FORCE_INLINE void highbd_dr_prediction_z1_8xN_internal_avx2(
+ int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
+ const int frac_bits = 6 - upsample_above;
+ const int max_base_x = ((8 + N) - 1) << upsample_above;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a1, a32, a16, c3f;
+ __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
+ __m128i a0_x128, a1_x128;
+
+ a16 = _mm256_set1_epi16(16);
+ a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
+ max_base_x256 = _mm256_set1_epi16(max_base_x);
+ c3f = _mm256_set1_epi16(0x3f);
+
+ int x = dx;
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, res1, shift;
+
+ int base = x >> frac_bits;
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ dst[i] = _mm256_castsi256_si128(a_mbase_x); // save 8 values
+ }
+ return;
+ }
+
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base));
+ if (upsample_above) {
+ __m128i mask, atmp0, atmp1, atmp2, atmp3;
+ a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 8));
+ atmp0 = _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdEvenOddMaskx[0]);
+ atmp1 = _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdEvenOddMaskx[0]);
+ atmp2 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16));
+ atmp3 =
+ _mm_shuffle_epi8(a1_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16));
+ mask =
+ _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[0], _mm_set1_epi8(15));
+ a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
+ mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[0] + 16),
+ _mm_set1_epi8(15));
+ a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
+
+ base_inc256 = _mm256_setr_epi16(base, base + 2, base + 4, base + 6,
+ base + 8, base + 10, base + 12, base + 14,
+ 0, 0, 0, 0, 0, 0, 0, 0);
+ shift = _mm256_srli_epi16(
+ _mm256_and_si256(
+ _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
+ 1);
+ } else {
+ a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 1));
+ base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
+ base + 4, base + 5, base + 6, base + 7, 0,
+ 0, 0, 0, 0, 0, 0, 0);
+ shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
+ }
+ a0 = _mm256_castsi128_si256(a0_x128);
+ a1 = _mm256_castsi128_si256(a1_x128);
+
+ diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shift);
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+
+ mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
+ res1 = _mm256_blendv_epi8(a_mbase_x, res, mask256);
+ dst[r] = _mm256_castsi256_si128(res1);
+ x += dx;
+ }
+}
+
+static void highbd_dr_prediction_z1_8xN_avx2(int N, uint16_t *dst,
+ ptrdiff_t stride,
+ const uint16_t *above,
+ int upsample_above, int dx,
+ int bd) {
+ __m128i dstvec[32];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_8xN_internal_avx2(N, dstvec, above, upsample_above,
+ dx);
+ } else {
+ highbd_dr_prediction_32bit_z1_8xN_internal_avx2(N, dstvec, above,
+ upsample_above, dx);
+ }
+ for (int i = 0; i < N; i++) {
+ _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
+ }
+}
+
+static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_16xN_internal_avx2(
+ int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
+ // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+ (void)upsample_above;
+ const int frac_bits = 6;
+ const int max_base_x = ((16 + N) - 1);
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a0_1, a1, a1_1, a32, a16;
+ __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
+
+ a16 = _mm256_set1_epi32(16);
+ a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
+ max_base_x256 = _mm256_set1_epi16(max_base_x);
+
+ int x = dx;
+ for (int r = 0; r < N; r++) {
+ __m256i b, res[2], res1;
+
+ int base = x >> frac_bits;
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ dstvec[i] = a_mbase_x; // save 16 values
+ }
+ return;
+ }
+ __m256i shift = _mm256_srli_epi32(
+ _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
+
+ a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
+ a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
+
+ diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi32(diff, shift);
+
+ res[0] = _mm256_add_epi32(a32, b);
+ res[0] = _mm256_srli_epi32(res[0], 5);
+ res[0] = _mm256_packus_epi32(
+ res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
+
+ int mdif = max_base_x - base;
+ if (mdif > 8) {
+ a0_1 =
+ _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
+ a1_1 =
+ _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 9)));
+
+ diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi32(diff, shift);
+
+ res[1] = _mm256_add_epi32(a32, b);
+ res[1] = _mm256_srli_epi32(res[1], 5);
+ res[1] = _mm256_packus_epi32(
+ res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
+ } else {
+ res[1] = a_mbase_x;
+ }
+ res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
+ 1); // 16 16bit values
+
+ base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
+ base + 4, base + 5, base + 6, base + 7,
+ base + 8, base + 9, base + 10, base + 11,
+ base + 12, base + 13, base + 14, base + 15);
+ mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
+ dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
+ x += dx;
+ }
+}
+
+static AOM_FORCE_INLINE void highbd_dr_prediction_z1_16xN_internal_avx2(
+ int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
+ // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+ (void)upsample_above;
+ const int frac_bits = 6;
+ const int max_base_x = ((16 + N) - 1);
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a1, a32, a16, c3f;
+ __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
+
+ a16 = _mm256_set1_epi16(16);
+ a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
+ max_base_x256 = _mm256_set1_epi16(max_base_x);
+ c3f = _mm256_set1_epi16(0x3f);
+
+ int x = dx;
+ for (int r = 0; r < N; r++) {
+ __m256i b, res;
+
+ int base = x >> frac_bits;
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ dstvec[i] = a_mbase_x; // save 16 values
+ }
+ return;
+ }
+ __m256i shift =
+ _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
+
+ a0 = _mm256_loadu_si256((__m256i *)(above + base));
+ a1 = _mm256_loadu_si256((__m256i *)(above + base + 1));
+
+ diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi16(diff, shift);
+
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5); // 16 16bit values
+
+ base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
+ base + 4, base + 5, base + 6, base + 7,
+ base + 8, base + 9, base + 10, base + 11,
+ base + 12, base + 13, base + 14, base + 15);
+ mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
+ dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res, mask256);
+ x += dx;
+ }
+}
+
+static void highbd_dr_prediction_z1_16xN_avx2(int N, uint16_t *dst,
+ ptrdiff_t stride,
+ const uint16_t *above,
+ int upsample_above, int dx,
+ int bd) {
+ __m256i dstvec[64];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_16xN_internal_avx2(N, dstvec, above, upsample_above,
+ dx);
+ } else {
+ highbd_dr_prediction_32bit_z1_16xN_internal_avx2(N, dstvec, above,
+ upsample_above, dx);
+ }
+ for (int i = 0; i < N; i++) {
+ _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
+ }
+}
+
+static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_32xN_internal_avx2(
+ int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
+ // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+ (void)upsample_above;
+ const int frac_bits = 6;
+ const int max_base_x = ((32 + N) - 1);
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a0_1, a1, a1_1, a32, a16, c3f;
+ __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
+
+ a16 = _mm256_set1_epi32(16);
+ a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
+ max_base_x256 = _mm256_set1_epi16(max_base_x);
+ c3f = _mm256_set1_epi16(0x3f);
+
+ int x = dx;
+ for (int r = 0; r < N; r++) {
+ __m256i b, res[2], res1;
+
+ int base = x >> frac_bits;
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ dstvec[i] = a_mbase_x; // save 32 values
+ dstvec[i + N] = a_mbase_x;
+ }
+ return;
+ }
+
+ __m256i shift =
+ _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
+
+ for (int j = 0; j < 32; j += 16) {
+ int mdif = max_base_x - (base + j);
+ if (mdif <= 0) {
+ res1 = a_mbase_x;
+ } else {
+ a0 = _mm256_cvtepu16_epi32(
+ _mm_loadu_si128((__m128i *)(above + base + j)));
+ a1 = _mm256_cvtepu16_epi32(
+ _mm_loadu_si128((__m128i *)(above + base + 1 + j)));
+
+ diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi32(diff, shift);
+
+ res[0] = _mm256_add_epi32(a32, b);
+ res[0] = _mm256_srli_epi32(res[0], 5);
+ res[0] = _mm256_packus_epi32(
+ res[0],
+ _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
+ if (mdif > 8) {
+ a0_1 = _mm256_cvtepu16_epi32(
+ _mm_loadu_si128((__m128i *)(above + base + 8 + j)));
+ a1_1 = _mm256_cvtepu16_epi32(
+ _mm_loadu_si128((__m128i *)(above + base + 9 + j)));
+
+ diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi32(diff, shift);
+
+ res[1] = _mm256_add_epi32(a32, b);
+ res[1] = _mm256_srli_epi32(res[1], 5);
+ res[1] = _mm256_packus_epi32(
+ res[1],
+ _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
+ } else {
+ res[1] = a_mbase_x;
+ }
+ res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
+ 1); // 16 16bit values
+ base_inc256 = _mm256_setr_epi16(
+ base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
+ base + j + 5, base + j + 6, base + j + 7, base + j + 8,
+ base + j + 9, base + j + 10, base + j + 11, base + j + 12,
+ base + j + 13, base + j + 14, base + j + 15);
+
+ mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
+ res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
+ }
+ if (!j) {
+ dstvec[r] = res1;
+ } else {
+ dstvec[r + N] = res1;
+ }
+ }
+ x += dx;
+ }
+}
+
+static AOM_FORCE_INLINE void highbd_dr_prediction_z1_32xN_internal_avx2(
+ int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
+ // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+ (void)upsample_above;
+ const int frac_bits = 6;
+ const int max_base_x = ((32 + N) - 1);
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a1, a32, a16, c3f;
+ __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
+
+ a16 = _mm256_set1_epi16(16);
+ a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
+ max_base_x256 = _mm256_set1_epi16(max_base_x);
+ c3f = _mm256_set1_epi16(0x3f);
+
+ int x = dx;
+ for (int r = 0; r < N; r++) {
+ __m256i b, res;
+
+ int base = x >> frac_bits;
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ dstvec[i] = a_mbase_x; // save 32 values
+ dstvec[i + N] = a_mbase_x;
+ }
+ return;
+ }
+
+ __m256i shift =
+ _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
+
+ for (int j = 0; j < 32; j += 16) {
+ int mdif = max_base_x - (base + j);
+ if (mdif <= 0) {
+ res = a_mbase_x;
+ } else {
+ a0 = _mm256_loadu_si256((__m256i *)(above + base + j));
+ a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j));
+
+ diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi16(diff, shift);
+
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+
+ base_inc256 = _mm256_setr_epi16(
+ base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
+ base + j + 5, base + j + 6, base + j + 7, base + j + 8,
+ base + j + 9, base + j + 10, base + j + 11, base + j + 12,
+ base + j + 13, base + j + 14, base + j + 15);
+
+ mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
+ res = _mm256_blendv_epi8(a_mbase_x, res, mask256);
+ }
+ if (!j) {
+ dstvec[r] = res;
+ } else {
+ dstvec[r + N] = res;
+ }
+ }
+ x += dx;
+ }
+}
+
+static void highbd_dr_prediction_z1_32xN_avx2(int N, uint16_t *dst,
+ ptrdiff_t stride,
+ const uint16_t *above,
+ int upsample_above, int dx,
+ int bd) {
+ __m256i dstvec[128];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above,
+ dx);
+ } else {
+ highbd_dr_prediction_32bit_z1_32xN_internal_avx2(N, dstvec, above,
+ upsample_above, dx);
+ }
+ for (int i = 0; i < N; i++) {
+ _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
+ _mm256_storeu_si256((__m256i *)(dst + stride * i + 16), dstvec[i + N]);
+ }
+}
+
+static void highbd_dr_prediction_32bit_z1_64xN_avx2(int N, uint16_t *dst,
+ ptrdiff_t stride,
+ const uint16_t *above,
+ int upsample_above,
+ int dx) {
+ // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+ (void)upsample_above;
+ const int frac_bits = 6;
+ const int max_base_x = ((64 + N) - 1);
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a0_1, a1, a1_1, a32, a16;
+ __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
+
+ a16 = _mm256_set1_epi32(16);
+ a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
+ max_base_x256 = _mm256_set1_epi16(max_base_x);
+
+ int x = dx;
+ for (int r = 0; r < N; r++, dst += stride) {
+ __m256i b, res[2], res1;
+
+ int base = x >> frac_bits;
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+        _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 64 values
+ _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x);
+ _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
+ _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x);
+ dst += stride;
+ }
+ return;
+ }
+
+ __m256i shift = _mm256_srli_epi32(
+ _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
+
+ __m128i a0_128, a0_1_128, a1_128, a1_1_128;
+ for (int j = 0; j < 64; j += 16) {
+ int mdif = max_base_x - (base + j);
+ if (mdif <= 0) {
+ _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x);
+ } else {
+ a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
+ a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
+ a0 = _mm256_cvtepu16_epi32(a0_128);
+ a1 = _mm256_cvtepu16_epi32(a1_128);
+
+ diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi32(diff, shift);
+
+ res[0] = _mm256_add_epi32(a32, b);
+ res[0] = _mm256_srli_epi32(res[0], 5);
+ res[0] = _mm256_packus_epi32(
+ res[0],
+ _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
+ if (mdif > 8) {
+ a0_1_128 = _mm_loadu_si128((__m128i *)(above + base + 8 + j));
+ a1_1_128 = _mm_loadu_si128((__m128i *)(above + base + 9 + j));
+ a0_1 = _mm256_cvtepu16_epi32(a0_1_128);
+ a1_1 = _mm256_cvtepu16_epi32(a1_1_128);
+
+ diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi32(diff, shift);
+
+ res[1] = _mm256_add_epi32(a32, b);
+ res[1] = _mm256_srli_epi32(res[1], 5);
+ res[1] = _mm256_packus_epi32(
+ res[1],
+ _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
+ } else {
+ res[1] = a_mbase_x;
+ }
+ res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
+ 1); // 16 16bit values
+ base_inc256 = _mm256_setr_epi16(
+ base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
+ base + j + 5, base + j + 6, base + j + 7, base + j + 8,
+ base + j + 9, base + j + 10, base + j + 11, base + j + 12,
+ base + j + 13, base + j + 14, base + j + 15);
+
+ mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
+ res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
+ _mm256_storeu_si256((__m256i *)(dst + j), res1);
+ }
+ }
+ x += dx;
+ }
+}
+
+static void highbd_dr_prediction_z1_64xN_avx2(int N, uint16_t *dst,
+ ptrdiff_t stride,
+ const uint16_t *above,
+ int upsample_above, int dx) {
+ // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+ (void)upsample_above;
+ const int frac_bits = 6;
+ const int max_base_x = ((64 + N) - 1);
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a1, a32, a16, c3f;
+ __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
+
+ a16 = _mm256_set1_epi16(16);
+ a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
+ max_base_x256 = _mm256_set1_epi16(max_base_x);
+ c3f = _mm256_set1_epi16(0x3f);
+
+ int x = dx;
+ for (int r = 0; r < N; r++, dst += stride) {
+ __m256i b, res;
+
+ int base = x >> frac_bits;
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+        _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 64 values
+ _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x);
+ _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
+ _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x);
+ dst += stride;
+ }
+ return;
+ }
+
+ __m256i shift =
+ _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
+
+ for (int j = 0; j < 64; j += 16) {
+ int mdif = max_base_x - (base + j);
+ if (mdif <= 0) {
+ _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x);
+ } else {
+ a0 = _mm256_loadu_si256((__m256i *)(above + base + j));
+ a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j));
+
+ diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi16(diff, shift);
+
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+
+ base_inc256 = _mm256_setr_epi16(
+ base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
+ base + j + 5, base + j + 6, base + j + 7, base + j + 8,
+ base + j + 9, base + j + 10, base + j + 11, base + j + 12,
+ base + j + 13, base + j + 14, base + j + 15);
+
+ mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
+ res = _mm256_blendv_epi8(a_mbase_x, res, mask256);
+ _mm256_storeu_si256((__m256i *)(dst + j), res); // 16 16bit values
+ }
+ }
+ x += dx;
+ }
+}
+
+// Directional prediction, zone 1: 0 < angle < 90
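+// A scalar sketch of what each SIMD lane in the kernels above computes
+// (illustrative only, ignoring edge upsampling):
+//   base  = ((r + 1) * dx) >> 6;
+//   shift = (((r + 1) * dx) & 0x3f) >> 1;
+//   dst[r * stride + c] =
+//       (above[base + c] * 32 + 16 +
+//        (above[base + c + 1] - above[base + c]) * shift) >> 5;
+// with samples at or beyond max_base_x replaced by above[max_base_x].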
+void av1_highbd_dr_prediction_z1_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint16_t *above,
+ const uint16_t *left, int upsample_above,
+ int dx, int dy, int bd) {
+ (void)left;
+ (void)dy;
+
+ switch (bw) {
+ case 4:
+ highbd_dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above,
+ dx, bd);
+ break;
+ case 8:
+ highbd_dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above,
+ dx, bd);
+ break;
+ case 16:
+ highbd_dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above,
+ dx, bd);
+ break;
+ case 32:
+ highbd_dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above,
+ dx, bd);
+ break;
+ case 64:
+ if (bd < 12) {
+ highbd_dr_prediction_z1_64xN_avx2(bh, dst, stride, above,
+ upsample_above, dx);
+ } else {
+ highbd_dr_prediction_32bit_z1_64xN_avx2(bh, dst, stride, above,
+ upsample_above, dx);
+ }
+ break;
+ default: break;
+ }
+}
+
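+// Transpose helpers used by the zone 3 code below: zone 3 is computed as a
+// zone 1 prediction over the left edge and then transposed into the block.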
+static void highbd_transpose_TX_16X16(const uint16_t *src, ptrdiff_t pitchSrc,
+ uint16_t *dst, ptrdiff_t pitchDst) {
+ __m256i r[16];
+ __m256i d[16];
+ for (int j = 0; j < 16; j++) {
+ r[j] = _mm256_loadu_si256((__m256i *)(src + j * pitchSrc));
+ }
+ highbd_transpose16x16_avx2(r, d);
+ for (int j = 0; j < 16; j++) {
+ _mm256_storeu_si256((__m256i *)(dst + j * pitchDst), d[j]);
+ }
+}
+
+static void highbd_transpose(const uint16_t *src, ptrdiff_t pitchSrc,
+ uint16_t *dst, ptrdiff_t pitchDst, int width,
+ int height) {
+ for (int j = 0; j < height; j += 16)
+ for (int i = 0; i < width; i += 16)
+ highbd_transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
+ dst + j * pitchDst + i, pitchDst);
+}
+
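+// Zone 2 computes two candidate predictions per row: columns whose projected
+// position still falls inside the above edge use the x (above) interpolation;
+// the leading columns that fall left of it use the y (left) interpolation,
+// and the two results are blended with HighbdBaseMask.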
+static void highbd_dr_prediction_32bit_z2_Nx4_avx2(
+ int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+ const uint16_t *left, int upsample_above, int upsample_left, int dx,
+ int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0_x, a1_x, a32, a16;
+ __m256i diff;
+ __m128i c3f, min_base_y128;
+
+ a16 = _mm256_set1_epi32(16);
+ c3f = _mm_set1_epi32(0x3f);
+ min_base_y128 = _mm_set1_epi32(min_base_y);
+
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, shift;
+ __m128i resx, resy, resxy;
+ __m128i a0_x128, a1_x128;
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ int base_shift = 0;
+ if (base_x < (min_base_x - 1)) {
+ base_shift = (min_base_x - base_x - 1) >> upsample_above;
+ }
+ int base_min_diff =
+ (min_base_x - base_x + upsample_above) >> upsample_above;
+ if (base_min_diff > 4) {
+ base_min_diff = 4;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
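+    // base_min_diff counts the leading output pixels (0..4) that must come
+    // from the left-edge (y) path; it indexes HighbdBaseMask for the final
+    // blend of resx and resy.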
+
+ if (base_shift > 3) {
+ a0_x = _mm256_setzero_si256();
+ a1_x = _mm256_setzero_si256();
+ shift = _mm256_setzero_si256();
+ } else {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ if (upsample_above) {
+ a0_x128 = _mm_shuffle_epi8(a0_x128,
+ *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
+ a1_x128 = _mm_srli_si128(a0_x128, 8);
+
+ shift = _mm256_castsi128_si256(_mm_srli_epi32(
+ _mm_and_si128(
+ _mm_slli_epi32(
+ _mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
+ (2 << 6) - y * dx, (3 << 6) - y * dx),
+ upsample_above),
+ c3f),
+ 1));
+ } else {
+ a0_x128 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+ a1_x128 = _mm_srli_si128(a0_x128, 2);
+
+ shift = _mm256_castsi128_si256(_mm_srli_epi32(
+ _mm_and_si128(_mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
+ (2 << 6) - y * dx, (3 << 6) - y * dx),
+ c3f),
+ 1));
+ }
+ a0_x = _mm256_cvtepu16_epi32(a0_x128);
+ a1_x = _mm256_cvtepu16_epi32(a1_x128);
+ }
+ // y calc
+ __m128i a0_y, a1_y, shifty;
+ if (base_x < min_base_x) {
+ __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
+ DECLARE_ALIGNED(32, int, base_y_c[4]);
+ r6 = _mm_set1_epi32(r << 6);
+ dy128 = _mm_set1_epi32(dy);
+ c1234 = _mm_setr_epi32(1, 2, 3, 4);
+ y_c128 = _mm_sub_epi32(r6, _mm_mullo_epi32(c1234, dy128));
+ base_y_c128 = _mm_srai_epi32(y_c128, frac_bits_y);
+ mask128 = _mm_cmpgt_epi32(min_base_y128, base_y_c128);
+ base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
+ _mm_store_si128((__m128i *)base_y_c, base_y_c128);
+
+ a0_y = _mm_setr_epi32(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]]);
+ a1_y = _mm_setr_epi32(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
+ left[base_y_c[2] + 1], left[base_y_c[3] + 1]);
+
+ if (upsample_left) {
+ shifty = _mm_srli_epi32(
+ _mm_and_si128(_mm_slli_epi32(y_c128, upsample_left), c3f), 1);
+ } else {
+ shifty = _mm_srli_epi32(_mm_and_si128(y_c128, c3f), 1);
+ }
+ a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
+ a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
+ shift = _mm256_inserti128_si256(shift, shifty, 1);
+ }
+
+ diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi32(diff, shift);
+ res = _mm256_add_epi32(a32, b);
+ res = _mm256_srli_epi32(res, 5);
+
+ resx = _mm256_castsi256_si128(res);
+ resx = _mm_packus_epi32(resx, resx);
+
+ resy = _mm256_extracti128_si256(res, 1);
+ resy = _mm_packus_epi32(resy, resy);
+
+ resxy =
+ _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
+ _mm_storel_epi64((__m128i *)(dst), resxy);
+ dst += stride;
+ }
+}
+
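+// 16-bit-lane variant of the function above for bd < 12; same control flow,
+// with the interpolation done in epi16 arithmetic.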
+static void highbd_dr_prediction_z2_Nx4_avx2(
+ int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+ const uint16_t *left, int upsample_above, int upsample_left, int dx,
+ int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0_x, a1_x, a32, a16;
+ __m256i diff;
+ __m128i c3f, min_base_y128;
+
+ a16 = _mm256_set1_epi16(16);
+ c3f = _mm_set1_epi16(0x3f);
+ min_base_y128 = _mm_set1_epi16(min_base_y);
+
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, shift;
+ __m128i resx, resy, resxy;
+ __m128i a0_x128, a1_x128;
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ int base_shift = 0;
+ if (base_x < (min_base_x - 1)) {
+ base_shift = (min_base_x - base_x - 1) >> upsample_above;
+ }
+ int base_min_diff =
+ (min_base_x - base_x + upsample_above) >> upsample_above;
+ if (base_min_diff > 4) {
+ base_min_diff = 4;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift > 3) {
+ a0_x = _mm256_setzero_si256();
+ a1_x = _mm256_setzero_si256();
+ shift = _mm256_setzero_si256();
+ } else {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ if (upsample_above) {
+ a0_x128 = _mm_shuffle_epi8(a0_x128,
+ *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
+ a1_x128 = _mm_srli_si128(a0_x128, 8);
+
+ shift = _mm256_castsi128_si256(_mm_srli_epi16(
+ _mm_and_si128(
+ _mm_slli_epi16(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
+ (2 << 6) - y * dx,
+ (3 << 6) - y * dx, 0, 0, 0, 0),
+ upsample_above),
+ c3f),
+ 1));
+ } else {
+ a0_x128 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+ a1_x128 = _mm_srli_si128(a0_x128, 2);
+
+ shift = _mm256_castsi128_si256(_mm_srli_epi16(
+ _mm_and_si128(
+ _mm_setr_epi16(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
+ (3 << 6) - y * dx, 0, 0, 0, 0),
+ c3f),
+ 1));
+ }
+ a0_x = _mm256_castsi128_si256(a0_x128);
+ a1_x = _mm256_castsi128_si256(a1_x128);
+ }
+ // y calc
+ __m128i a0_y, a1_y, shifty;
+ if (base_x < min_base_x) {
+ __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
+ DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
+ r6 = _mm_set1_epi16(r << 6);
+ dy128 = _mm_set1_epi16(dy);
+ c1234 = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
+ y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
+ base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
+ mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
+ base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
+ _mm_store_si128((__m128i *)base_y_c, base_y_c128);
+
+ a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
+ a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
+ left[base_y_c[2] + 1], left[base_y_c[3] + 1], 0, 0,
+ 0, 0);
+
+ if (upsample_left) {
+ shifty = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
+ } else {
+ shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
+ }
+ a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
+ a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
+ shift = _mm256_inserti128_si256(shift, shifty, 1);
+ }
+
+ diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shift);
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+
+ resx = _mm256_castsi256_si128(res);
+ resy = _mm256_extracti128_si256(res, 1);
+ resxy =
+ _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
+ _mm_storel_epi64((__m128i *)(dst), resxy);
+ dst += stride;
+ }
+}
+
+static void highbd_dr_prediction_32bit_z2_Nx8_avx2(
+ int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+ const uint16_t *left, int upsample_above, int upsample_left, int dx,
+ int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c3f, min_base_y256;
+ __m256i diff;
+ __m128i a0_x128, a1_x128;
+
+ a16 = _mm256_set1_epi32(16);
+ c3f = _mm256_set1_epi32(0x3f);
+ min_base_y256 = _mm256_set1_epi32(min_base_y);
+
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, shift;
+ __m128i resx, resy, resxy;
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ int base_shift = 0;
+ if (base_x < (min_base_x - 1)) {
+ base_shift = (min_base_x - base_x - 1) >> upsample_above;
+ }
+ int base_min_diff =
+ (min_base_x - base_x + upsample_above) >> upsample_above;
+ if (base_min_diff > 8) {
+ base_min_diff = 8;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift > 7) {
+ resx = _mm_setzero_si128();
+ } else {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ if (upsample_above) {
+ __m128i mask, atmp0, atmp1, atmp2, atmp3;
+ a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
+ atmp0 = _mm_shuffle_epi8(a0_x128,
+ *(__m128i *)HighbdEvenOddMaskx[base_shift]);
+ atmp1 = _mm_shuffle_epi8(a1_x128,
+ *(__m128i *)HighbdEvenOddMaskx[base_shift]);
+ atmp2 = _mm_shuffle_epi8(
+ a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
+ atmp3 = _mm_shuffle_epi8(
+ a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
+ mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
+ _mm_set1_epi8(15));
+ a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
+ mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
+ _mm_set1_epi8(15));
+ a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(
+ _mm256_slli_epi32(
+ _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx,
+ (2 << 6) - y * dx, (3 << 6) - y * dx,
+ (4 << 6) - y * dx, (5 << 6) - y * dx,
+ (6 << 6) - y * dx, (7 << 6) - y * dx),
+ upsample_above),
+ c3f),
+ 1);
+ } else {
+ a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
+ a0_x128 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+ a1_x128 =
+ _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(
+ _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
+ (3 << 6) - y * dx, (4 << 6) - y * dx,
+ (5 << 6) - y * dx, (6 << 6) - y * dx,
+ (7 << 6) - y * dx),
+ c3f),
+ 1);
+ }
+ a0_x = _mm256_cvtepu16_epi32(a0_x128);
+ a1_x = _mm256_cvtepu16_epi32(a1_x128);
+
+ diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi32(diff, shift);
+ res = _mm256_add_epi32(a32, b);
+ res = _mm256_srli_epi32(res, 5);
+
+ resx = _mm256_castsi256_si128(_mm256_packus_epi32(
+ res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
+ }
+ // y calc
+ if (base_x < min_base_x) {
+ DECLARE_ALIGNED(32, int, base_y_c[8]);
+ __m256i r6, c256, dy256, y_c256, base_y_c256, mask256;
+ r6 = _mm256_set1_epi32(r << 6);
+ dy256 = _mm256_set1_epi32(dy);
+ c256 = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
+ base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
+ mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
+ base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
+ _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+
+ a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
+ left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+ left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]]));
+ a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
+ left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
+ left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
+ left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
+
+ if (upsample_left) {
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(_mm256_slli_epi32((y_c256), upsample_left), c3f),
+ 1);
+ } else {
+ shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
+ }
+ diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi32(diff, shift);
+ res = _mm256_add_epi32(a32, b);
+ res = _mm256_srli_epi32(res, 5);
+
+ resy = _mm256_castsi256_si128(_mm256_packus_epi32(
+ res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
+ } else {
+ resy = resx;
+ }
+ resxy =
+ _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
+ _mm_storeu_si128((__m128i *)(dst), resxy);
+ dst += stride;
+ }
+}
+
+static void highbd_dr_prediction_z2_Nx8_avx2(
+ int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+ const uint16_t *left, int upsample_above, int upsample_left, int dx,
+ int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m128i c3f, min_base_y128;
+ __m256i a0_x, a1_x, diff, a32, a16;
+ __m128i a0_x128, a1_x128;
+
+ a16 = _mm256_set1_epi16(16);
+ c3f = _mm_set1_epi16(0x3f);
+ min_base_y128 = _mm_set1_epi16(min_base_y);
+
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, shift;
+ __m128i resx, resy, resxy;
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ int base_shift = 0;
+ if (base_x < (min_base_x - 1)) {
+ base_shift = (min_base_x - base_x - 1) >> upsample_above;
+ }
+ int base_min_diff =
+ (min_base_x - base_x + upsample_above) >> upsample_above;
+ if (base_min_diff > 8) {
+ base_min_diff = 8;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift > 7) {
+ a0_x = _mm256_setzero_si256();
+ a1_x = _mm256_setzero_si256();
+ shift = _mm256_setzero_si256();
+ } else {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ if (upsample_above) {
+ __m128i mask, atmp0, atmp1, atmp2, atmp3;
+ a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
+ atmp0 = _mm_shuffle_epi8(a0_x128,
+ *(__m128i *)HighbdEvenOddMaskx[base_shift]);
+ atmp1 = _mm_shuffle_epi8(a1_x128,
+ *(__m128i *)HighbdEvenOddMaskx[base_shift]);
+ atmp2 = _mm_shuffle_epi8(
+ a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
+ atmp3 = _mm_shuffle_epi8(
+ a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
+ mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
+ _mm_set1_epi8(15));
+ a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
+ mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
+ _mm_set1_epi8(15));
+ a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
+
+ shift = _mm256_castsi128_si256(_mm_srli_epi16(
+ _mm_and_si128(
+ _mm_slli_epi16(
+ _mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
+ (2 << 6) - y * dx, (3 << 6) - y * dx,
+ (4 << 6) - y * dx, (5 << 6) - y * dx,
+ (6 << 6) - y * dx, (7 << 6) - y * dx),
+ upsample_above),
+ c3f),
+ 1));
+ } else {
+ a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
+ a0_x128 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+ a1_x128 =
+ _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+
+ shift = _mm256_castsi128_si256(_mm_srli_epi16(
+ _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
+ (2 << 6) - y * dx, (3 << 6) - y * dx,
+ (4 << 6) - y * dx, (5 << 6) - y * dx,
+ (6 << 6) - y * dx, (7 << 6) - y * dx),
+ c3f),
+ 1));
+ }
+ a0_x = _mm256_castsi128_si256(a0_x128);
+ a1_x = _mm256_castsi128_si256(a1_x128);
+ }
+
+ // y calc
+ __m128i a0_y, a1_y, shifty;
+ if (base_x < min_base_x) {
+ DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
+ __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
+ r6 = _mm_set1_epi16(r << 6);
+ dy128 = _mm_set1_epi16(dy);
+ c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
+ base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
+ mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
+ base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
+ _mm_store_si128((__m128i *)base_y_c, base_y_c128);
+
+ a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]],
+ left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]]);
+ a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
+ left[base_y_c[2] + 1], left[base_y_c[3] + 1],
+ left[base_y_c[4] + 1], left[base_y_c[5] + 1],
+ left[base_y_c[6] + 1], left[base_y_c[7] + 1]);
+
+ if (upsample_left) {
+ shifty = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16((y_c128), upsample_left), c3f), 1);
+ } else {
+ shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
+ }
+ a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
+ a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
+ shift = _mm256_inserti128_si256(shift, shifty, 1);
+ }
+
+ diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shift);
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+
+ resx = _mm256_castsi256_si128(res);
+ resy = _mm256_extracti128_si256(res, 1);
+
+ resxy =
+ _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
+ _mm_storeu_si128((__m128i *)(dst), resxy);
+ dst += stride;
+ }
+}
+
+static void highbd_dr_prediction_32bit_z2_HxW_avx2(
+ int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+ const uint16_t *left, int upsample_above, int upsample_left, int dx,
+ int dy) {
+ // here upsample_above and upsample_left are 0 by design of
+ // av1_use_intra_edge_upsample
+ const int min_base_x = -1;
+ const int min_base_y = -1;
+ (void)upsample_above;
+ (void)upsample_left;
+ const int frac_bits_x = 6;
+ const int frac_bits_y = 6;
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0_x, a1_x, a0_y, a1_y, a32, a0_1_x, a1_1_x, a16, c1;
+ __m256i diff, min_base_y256, c3f, dy256, c1234, c0123, c8;
+ __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
+ DECLARE_ALIGNED(32, int, base_y_c[16]);
+
+ a16 = _mm256_set1_epi32(16);
+ c1 = _mm256_srli_epi32(a16, 4);
+ c8 = _mm256_srli_epi32(a16, 1);
+ min_base_y256 = _mm256_set1_epi32(min_base_y);
+ c3f = _mm256_set1_epi32(0x3f);
+ dy256 = _mm256_set1_epi32(dy);
+ c0123 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ c1234 = _mm256_add_epi32(c0123, c1);
+
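+  // Each inner-loop iteration below produces 16 output pixels, computed as
+  // two 8-lane epi32 halves and packed back down to 16-bit.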
+ for (int r = 0; r < H; r++) {
+ __m256i b, res, shift, ydx;
+ __m256i resx[2], resy[2];
+ __m256i resxy, j256, r6;
+ for (int j = 0; j < W; j += 16) {
+ j256 = _mm256_set1_epi32(j);
+ int y = r + 1;
+ ydx = _mm256_set1_epi32(y * dx);
+
+ int base_x = ((j << 6) - y * dx) >> frac_bits_x;
+ int base_shift = 0;
+ if ((base_x) < (min_base_x - 1)) {
+ base_shift = (min_base_x - base_x - 1);
+ }
+ int base_min_diff = (min_base_x - base_x);
+ if (base_min_diff > 16) {
+ base_min_diff = 16;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift > 7) {
+ resx[0] = _mm256_setzero_si256();
+ } else {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1));
+ a0_x128 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+ a1_x128 =
+ _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+
+ a0_x = _mm256_cvtepu16_epi32(a0_x128);
+ a1_x = _mm256_cvtepu16_epi32(a1_x128);
+
+ r6 = _mm256_slli_epi32(_mm256_add_epi32(c0123, j256), 6);
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1);
+
+ diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi32(diff, shift);
+ res = _mm256_add_epi32(a32, b);
+ res = _mm256_srli_epi32(res, 5);
+
+ resx[0] = _mm256_packus_epi32(
+ res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
+ }
+ int base_shift8 = 0;
+ if ((base_x + 8) < (min_base_x - 1)) {
+ base_shift8 = (min_base_x - (base_x + 8) - 1);
+ }
+ if (base_shift8 > 7) {
+ resx[1] = _mm256_setzero_si256();
+ } else {
+ a0_1_x128 =
+ _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 8));
+ a1_1_x128 =
+ _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 9));
+ a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
+ *(__m128i *)HighbdLoadMaskx[base_shift8]);
+ a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
+ *(__m128i *)HighbdLoadMaskx[base_shift8]);
+
+ a0_1_x = _mm256_cvtepu16_epi32(a0_1_x128);
+ a1_1_x = _mm256_cvtepu16_epi32(a1_1_x128);
+
+ r6 = _mm256_slli_epi32(
+ _mm256_add_epi32(c0123, _mm256_add_epi32(j256, c8)), 6);
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1);
+
+ diff = _mm256_sub_epi32(a1_1_x, a0_1_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_1_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi32(diff, shift);
+
+ resx[1] = _mm256_add_epi32(a32, b);
+ resx[1] = _mm256_srli_epi32(resx[1], 5);
+ resx[1] = _mm256_packus_epi32(
+ resx[1],
+ _mm256_castsi128_si256(_mm256_extracti128_si256(resx[1], 1)));
+ }
+ resx[0] =
+ _mm256_inserti128_si256(resx[0], _mm256_castsi256_si128(resx[1]),
+ 1); // 16 16bit values
+
+ // y calc
+ resy[0] = _mm256_setzero_si256();
+ if ((base_x < min_base_x)) {
+ __m256i c256, y_c256, y_c_1_256, base_y_c256, mask256;
+ r6 = _mm256_set1_epi32(r << 6);
+ c256 = _mm256_add_epi32(j256, c1234);
+ y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
+ base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
+ mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
+ base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
+ _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+ c256 = _mm256_add_epi32(c256, c8);
+ y_c_1_256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
+ base_y_c256 = _mm256_srai_epi32(y_c_1_256, frac_bits_y);
+ mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
+ base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
+ _mm256_store_si256((__m256i *)(base_y_c + 8), base_y_c256);
+
+ a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
+ left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+ left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]]));
+ a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
+ left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
+ left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
+ left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
+
+ shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
+
+ diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi32(diff, shift);
+ res = _mm256_add_epi32(a32, b);
+ res = _mm256_srli_epi32(res, 5);
+
+ resy[0] = _mm256_packus_epi32(
+ res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
+
+ a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
+ left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]],
+ left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]],
+ left[base_y_c[14]], left[base_y_c[15]]));
+ a1_y = _mm256_cvtepu16_epi32(
+ _mm_setr_epi16(left[base_y_c[8] + 1], left[base_y_c[9] + 1],
+ left[base_y_c[10] + 1], left[base_y_c[11] + 1],
+ left[base_y_c[12] + 1], left[base_y_c[13] + 1],
+ left[base_y_c[14] + 1], left[base_y_c[15] + 1]));
+ shift = _mm256_srli_epi32(_mm256_and_si256(y_c_1_256, c3f), 1);
+
+ diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi32(diff, shift);
+ res = _mm256_add_epi32(a32, b);
+ res = _mm256_srli_epi32(res, 5);
+
+ resy[1] = _mm256_packus_epi32(
+ res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
+
+ resy[0] =
+ _mm256_inserti128_si256(resy[0], _mm256_castsi256_si128(resy[1]),
+ 1); // 16 16bit values
+ }
+
+ resxy = _mm256_blendv_epi8(resx[0], resy[0],
+ *(__m256i *)HighbdBaseMask[base_min_diff]);
+ _mm256_storeu_si256((__m256i *)(dst + j), resxy);
+ } // for j
+ dst += stride;
+ }
+}
+
+static void highbd_dr_prediction_z2_HxW_avx2(
+ int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+ const uint16_t *left, int upsample_above, int upsample_left, int dx,
+ int dy) {
+ // here upsample_above and upsample_left are 0 by design of
+ // av1_use_intra_edge_upsample
+ const int min_base_x = -1;
+ const int min_base_y = -1;
+ (void)upsample_above;
+ (void)upsample_left;
+ const int frac_bits_x = 6;
+ const int frac_bits_y = 6;
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0_x, a1_x, a32, a16, c3f, c1;
+ __m256i diff, min_base_y256, dy256, c1234, c0123;
+ DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
+
+ a16 = _mm256_set1_epi16(16);
+ c1 = _mm256_srli_epi16(a16, 4);
+ min_base_y256 = _mm256_set1_epi16(min_base_y);
+ c3f = _mm256_set1_epi16(0x3f);
+ dy256 = _mm256_set1_epi16(dy);
+ c0123 =
+ _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ c1234 = _mm256_add_epi16(c0123, c1);
+
+ for (int r = 0; r < H; r++) {
+ __m256i b, res, shift;
+ __m256i resx, resy, ydx;
+ __m256i resxy, j256, r6;
+ __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
+ int y = r + 1;
+ ydx = _mm256_set1_epi16((short)(y * dx));
+
+ for (int j = 0; j < W; j += 16) {
+ j256 = _mm256_set1_epi16(j);
+ int base_x = ((j << 6) - y * dx) >> frac_bits_x;
+ int base_shift = 0;
+ if ((base_x) < (min_base_x - 1)) {
+        base_shift = (min_base_x - base_x - 1);
+ }
+ int base_min_diff = (min_base_x - base_x);
+ if (base_min_diff > 16) {
+ base_min_diff = 16;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift < 8) {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1));
+ a0_x128 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+ a1_x128 =
+ _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+
+ a0_x = _mm256_castsi128_si256(a0_x128);
+ a1_x = _mm256_castsi128_si256(a1_x128);
+ } else {
+ a0_x = _mm256_setzero_si256();
+ a1_x = _mm256_setzero_si256();
+ }
+
+ int base_shift1 = 0;
+ if (base_shift > 8) {
+ base_shift1 = base_shift - 8;
+ }
+ if (base_shift1 < 8) {
+ a0_1_x128 =
+ _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 8));
+ a1_1_x128 =
+ _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 9));
+ a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
+ *(__m128i *)HighbdLoadMaskx[base_shift1]);
+ a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
+ *(__m128i *)HighbdLoadMaskx[base_shift1]);
+
+ a0_x = _mm256_inserti128_si256(a0_x, a0_1_x128, 1);
+ a1_x = _mm256_inserti128_si256(a1_x, a1_1_x128, 1);
+ }
+ r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
+ shift = _mm256_srli_epi16(
+ _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
+
+ diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shift);
+ res = _mm256_add_epi16(a32, b);
+ resx = _mm256_srli_epi16(res, 5); // 16 16-bit values
+
+ // y calc
+ resy = _mm256_setzero_si256();
+ __m256i a0_y, a1_y, shifty;
+ if ((base_x < min_base_x)) {
+ __m256i c256, y_c256, base_y_c256, mask256, mul16;
+ r6 = _mm256_set1_epi16(r << 6);
+ c256 = _mm256_add_epi16(j256, c1234);
+ mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
+ _mm256_srli_epi16(min_base_y256, 1));
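+        // The unsigned product c * dy is clamped to 0x7fff so the signed
+        // subtraction below cannot wrap; indices that still fall below
+        // min_base_y are then zeroed through the mask.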
+ y_c256 = _mm256_sub_epi16(r6, mul16);
+ base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
+ mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
+ base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
+ _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+
+ a0_y = _mm256_setr_epi16(
+ left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+ left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
+ left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
+ left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
+ left[base_y_c[15]]);
+ base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
+ _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+
+ a1_y = _mm256_setr_epi16(
+ left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+ left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
+ left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
+ left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
+ left[base_y_c[15]]);
+
+ shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
+
+ diff = _mm256_sub_epi16(a1_y, a0_y); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0_y, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shifty);
+ res = _mm256_add_epi16(a32, b);
+ resy = _mm256_srli_epi16(res, 5);
+ }
+
+ resxy = _mm256_blendv_epi8(resx, resy,
+ *(__m256i *)HighbdBaseMask[base_min_diff]);
+ _mm256_storeu_si256((__m256i *)(dst + j), resxy);
+ } // for j
+ dst += stride;
+ }
+}
+
+// Directional prediction, zone 2: 90 < angle < 180
+void av1_highbd_dr_prediction_z2_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint16_t *above,
+ const uint16_t *left, int upsample_above,
+ int upsample_left, int dx, int dy,
+ int bd) {
+ (void)bd;
+ assert(dx > 0);
+ assert(dy > 0);
+ switch (bw) {
+ case 4:
+ if (bd < 12) {
+ highbd_dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left,
+ upsample_above, upsample_left, dx, dy);
+ } else {
+ highbd_dr_prediction_32bit_z2_Nx4_avx2(bh, dst, stride, above, left,
+ upsample_above, upsample_left,
+ dx, dy);
+ }
+ break;
+ case 8:
+ if (bd < 12) {
+ highbd_dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left,
+ upsample_above, upsample_left, dx, dy);
+ } else {
+ highbd_dr_prediction_32bit_z2_Nx8_avx2(bh, dst, stride, above, left,
+ upsample_above, upsample_left,
+ dx, dy);
+ }
+ break;
+ default:
+ if (bd < 12) {
+ highbd_dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
+ upsample_above, upsample_left, dx, dy);
+ } else {
+ highbd_dr_prediction_32bit_z2_HxW_avx2(bh, bw, dst, stride, above, left,
+ upsample_above, upsample_left,
+ dx, dy);
+ }
+ break;
+ }
+}
+
+// Directional prediction, zone 3 functions
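+// Zone 3 (180 < angle < 270) predicts purely from the left edge: each block
+// is generated as a zone 1 prediction over `left` and transposed into dst,
+// either directly from the per-row vectors or via a temporary buffer.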
+static void highbd_dr_prediction_z3_4x4_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ __m128i dstvec[4], d[4];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_4xN_internal_avx2(4, dstvec, left, upsample_left,
+ dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_4xN_internal_avx2(4, dstvec, left,
+ upsample_left, dy);
+ }
+ highbd_transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2],
+ &dstvec[3], &d[0], &d[1], &d[2], &d[3]);
+ _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
+ _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
+ _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
+ _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
+}
+
+static void highbd_dr_prediction_z3_8x8_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ __m128i dstvec[8], d[8];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_8xN_internal_avx2(8, dstvec, left, upsample_left,
+ dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_8xN_internal_avx2(8, dstvec, left,
+ upsample_left, dy);
+ }
+ highbd_transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
+ &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
+ &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
+ &d[7]);
+ for (int i = 0; i < 8; i++) {
+ _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+ }
+}
+
+static void highbd_dr_prediction_z3_4x8_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ __m128i dstvec[4], d[8];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_8xN_internal_avx2(4, dstvec, left, upsample_left,
+ dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_8xN_internal_avx2(4, dstvec, left,
+ upsample_left, dy);
+ }
+
+ highbd_transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
+ &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
+ &d[7]);
+ for (int i = 0; i < 8; i++) {
+ _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
+ }
+}
+
+static void highbd_dr_prediction_z3_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ __m128i dstvec[8], d[4];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_4xN_internal_avx2(8, dstvec, left, upsample_left,
+ dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_4xN_internal_avx2(8, dstvec, left,
+ upsample_left, dy);
+ }
+
+ highbd_transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
+ &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
+ &d[0], &d[1], &d[2], &d[3]);
+ _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
+ _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[1]);
+ _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[2]);
+ _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[3]);
+}
+
+static void highbd_dr_prediction_z3_8x16_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ __m256i dstvec[8], d[8];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left,
+ dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_16xN_internal_avx2(8, dstvec, left,
+ upsample_left, dy);
+ }
+ highbd_transpose8x16_16x8_avx2(dstvec, d);
+ for (int i = 0; i < 8; i++) {
+ _mm_storeu_si128((__m128i *)(dst + i * stride),
+ _mm256_castsi256_si128(d[i]));
+ }
+ for (int i = 8; i < 16; i++) {
+ _mm_storeu_si128((__m128i *)(dst + i * stride),
+ _mm256_extracti128_si256(d[i - 8], 1));
+ }
+}
+
+static void highbd_dr_prediction_z3_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ __m128i dstvec[16], d[16];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_8xN_internal_avx2(16, dstvec, left, upsample_left,
+ dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_8xN_internal_avx2(16, dstvec, left,
+ upsample_left, dy);
+ }
+ for (int i = 0; i < 16; i += 8) {
+ highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i],
+ &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i],
+ &dstvec[6 + i], &dstvec[7 + i], &d[0 + i],
+ &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i],
+ &d[5 + i], &d[6 + i], &d[7 + i]);
+ }
+ for (int i = 0; i < 8; i++) {
+ _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+ _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]);
+ }
+}
+
+static void highbd_dr_prediction_z3_4x16_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ __m256i dstvec[4], d[4], d1;
+ if (bd < 12) {
+ highbd_dr_prediction_z1_16xN_internal_avx2(4, dstvec, left, upsample_left,
+ dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_16xN_internal_avx2(4, dstvec, left,
+ upsample_left, dy);
+ }
+ highbd_transpose4x16_avx2(dstvec, d);
+ for (int i = 0; i < 4; i++) {
+ _mm_storel_epi64((__m128i *)(dst + i * stride),
+ _mm256_castsi256_si128(d[i]));
+ d1 = _mm256_bsrli_epi128(d[i], 8);
+ _mm_storel_epi64((__m128i *)(dst + (i + 4) * stride),
+ _mm256_castsi256_si128(d1));
+ _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
+ _mm256_extracti128_si256(d[i], 1));
+ _mm_storel_epi64((__m128i *)(dst + (i + 12) * stride),
+ _mm256_extracti128_si256(d1, 1));
+ }
+}
+
+static void highbd_dr_prediction_z3_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ __m128i dstvec[16], d[8];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_4xN_internal_avx2(16, dstvec, left, upsample_left,
+ dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_4xN_internal_avx2(16, dstvec, left,
+ upsample_left, dy);
+ }
+ highbd_transpose16x4_8x8_sse2(dstvec, d);
+
+ _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
+ _mm_storeu_si128((__m128i *)(dst + 0 * stride + 8), d[1]);
+ _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[2]);
+ _mm_storeu_si128((__m128i *)(dst + 1 * stride + 8), d[3]);
+ _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[4]);
+ _mm_storeu_si128((__m128i *)(dst + 2 * stride + 8), d[5]);
+ _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[6]);
+ _mm_storeu_si128((__m128i *)(dst + 3 * stride + 8), d[7]);
+}
+
+static void highbd_dr_prediction_z3_8x32_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ __m256i dstvec[16], d[16];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left,
+ dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_32xN_internal_avx2(8, dstvec, left,
+ upsample_left, dy);
+ }
+
+ for (int i = 0; i < 16; i += 8) {
+ highbd_transpose8x16_16x8_avx2(dstvec + i, d + i);
+ }
+
+ for (int i = 0; i < 8; i++) {
+ _mm_storeu_si128((__m128i *)(dst + i * stride),
+ _mm256_castsi256_si128(d[i]));
+ }
+ for (int i = 0; i < 8; i++) {
+ _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride),
+ _mm256_extracti128_si256(d[i], 1));
+ }
+ for (int i = 8; i < 16; i++) {
+ _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride),
+ _mm256_castsi256_si128(d[i]));
+ }
+ for (int i = 8; i < 16; i++) {
+ _mm_storeu_si128((__m128i *)(dst + (i + 16) * stride),
+ _mm256_extracti128_si256(d[i], 1));
+ }
+}
+
+static void highbd_dr_prediction_z3_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ __m128i dstvec[32], d[32];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_8xN_internal_avx2(32, dstvec, left, upsample_left,
+ dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_8xN_internal_avx2(32, dstvec, left,
+ upsample_left, dy);
+ }
+
+ for (int i = 0; i < 32; i += 8) {
+ highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i],
+ &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i],
+ &dstvec[6 + i], &dstvec[7 + i], &d[0 + i],
+ &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i],
+ &d[5 + i], &d[6 + i], &d[7 + i]);
+ }
+ for (int i = 0; i < 8; i++) {
+ _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+ _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]);
+ _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 16]);
+ _mm_storeu_si128((__m128i *)(dst + i * stride + 24), d[i + 24]);
+ }
+}
+
+static void highbd_dr_prediction_z3_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ __m256i dstvec[16], d[16];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_16xN_internal_avx2(16, dstvec, left, upsample_left,
+ dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_16xN_internal_avx2(16, dstvec, left,
+ upsample_left, dy);
+ }
+
+ highbd_transpose16x16_avx2(dstvec, d);
+
+ for (int i = 0; i < 16; i++) {
+ _mm256_storeu_si256((__m256i *)(dst + i * stride), d[i]);
+ }
+}
+
+static void highbd_dr_prediction_z3_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ __m256i dstvec[64], d[16];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left,
+ dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_32xN_internal_avx2(32, dstvec, left,
+ upsample_left, dy);
+ }
+ highbd_transpose16x16_avx2(dstvec, d);
+ for (int j = 0; j < 16; j++) {
+ _mm256_storeu_si256((__m256i *)(dst + j * stride), d[j]);
+ }
+ highbd_transpose16x16_avx2(dstvec + 16, d);
+ for (int j = 0; j < 16; j++) {
+ _mm256_storeu_si256((__m256i *)(dst + j * stride + 16), d[j]);
+ }
+ highbd_transpose16x16_avx2(dstvec + 32, d);
+ for (int j = 0; j < 16; j++) {
+ _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride), d[j]);
+ }
+ highbd_transpose16x16_avx2(dstvec + 48, d);
+ for (int j = 0; j < 16; j++) {
+ _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride + 16), d[j]);
+ }
+}
+
+static void highbd_dr_prediction_z3_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ DECLARE_ALIGNED(16, uint16_t, dstT[64 * 64]);
+ if (bd < 12) {
+ highbd_dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_64xN_avx2(64, dstT, 64, left, upsample_left,
+ dy);
+ }
+ highbd_transpose(dstT, 64, dst, stride, 64, 64);
+}
+
+static void highbd_dr_prediction_z3_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ __m256i dstvec[32], d[32];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left,
+ dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_32xN_internal_avx2(16, dstvec, left,
+ upsample_left, dy);
+ }
+ for (int i = 0; i < 32; i += 8) {
+ highbd_transpose8x16_16x8_avx2(dstvec + i, d + i);
+ }
+ // store
+ for (int j = 0; j < 32; j += 16) {
+ for (int i = 0; i < 8; i++) {
+ _mm_storeu_si128((__m128i *)(dst + (i + j) * stride),
+ _mm256_castsi256_si128(d[(i + j)]));
+ }
+ for (int i = 0; i < 8; i++) {
+ _mm_storeu_si128((__m128i *)(dst + (i + j) * stride + 8),
+ _mm256_castsi256_si128(d[(i + j) + 8]));
+ }
+ for (int i = 8; i < 16; i++) {
+ _mm256_storeu_si256(
+ (__m256i *)(dst + (i + j) * stride),
+ _mm256_inserti128_si256(
+ d[(i + j)], _mm256_extracti128_si256(d[(i + j) - 8], 1), 0));
+ }
+ }
+}
+
+static void highbd_dr_prediction_z3_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ __m256i dstvec[32], d[16];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_16xN_internal_avx2(32, dstvec, left, upsample_left,
+ dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_16xN_internal_avx2(32, dstvec, left,
+ upsample_left, dy);
+ }
+ for (int i = 0; i < 32; i += 16) {
+ highbd_transpose16x16_avx2((dstvec + i), d);
+ for (int j = 0; j < 16; j++) {
+ _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]);
+ }
+ }
+}
+
+static void highbd_dr_prediction_z3_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ uint16_t dstT[64 * 32];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_64xN_avx2(32, dstT, 64, left, upsample_left,
+ dy);
+ }
+ highbd_transpose(dstT, 64, dst, stride, 32, 64);
+}
+
+static void highbd_dr_prediction_z3_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ DECLARE_ALIGNED(16, uint16_t, dstT[32 * 64]);
+ highbd_dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy, bd);
+ highbd_transpose(dstT, 32, dst, stride, 64, 32);
+}
+
+static void highbd_dr_prediction_z3_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ DECLARE_ALIGNED(16, uint16_t, dstT[64 * 16]);
+ if (bd < 12) {
+ highbd_dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_64xN_avx2(16, dstT, 64, left, upsample_left,
+ dy);
+ }
+ highbd_transpose(dstT, 64, dst, stride, 16, 64);
+}
+
+static void highbd_dr_prediction_z3_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ __m256i dstvec[64], d[16];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_16xN_internal_avx2(64, dstvec, left, upsample_left,
+ dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_16xN_internal_avx2(64, dstvec, left,
+ upsample_left, dy);
+ }
+ for (int i = 0; i < 64; i += 16) {
+ highbd_transpose16x16_avx2((dstvec + i), d);
+ for (int j = 0; j < 16; j++) {
+ _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]);
+ }
+ }
+}
+
+void av1_highbd_dr_prediction_z3_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint16_t *above,
+ const uint16_t *left, int upsample_left,
+ int dx, int dy, int bd) {
+ (void)above;
+ (void)dx;
+
+ assert(dx == 1);
+ assert(dy > 0);
+ if (bw == bh) {
+ switch (bw) {
+ case 4:
+ highbd_dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy,
+ bd);
+ break;
+ case 8:
+ highbd_dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy,
+ bd);
+ break;
+ case 16:
+ highbd_dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy,
+ bd);
+ break;
+ case 32:
+ highbd_dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy,
+ bd);
+ break;
+ case 64:
+ highbd_dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy,
+ bd);
+ break;
+ }
+ } else {
+ if (bw < bh) {
+ if (bw + bw == bh) {
+ switch (bw) {
+ case 4:
+ highbd_dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left,
+ dy, bd);
+ break;
+ case 8:
+ highbd_dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left,
+ dy, bd);
+ break;
+ case 16:
+ highbd_dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left,
+ dy, bd);
+ break;
+ case 32:
+ highbd_dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left,
+ dy, bd);
+ break;
+ }
+ } else {
+ switch (bw) {
+ case 4:
+ highbd_dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left,
+ dy, bd);
+ break;
+ case 8:
+ highbd_dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left,
+ dy, bd);
+ break;
+ case 16:
+ highbd_dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left,
+ dy, bd);
+ break;
+ }
+ }
+ } else {
+ if (bh + bh == bw) {
+ switch (bh) {
+ case 4:
+ highbd_dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left,
+ dy, bd);
+ break;
+ case 8:
+ highbd_dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left,
+ dy, bd);
+ break;
+ case 16:
+ highbd_dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left,
+ dy, bd);
+ break;
+ case 32:
+ highbd_dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left,
+ dy, bd);
+ break;
+ }
+ } else {
+ switch (bh) {
+ case 4:
+ highbd_dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left,
+ dy, bd);
+ break;
+ case 8:
+ highbd_dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left,
+ dy, bd);
+ break;
+ case 16:
+ highbd_dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left,
+ dy, bd);
+ break;
+ }
+ }
+ }
+ }
+}
+
+// Low bit depth functions
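+// BaseMask[i] has its first i bytes set to 0xff; blending with it keeps i
+// interpolated pixels and replicates above[max_base_x] into the remainder.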
+static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+};
+
+/* clang-format on */
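+
+// Editor's note (illustrative, not part of the upstream file): each BaseMask[i]
+// is a prefix mask whose first i bytes are 0xff and whose remaining bytes are
+// 0x00. The z1/z2 kernels below feed it to _mm_blendv_epi8/_mm256_blendv_epi8
+// so that the first i lanes keep the freshly interpolated pixels while the
+// rest are padded with the replicated above[max_base_x] edge sample. An
+// equivalent runtime construction would be:
+//   uint8_t base_mask[33][32];
+//   for (int i = 0; i <= 32; ++i)
+//     for (int j = 0; j < 32; ++j) base_mask[i][j] = (j < i) ? 0xff : 0x00;
+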
+static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_avx2(
+ int H, int W, __m128i *dst, const uint8_t *above, int upsample_above,
+ int dx) {
+ const int frac_bits = 6 - upsample_above;
+ const int max_base_x = ((W + H) - 1) << upsample_above;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a1, a32, a16;
+ __m256i diff, c3f;
+ __m128i a_mbase_x;
+
+ a16 = _mm256_set1_epi16(16);
+ a_mbase_x = _mm_set1_epi8((int8_t)above[max_base_x]);
+ c3f = _mm256_set1_epi16(0x3f);
+
+ int x = dx;
+ for (int r = 0; r < W; r++) {
+ __m256i b, res, shift;
+ __m128i res1, a0_128, a1_128;
+
+ int base = x >> frac_bits;
+ int base_max_diff = (max_base_x - base) >> upsample_above;
+ if (base_max_diff <= 0) {
+ for (int i = r; i < W; ++i) {
+        dst[i] = a_mbase_x;  // save H values (4, 8 or 16, depending on caller)
+ }
+ return;
+ }
+ if (base_max_diff > H) base_max_diff = H;
+ a0_128 = _mm_loadu_si128((__m128i *)(above + base));
+ a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1));
+
+ if (upsample_above) {
+ a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)EvenOddMaskx[0]);
+ a1_128 = _mm_srli_si128(a0_128, 8);
+
+ shift = _mm256_srli_epi16(
+ _mm256_and_si256(
+ _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
+ 1);
+ } else {
+ shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
+ }
+ a0 = _mm256_cvtepu8_epi16(a0_128);
+ a1 = _mm256_cvtepu8_epi16(a1_128);
+
+ diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shift);
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+
+ res = _mm256_packus_epi16(
+ res, _mm256_castsi128_si256(
+                 _mm256_extracti128_si256(res, 1)));  // narrow to 8 bit
+ res1 = _mm256_castsi256_si128(res); // 16 8bit values
+
+ dst[r] =
+ _mm_blendv_epi8(a_mbase_x, res1, *(__m128i *)BaseMask[base_max_diff]);
+ x += dx;
+ }
+}
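+
+// Editor's note: the vector kernel above is easier to check against a scalar
+// paraphrase of the formula in its comment. The sketch below is illustrative
+// only: it is not upstream code, it assumes upsample_above == 0 (the common
+// case), and its names are hypothetical. It is guarded out so the file
+// compiles unchanged.
+#if 0
+static void dr_z1_scalar_sketch(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                                const uint8_t *above, int dx) {
+  const int max_base_x = bw + bh - 1;
+  int x = dx;
+  for (int r = 0; r < bh; ++r, dst += stride, x += dx) {
+    const int base = x >> 6;            // integer sample position in above[]
+    const int shift = (x & 0x3f) >> 1;  // 5-bit fractional weight
+    for (int c = 0; c < bw; ++c) {
+      const int p = base + c;
+      dst[c] = (p < max_base_x)
+                   ? (uint8_t)((above[p] * 32 + 16 +
+                                (above[p + 1] - above[p]) * shift) >>
+                               5)
+                   : above[max_base_x];  // clamp to the last edge sample
+    }
+  }
+}
+#endif  // illustrative sketch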
+
+static void dr_prediction_z1_4xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int upsample_above,
+ int dx) {
+ __m128i dstvec[16];
+
+ dr_prediction_z1_HxW_internal_avx2(4, N, dstvec, above, upsample_above, dx);
+ for (int i = 0; i < N; i++) {
+ *(int *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]);
+ }
+}
+
+static void dr_prediction_z1_8xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int upsample_above,
+ int dx) {
+ __m128i dstvec[32];
+
+ dr_prediction_z1_HxW_internal_avx2(8, N, dstvec, above, upsample_above, dx);
+ for (int i = 0; i < N; i++) {
+ _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
+ }
+}
+
+static void dr_prediction_z1_16xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int upsample_above,
+ int dx) {
+ __m128i dstvec[64];
+
+ dr_prediction_z1_HxW_internal_avx2(16, N, dstvec, above, upsample_above, dx);
+ for (int i = 0; i < N; i++) {
+ _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
+ }
+}
+
+static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_avx2(
+ int N, __m256i *dstvec, const uint8_t *above, int upsample_above, int dx) {
+ // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+ (void)upsample_above;
+ const int frac_bits = 6;
+ const int max_base_x = ((32 + N) - 1);
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a1, a32, a16;
+ __m256i a_mbase_x, diff, c3f;
+
+ a16 = _mm256_set1_epi16(16);
+ a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]);
+ c3f = _mm256_set1_epi16(0x3f);
+
+ int x = dx;
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, res16[2];
+ __m128i a0_128, a1_128;
+
+ int base = x >> frac_bits;
+ int base_max_diff = (max_base_x - base);
+ if (base_max_diff <= 0) {
+ for (int i = r; i < N; ++i) {
+ dstvec[i] = a_mbase_x; // save 32 values
+ }
+ return;
+ }
+ if (base_max_diff > 32) base_max_diff = 32;
+ __m256i shift =
+ _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
+
+ for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
+ int mdiff = base_max_diff - j;
+ if (mdiff <= 0) {
+ res16[jj] = a_mbase_x;
+ } else {
+ a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
+ a1_128 = _mm_loadu_si128((__m128i *)(above + base + j + 1));
+ a0 = _mm256_cvtepu8_epi16(a0_128);
+ a1 = _mm256_cvtepu8_epi16(a1_128);
+
+ diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi16(diff, shift);
+
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+ res16[jj] = _mm256_packus_epi16(
+ res, _mm256_castsi128_si256(
+ _mm256_extracti128_si256(res, 1))); // 16 8bit values
+ }
+ }
+ res16[1] =
+ _mm256_inserti128_si256(res16[0], _mm256_castsi256_si128(res16[1]),
+ 1); // 32 8bit values
+
+ dstvec[r] = _mm256_blendv_epi8(
+ a_mbase_x, res16[1],
+ *(__m256i *)BaseMask[base_max_diff]); // 32 8bit values
+ x += dx;
+ }
+}
+
+static void dr_prediction_z1_32xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int upsample_above,
+ int dx) {
+ __m256i dstvec[64];
+ dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, dx);
+ for (int i = 0; i < N; i++) {
+ _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
+ }
+}
+
+static void dr_prediction_z1_64xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int upsample_above,
+ int dx) {
+ // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+ (void)upsample_above;
+ const int frac_bits = 6;
+ const int max_base_x = ((64 + N) - 1);
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a1, a32, a16;
+ __m256i a_mbase_x, diff, c3f;
+ __m128i max_base_x128, base_inc128, mask128;
+
+ a16 = _mm256_set1_epi16(16);
+ a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]);
+ max_base_x128 = _mm_set1_epi8(max_base_x);
+ c3f = _mm256_set1_epi16(0x3f);
+
+ int x = dx;
+ for (int r = 0; r < N; r++, dst += stride) {
+ __m256i b, res;
+ int base = x >> frac_bits;
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 32 values
+ _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
+ dst += stride;
+ }
+ return;
+ }
+
+ __m256i shift =
+ _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
+
+ __m128i a0_128, a1_128, res128;
+ for (int j = 0; j < 64; j += 16) {
+ int mdif = max_base_x - (base + j);
+ if (mdif <= 0) {
+ _mm_storeu_si128((__m128i *)(dst + j),
+ _mm256_castsi256_si128(a_mbase_x));
+ } else {
+ a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
+ a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
+ a0 = _mm256_cvtepu8_epi16(a0_128);
+ a1 = _mm256_cvtepu8_epi16(a1_128);
+
+ diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi16(diff, shift);
+
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+ res = _mm256_packus_epi16(
+ res, _mm256_castsi128_si256(
+ _mm256_extracti128_si256(res, 1))); // 16 8bit values
+
+ base_inc128 =
+ _mm_setr_epi8((int8_t)(base + j), (int8_t)(base + j + 1),
+ (int8_t)(base + j + 2), (int8_t)(base + j + 3),
+ (int8_t)(base + j + 4), (int8_t)(base + j + 5),
+ (int8_t)(base + j + 6), (int8_t)(base + j + 7),
+ (int8_t)(base + j + 8), (int8_t)(base + j + 9),
+ (int8_t)(base + j + 10), (int8_t)(base + j + 11),
+ (int8_t)(base + j + 12), (int8_t)(base + j + 13),
+ (int8_t)(base + j + 14), (int8_t)(base + j + 15));
+
+ mask128 = _mm_cmpgt_epi8(_mm_subs_epu8(max_base_x128, base_inc128),
+ _mm_setzero_si128());
+ res128 = _mm_blendv_epi8(_mm256_castsi256_si128(a_mbase_x),
+ _mm256_castsi256_si128(res), mask128);
+ _mm_storeu_si128((__m128i *)(dst + j), res128);
+ }
+ }
+ x += dx;
+ }
+}
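+
+// Editor's note: the 64-wide variant above cannot use the 32-byte BaseMask
+// table, so it derives the valid-lane mask arithmetically: _mm_subs_epu8
+// saturates at zero, so (max_base_x - index) is nonzero exactly for lanes
+// whose source index is still below max_base_x, and the signed compare
+// against zero turns that into a byte mask (safe here because max_base_x is
+// at most 64 + 64 - 1 = 127). A scalar paraphrase, illustrative only:
+//   for (int lane = 0; lane < 16; ++lane) {
+//     const int idx = base + j + lane;
+//     mask[lane] = (idx < max_base_x) ? 0xff : 0x00;  // keep pixel vs. pad
+//   }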
+
+// Directional prediction, zone 1: 0 < angle < 90
+void av1_dr_prediction_z1_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int dx, int dy) {
+ (void)left;
+ (void)dy;
+ switch (bw) {
+ case 4:
+ dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above, dx);
+ break;
+ case 8:
+ dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above, dx);
+ break;
+ case 16:
+ dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above, dx);
+ break;
+ case 32:
+ dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above, dx);
+ break;
+ case 64:
+ dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above, dx);
+ break;
+ default: break;
+ }
+ return;
+}
+
+static void dr_prediction_z2_Nx4_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int upsample_left,
+ int dx, int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m128i a0_x, a1_x, a32, a16, diff;
+ __m128i c3f, min_base_y128, c1234, dy128;
+
+ a16 = _mm_set1_epi16(16);
+ c3f = _mm_set1_epi16(0x3f);
+ min_base_y128 = _mm_set1_epi16(min_base_y);
+ c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0);
+ dy128 = _mm_set1_epi16(dy);
+
+ for (int r = 0; r < N; r++) {
+ __m128i b, res, shift, r6, ydx;
+ __m128i resx, resy, resxy;
+ __m128i a0_x128, a1_x128;
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ int base_shift = 0;
+ if (base_x < (min_base_x - 1)) {
+ base_shift = (min_base_x - base_x - 1) >> upsample_above;
+ }
+ int base_min_diff =
+ (min_base_x - base_x + upsample_above) >> upsample_above;
+ if (base_min_diff > 4) {
+ base_min_diff = 4;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift > 3) {
+ a0_x = _mm_setzero_si128();
+ a1_x = _mm_setzero_si128();
+ shift = _mm_setzero_si128();
+ } else {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ ydx = _mm_set1_epi16(y * dx);
+ r6 = _mm_slli_epi16(c1234, 6);
+
+ if (upsample_above) {
+ a0_x128 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
+ a1_x128 = _mm_srli_si128(a0_x128, 8);
+
+ shift = _mm_srli_epi16(
+ _mm_and_si128(
+ _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
+ 1);
+ } else {
+ a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
+ a1_x128 = _mm_srli_si128(a0_x128, 1);
+
+ shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
+ }
+ a0_x = _mm_cvtepu8_epi16(a0_x128);
+ a1_x = _mm_cvtepu8_epi16(a1_x128);
+ }
+ // y calc
+ __m128i a0_y, a1_y, shifty;
+ if (base_x < min_base_x) {
+ DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
+ __m128i y_c128, base_y_c128, mask128, c1234_;
+ c1234_ = _mm_srli_si128(c1234, 2);
+ r6 = _mm_set1_epi16(r << 6);
+ y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy128));
+ base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
+ mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
+ base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
+ _mm_store_si128((__m128i *)base_y_c, base_y_c128);
+
+ a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
+ base_y_c128 = _mm_add_epi16(base_y_c128, _mm_srli_epi16(a16, 4));
+ _mm_store_si128((__m128i *)base_y_c, base_y_c128);
+ a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
+
+ if (upsample_left) {
+ shifty = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
+ } else {
+ shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
+ }
+ a0_x = _mm_unpacklo_epi64(a0_x, a0_y);
+ a1_x = _mm_unpacklo_epi64(a1_x, a1_y);
+ shift = _mm_unpacklo_epi64(shift, shifty);
+ }
+
+ diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shift);
+ res = _mm_add_epi16(a32, b);
+ res = _mm_srli_epi16(res, 5);
+
+ resx = _mm_packus_epi16(res, res);
+ resy = _mm_srli_si128(resx, 4);
+
+ resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
+ *(int *)(dst) = _mm_cvtsi128_si32(resxy);
+ dst += stride;
+ }
+}
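+
+// Editor's note: the three z2 variants (Nx4 above, Nx8 and HxW below) follow
+// the same per-pixel rule, which is easier to see in scalar form. The sketch
+// below is illustrative only: it is not upstream code, it assumes no edge
+// upsampling, its names are hypothetical, and it relies on AV1's edge layout
+// in which above[-1] and left[-1] both hold the top-left sample. It is
+// guarded out so the file compiles unchanged.
+#if 0
+static void dr_z2_scalar_sketch(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+                                const uint8_t *above, const uint8_t *left,
+                                int dx, int dy) {
+  for (int r = 0; r < bh; ++r, dst += stride) {
+    for (int c = 0; c < bw; ++c) {
+      const int x = (c << 6) - (r + 1) * dx;  // projection onto the top edge
+      int val;
+      if ((x >> 6) >= -1) {  // still reaches the above[] samples
+        const int base = x >> 6;
+        const int shift = (x & 0x3f) >> 1;
+        val = (above[base] * 32 + 16 +
+               (above[base + 1] - above[base]) * shift) >>
+              5;
+      } else {  // otherwise fall back to the left edge
+        const int y = (r << 6) - (c + 1) * dy;
+        const int base = y >> 6;
+        const int shift = (y & 0x3f) >> 1;
+        val = (left[base] * 32 + 16 + (left[base + 1] - left[base]) * shift) >>
+              5;
+      }
+      dst[c] = (uint8_t)val;
+    }
+  }
+}
+#endif  // illustrative sketch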
+
+static void dr_prediction_z2_Nx8_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int upsample_left,
+ int dx, int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i diff, a32, a16;
+ __m256i a0_x, a1_x;
+ __m128i a0_x128, a1_x128, min_base_y128, c3f;
+ __m128i c1234, dy128;
+
+ a16 = _mm256_set1_epi16(16);
+ c3f = _mm_set1_epi16(0x3f);
+ min_base_y128 = _mm_set1_epi16(min_base_y);
+ dy128 = _mm_set1_epi16(dy);
+ c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, shift;
+ __m128i resx, resy, resxy, r6, ydx;
+
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ int base_shift = 0;
+ if (base_x < (min_base_x - 1)) {
+ base_shift = (min_base_x - base_x - 1) >> upsample_above;
+ }
+ int base_min_diff =
+ (min_base_x - base_x + upsample_above) >> upsample_above;
+ if (base_min_diff > 8) {
+ base_min_diff = 8;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift > 7) {
+ a0_x = _mm256_setzero_si256();
+ a1_x = _mm256_setzero_si256();
+ shift = _mm256_setzero_si256();
+ } else {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ ydx = _mm_set1_epi16(y * dx);
+ r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6);
+ if (upsample_above) {
+ a0_x128 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
+ a1_x128 = _mm_srli_si128(a0_x128, 8);
+
+ shift = _mm256_castsi128_si256(_mm_srli_epi16(
+ _mm_and_si128(
+ _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
+ 1));
+ } else {
+ a1_x128 = _mm_srli_si128(a0_x128, 1);
+ a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
+ a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
+
+ shift = _mm256_castsi128_si256(
+ _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1));
+ }
+ a0_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a0_x128));
+ a1_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a1_x128));
+ }
+
+ // y calc
+ __m128i a0_y, a1_y, shifty;
+ if (base_x < min_base_x) {
+ DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
+ __m128i y_c128, base_y_c128, mask128;
+ r6 = _mm_set1_epi16(r << 6);
+ y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
+ base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
+ mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
+ base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
+ _mm_store_si128((__m128i *)base_y_c, base_y_c128);
+
+ a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]],
+ left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]]);
+ base_y_c128 = _mm_add_epi16(
+ base_y_c128, _mm_srli_epi16(_mm256_castsi256_si128(a16), 4));
+ _mm_store_si128((__m128i *)base_y_c, base_y_c128);
+
+ a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]],
+ left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]]);
+
+ if (upsample_left) {
+ shifty = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
+ } else {
+ shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
+ }
+
+ a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
+ a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
+ shift = _mm256_inserti128_si256(shift, shifty, 1);
+ }
+
+ diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shift);
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+
+ resx = _mm_packus_epi16(_mm256_castsi256_si128(res),
+ _mm256_castsi256_si128(res));
+ resy = _mm256_extracti128_si256(res, 1);
+ resy = _mm_packus_epi16(resy, resy);
+
+ resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
+ _mm_storel_epi64((__m128i *)(dst), resxy);
+ dst += stride;
+ }
+}
+
+static void dr_prediction_z2_HxW_avx2(int H, int W, uint8_t *dst,
+ ptrdiff_t stride, const uint8_t *above,
+ const uint8_t *left, int upsample_above,
+ int upsample_left, int dx, int dy) {
+ // here upsample_above and upsample_left are 0 by design of
+ // av1_use_intra_edge_upsample
+ const int min_base_x = -1;
+ const int min_base_y = -1;
+ (void)upsample_above;
+ (void)upsample_left;
+ const int frac_bits_x = 6;
+ const int frac_bits_y = 6;
+
+ __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c1234, c0123;
+ __m256i diff, min_base_y256, c3f, shifty, dy256, c1;
+ __m128i a0_x128, a1_x128;
+
+ DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
+ a16 = _mm256_set1_epi16(16);
+ c1 = _mm256_srli_epi16(a16, 4);
+ min_base_y256 = _mm256_set1_epi16(min_base_y);
+ c3f = _mm256_set1_epi16(0x3f);
+ dy256 = _mm256_set1_epi16(dy);
+ c0123 =
+ _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ c1234 = _mm256_add_epi16(c0123, c1);
+
+ for (int r = 0; r < H; r++) {
+ __m256i b, res, shift, j256, r6, ydx;
+ __m128i resx, resy;
+ __m128i resxy;
+ int y = r + 1;
+ ydx = _mm256_set1_epi16((int16_t)(y * dx));
+
+ int base_x = (-y * dx) >> frac_bits_x;
+ for (int j = 0; j < W; j += 16) {
+ j256 = _mm256_set1_epi16(j);
+ int base_shift = 0;
+ if ((base_x + j) < (min_base_x - 1)) {
+ base_shift = (min_base_x - (base_x + j) - 1);
+ }
+ int base_min_diff = (min_base_x - base_x - j);
+ if (base_min_diff > 16) {
+ base_min_diff = 16;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift < 16) {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
+ a1_x128 =
+ _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
+ a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
+ a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
+
+ a0_x = _mm256_cvtepu8_epi16(a0_x128);
+ a1_x = _mm256_cvtepu8_epi16(a1_x128);
+
+ r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
+ shift = _mm256_srli_epi16(
+ _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
+
+ diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shift);
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5); // 16 16-bit values
+ resx = _mm256_castsi256_si128(_mm256_packus_epi16(
+ res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
+ } else {
+ resx = _mm_setzero_si128();
+ }
+
+ // y calc
+ if (base_x < min_base_x) {
+ __m256i c256, y_c256, base_y_c256, mask256, mul16;
+ r6 = _mm256_set1_epi16(r << 6);
+ c256 = _mm256_add_epi16(j256, c1234);
+ mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
+ _mm256_srli_epi16(min_base_y256, 1));
+ y_c256 = _mm256_sub_epi16(r6, mul16);
+
+ base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
+ mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
+
+ base_y_c256 = _mm256_blendv_epi8(base_y_c256, min_base_y256, mask256);
+ int16_t min_y = (int16_t)_mm_extract_epi16(
+ _mm256_extracti128_si256(base_y_c256, 1), 7);
+ int16_t max_y =
+ (int16_t)_mm_extract_epi16(_mm256_castsi256_si128(base_y_c256), 0);
+ int16_t offset_diff = max_y - min_y;
+
+ if (offset_diff < 16) {
+ __m256i min_y256 = _mm256_set1_epi16(min_y);
+
+ __m256i base_y_offset = _mm256_sub_epi16(base_y_c256, min_y256);
+ __m128i base_y_offset128 =
+ _mm_packs_epi16(_mm256_extracti128_si256(base_y_offset, 0),
+ _mm256_extracti128_si256(base_y_offset, 1));
+
+ __m128i a0_y128 = _mm_maskload_epi32(
+ (int *)(left + min_y), *(__m128i *)LoadMaskz2[offset_diff / 4]);
+ __m128i a1_y128 =
+ _mm_maskload_epi32((int *)(left + min_y + 1),
+ *(__m128i *)LoadMaskz2[offset_diff / 4]);
+ a0_y128 = _mm_shuffle_epi8(a0_y128, base_y_offset128);
+ a1_y128 = _mm_shuffle_epi8(a1_y128, base_y_offset128);
+ a0_y = _mm256_cvtepu8_epi16(a0_y128);
+ a1_y = _mm256_cvtepu8_epi16(a1_y128);
+ } else {
+ base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
+ _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+
+ a0_y = _mm256_setr_epi16(
+ left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+ left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
+ left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
+ left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
+ left[base_y_c[15]]);
+ base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
+ _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+
+ a1_y = _mm256_setr_epi16(
+ left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+ left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
+ left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
+ left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
+ left[base_y_c[15]]);
+ }
+ shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
+
+ diff = _mm256_sub_epi16(a1_y, a0_y); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0_y, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shifty);
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5); // 16 16-bit values
+ resy = _mm256_castsi256_si128(_mm256_packus_epi16(
+ res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
+ } else {
+ resy = _mm_setzero_si128();
+ }
+ resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
+ _mm_storeu_si128((__m128i *)(dst + j), resxy);
+ } // for j
+ dst += stride;
+ }
+}
+
+// Directional prediction, zone 2: 90 < angle < 180
+void av1_dr_prediction_z2_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int upsample_left, int dx,
+ int dy) {
+ assert(dx > 0);
+ assert(dy > 0);
+ switch (bw) {
+ case 4:
+ dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, upsample_above,
+ upsample_left, dx, dy);
+ break;
+ case 8:
+ dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, upsample_above,
+ upsample_left, dx, dy);
+ break;
+ default:
+ dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
+ upsample_above, upsample_left, dx, dy);
+ break;
+ }
+ return;
+}
+
+// z3 functions
+static INLINE void transpose16x32_avx2(__m256i *x, __m256i *d) {
+ __m256i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
+ __m256i w10, w11, w12, w13, w14, w15;
+
+ w0 = _mm256_unpacklo_epi8(x[0], x[1]);
+ w1 = _mm256_unpacklo_epi8(x[2], x[3]);
+ w2 = _mm256_unpacklo_epi8(x[4], x[5]);
+ w3 = _mm256_unpacklo_epi8(x[6], x[7]);
+
+ w8 = _mm256_unpacklo_epi8(x[8], x[9]);
+ w9 = _mm256_unpacklo_epi8(x[10], x[11]);
+ w10 = _mm256_unpacklo_epi8(x[12], x[13]);
+ w11 = _mm256_unpacklo_epi8(x[14], x[15]);
+
+ w4 = _mm256_unpacklo_epi16(w0, w1);
+ w5 = _mm256_unpacklo_epi16(w2, w3);
+ w12 = _mm256_unpacklo_epi16(w8, w9);
+ w13 = _mm256_unpacklo_epi16(w10, w11);
+
+ w6 = _mm256_unpacklo_epi32(w4, w5);
+ w7 = _mm256_unpackhi_epi32(w4, w5);
+ w14 = _mm256_unpacklo_epi32(w12, w13);
+ w15 = _mm256_unpackhi_epi32(w12, w13);
+
+ // Store first 4-line result
+ d[0] = _mm256_unpacklo_epi64(w6, w14);
+ d[1] = _mm256_unpackhi_epi64(w6, w14);
+ d[2] = _mm256_unpacklo_epi64(w7, w15);
+ d[3] = _mm256_unpackhi_epi64(w7, w15);
+
+ w4 = _mm256_unpackhi_epi16(w0, w1);
+ w5 = _mm256_unpackhi_epi16(w2, w3);
+ w12 = _mm256_unpackhi_epi16(w8, w9);
+ w13 = _mm256_unpackhi_epi16(w10, w11);
+
+ w6 = _mm256_unpacklo_epi32(w4, w5);
+ w7 = _mm256_unpackhi_epi32(w4, w5);
+ w14 = _mm256_unpacklo_epi32(w12, w13);
+ w15 = _mm256_unpackhi_epi32(w12, w13);
+
+ // Store second 4-line result
+ d[4] = _mm256_unpacklo_epi64(w6, w14);
+ d[5] = _mm256_unpackhi_epi64(w6, w14);
+ d[6] = _mm256_unpacklo_epi64(w7, w15);
+ d[7] = _mm256_unpackhi_epi64(w7, w15);
+
+ // upper half
+ w0 = _mm256_unpackhi_epi8(x[0], x[1]);
+ w1 = _mm256_unpackhi_epi8(x[2], x[3]);
+ w2 = _mm256_unpackhi_epi8(x[4], x[5]);
+ w3 = _mm256_unpackhi_epi8(x[6], x[7]);
+
+ w8 = _mm256_unpackhi_epi8(x[8], x[9]);
+ w9 = _mm256_unpackhi_epi8(x[10], x[11]);
+ w10 = _mm256_unpackhi_epi8(x[12], x[13]);
+ w11 = _mm256_unpackhi_epi8(x[14], x[15]);
+
+ w4 = _mm256_unpacklo_epi16(w0, w1);
+ w5 = _mm256_unpacklo_epi16(w2, w3);
+ w12 = _mm256_unpacklo_epi16(w8, w9);
+ w13 = _mm256_unpacklo_epi16(w10, w11);
+
+ w6 = _mm256_unpacklo_epi32(w4, w5);
+ w7 = _mm256_unpackhi_epi32(w4, w5);
+ w14 = _mm256_unpacklo_epi32(w12, w13);
+ w15 = _mm256_unpackhi_epi32(w12, w13);
+
+  // Store third 4-line result
+ d[8] = _mm256_unpacklo_epi64(w6, w14);
+ d[9] = _mm256_unpackhi_epi64(w6, w14);
+ d[10] = _mm256_unpacklo_epi64(w7, w15);
+ d[11] = _mm256_unpackhi_epi64(w7, w15);
+
+ w4 = _mm256_unpackhi_epi16(w0, w1);
+ w5 = _mm256_unpackhi_epi16(w2, w3);
+ w12 = _mm256_unpackhi_epi16(w8, w9);
+ w13 = _mm256_unpackhi_epi16(w10, w11);
+
+ w6 = _mm256_unpacklo_epi32(w4, w5);
+ w7 = _mm256_unpackhi_epi32(w4, w5);
+ w14 = _mm256_unpacklo_epi32(w12, w13);
+ w15 = _mm256_unpackhi_epi32(w12, w13);
+
+  // Store fourth 4-line result
+ d[12] = _mm256_unpacklo_epi64(w6, w14);
+ d[13] = _mm256_unpackhi_epi64(w6, w14);
+ d[14] = _mm256_unpacklo_epi64(w7, w15);
+ d[15] = _mm256_unpackhi_epi64(w7, w15);
+}
+
+static void dr_prediction_z3_4x4_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[4], d[4];
+
+ dr_prediction_z1_HxW_internal_avx2(4, 4, dstvec, left, upsample_left, dy);
+ transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
+ &d[0], &d[1], &d[2], &d[3]);
+
+ *(int *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]);
+ *(int *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]);
+ *(int *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]);
+ *(int *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]);
+ return;
+}
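+
+// Editor's note: zone 3 (180 < angle < 270) predicts purely from the left
+// edge. Rather than adding a second interpolation kernel, every z3 block size
+// here reuses the z1 kernel with `left` in place of `above` and dy in place
+// of dx, then transposes the result: conceptually
+//   z3_pred[r][c] == z1_over_left[c][r]
+// A sketch of that relation, illustrative only and reusing the hypothetical
+// dr_z1_scalar_sketch from earlier, guarded out:
+#if 0
+  uint8_t tmp[64 * 64];
+  dr_z1_scalar_sketch(tmp, /*stride=*/bh, /*bw=*/bh, /*bh=*/bw, left, dy);
+  for (int r = 0; r < bh; ++r)
+    for (int c = 0; c < bw; ++c) dst[r * stride + c] = tmp[c * bh + r];
+#endif  // illustrative sketch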
+
+static void dr_prediction_z3_8x8_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[8], d[8];
+
+ dr_prediction_z1_HxW_internal_avx2(8, 8, dstvec, left, upsample_left, dy);
+ transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4],
+ &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2],
+ &d[3]);
+
+ _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
+ _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8));
+ _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]);
+ _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8));
+ _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]);
+ _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8));
+ _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]);
+ _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8));
+}
+
+static void dr_prediction_z3_4x8_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[4], d[8];
+
+ dr_prediction_z1_HxW_internal_avx2(8, 4, dstvec, left, upsample_left, dy);
+ transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0],
+ &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
+ for (int i = 0; i < 8; i++) {
+ *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
+ }
+}
+
+static void dr_prediction_z3_8x4_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[8], d[4];
+
+ dr_prediction_z1_HxW_internal_avx2(4, 8, dstvec, left, upsample_left, dy);
+ transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
+ &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0],
+ &d[1], &d[2], &d[3]);
+ _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
+ _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
+ _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
+ _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
+}
+
+static void dr_prediction_z3_8x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[8], d[8];
+
+ dr_prediction_z1_HxW_internal_avx2(16, 8, dstvec, left, upsample_left, dy);
+ transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3,
+ dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d,
+ d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7);
+ for (int i = 0; i < 8; i++) {
+ _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
+ _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
+ _mm_srli_si128(d[i], 8));
+ }
+}
+
+static void dr_prediction_z3_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[16], d[16];
+
+ dr_prediction_z1_HxW_internal_avx2(8, 16, dstvec, left, upsample_left, dy);
+ transpose16x8_8x16_sse2(
+ &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
+ &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
+ &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
+ &d[3], &d[4], &d[5], &d[6], &d[7]);
+
+ for (int i = 0; i < 8; i++) {
+ _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+ }
+}
+
+static void dr_prediction_z3_4x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[4], d[16];
+
+ dr_prediction_z1_HxW_internal_avx2(16, 4, dstvec, left, upsample_left, dy);
+ transpose4x16_sse2(dstvec, d);
+ for (int i = 0; i < 16; i++) {
+ *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
+ }
+}
+
+static void dr_prediction_z3_16x4_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[16], d[8];
+
+ dr_prediction_z1_HxW_internal_avx2(4, 16, dstvec, left, upsample_left, dy);
+ for (int i = 4; i < 8; i++) {
+ d[i] = _mm_setzero_si128();
+ }
+ transpose16x8_8x16_sse2(
+ &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
+ &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
+ &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
+ &d[3], &d[4], &d[5], &d[6], &d[7]);
+
+ for (int i = 0; i < 4; i++) {
+ _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+ }
+}
+
+static void dr_prediction_z3_8x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m256i dstvec[16], d[16];
+
+ dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, dy);
+ for (int i = 8; i < 16; i++) {
+ dstvec[i] = _mm256_setzero_si256();
+ }
+ transpose16x32_avx2(dstvec, d);
+
+ for (int i = 0; i < 16; i++) {
+ _mm_storel_epi64((__m128i *)(dst + i * stride),
+ _mm256_castsi256_si128(d[i]));
+ }
+ for (int i = 0; i < 16; i++) {
+ _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride),
+ _mm256_extracti128_si256(d[i], 1));
+ }
+}
+
+static void dr_prediction_z3_32x8_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[32], d[16];
+
+ dr_prediction_z1_HxW_internal_avx2(8, 32, dstvec, left, upsample_left, dy);
+
+ transpose16x8_8x16_sse2(
+ &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
+ &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
+ &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
+ &d[3], &d[4], &d[5], &d[6], &d[7]);
+ transpose16x8_8x16_sse2(
+ &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16],
+ &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16],
+ &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16],
+ &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16],
+ &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8],
+ &d[6 + 8], &d[7 + 8]);
+
+ for (int i = 0; i < 8; i++) {
+ _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+ _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]);
+ }
+}
+
+static void dr_prediction_z3_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[16], d[16];
+
+ dr_prediction_z1_HxW_internal_avx2(16, 16, dstvec, left, upsample_left, dy);
+ transpose16x16_sse2(dstvec, d);
+
+ for (int i = 0; i < 16; i++) {
+ _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+ }
+}
+
+static void dr_prediction_z3_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m256i dstvec[32], d[32];
+
+ dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, dy);
+ transpose16x32_avx2(dstvec, d);
+ transpose16x32_avx2(dstvec + 16, d + 16);
+ for (int j = 0; j < 16; j++) {
+ _mm_storeu_si128((__m128i *)(dst + j * stride),
+ _mm256_castsi256_si128(d[j]));
+ _mm_storeu_si128((__m128i *)(dst + j * stride + 16),
+ _mm256_castsi256_si128(d[j + 16]));
+ }
+ for (int j = 0; j < 16; j++) {
+ _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
+ _mm256_extracti128_si256(d[j], 1));
+ _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16),
+ _mm256_extracti128_si256(d[j + 16], 1));
+ }
+}
+
+static void dr_prediction_z3_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);
+ dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
+ transpose(dstT, 64, dst, stride, 64, 64);
+}
+
+static void dr_prediction_z3_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m256i dstvec[16], d[16];
+
+ dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, dy);
+ transpose16x32_avx2(dstvec, d);
+ // store
+ for (int j = 0; j < 16; j++) {
+ _mm_storeu_si128((__m128i *)(dst + j * stride),
+ _mm256_castsi256_si128(d[j]));
+ _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
+ _mm256_extracti128_si256(d[j], 1));
+ }
+}
+
+static void dr_prediction_z3_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[32], d[16];
+
+ dr_prediction_z1_HxW_internal_avx2(16, 32, dstvec, left, upsample_left, dy);
+ for (int i = 0; i < 32; i += 16) {
+ transpose16x16_sse2((dstvec + i), d);
+ for (int j = 0; j < 16; j++) {
+ _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
+ }
+ }
+}
+
+static void dr_prediction_z3_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8_t dstT[64 * 32];
+ dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
+ transpose(dstT, 64, dst, stride, 32, 64);
+}
+
+static void dr_prediction_z3_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8_t dstT[32 * 64];
+ dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy);
+ transpose(dstT, 32, dst, stride, 64, 32);
+ return;
+}
+
+static void dr_prediction_z3_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8_t dstT[64 * 16];
+ dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
+ transpose(dstT, 64, dst, stride, 16, 64);
+}
+
+static void dr_prediction_z3_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[64], d[16];
+
+ dr_prediction_z1_HxW_internal_avx2(16, 64, dstvec, left, upsample_left, dy);
+ for (int i = 0; i < 64; i += 16) {
+ transpose16x16_sse2((dstvec + i), d);
+ for (int j = 0; j < 16; j++) {
+ _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
+ }
+ }
+}
+
+void av1_dr_prediction_z3_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_left, int dx, int dy) {
+ (void)above;
+ (void)dx;
+ assert(dx == 1);
+ assert(dy > 0);
+
+ if (bw == bh) {
+ switch (bw) {
+ case 4:
+ dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 8:
+ dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 16:
+ dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 32:
+ dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 64:
+ dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ }
+ } else {
+ if (bw < bh) {
+ if (bw + bw == bh) {
+ switch (bw) {
+ case 4:
+ dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 8:
+ dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 16:
+ dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 32:
+ dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ }
+ } else {
+ switch (bw) {
+ case 4:
+ dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 8:
+ dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 16:
+ dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ }
+ }
+ } else {
+ if (bh + bh == bw) {
+ switch (bh) {
+ case 4:
+ dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 8:
+ dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 16:
+ dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 32:
+ dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ }
+ } else {
+ switch (bh) {
+ case 4:
+ dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 8:
+ dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 16:
+ dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2.c b/third_party/aom/aom_dsp/x86/intrapred_sse2.c
new file mode 100644
index 0000000000..61e29731c4
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/intrapred_sse2.c
@@ -0,0 +1,1411 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include "aom_dsp/x86/intrapred_x86.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void dc_store_4xh(uint32_t dc, int height, uint8_t *dst,
+ ptrdiff_t stride) {
+ for (int i = 0; i < height; i += 2) {
+ *(uint32_t *)dst = dc;
+ dst += stride;
+ *(uint32_t *)dst = dc;
+ dst += stride;
+ }
+}
+
+static INLINE void dc_store_8xh(const __m128i *row, int height, uint8_t *dst,
+ ptrdiff_t stride) {
+ int i;
+ for (i = 0; i < height; ++i) {
+ _mm_storel_epi64((__m128i *)dst, *row);
+ dst += stride;
+ }
+}
+
+static INLINE void dc_store_16xh(const __m128i *row, int height, uint8_t *dst,
+ ptrdiff_t stride) {
+ int i;
+ for (i = 0; i < height; ++i) {
+ _mm_store_si128((__m128i *)dst, *row);
+ dst += stride;
+ }
+}
+
+static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst,
+ ptrdiff_t stride) {
+ int i;
+ for (i = 0; i < height; ++i) {
+ _mm_store_si128((__m128i *)dst, *row);
+ _mm_store_si128((__m128i *)(dst + 16), *row);
+ dst += stride;
+ }
+}
+
+static INLINE void dc_store_64xh(const __m128i *row, int height, uint8_t *dst,
+ ptrdiff_t stride) {
+ for (int i = 0; i < height; ++i) {
+ _mm_store_si128((__m128i *)dst, *row);
+ _mm_store_si128((__m128i *)(dst + 16), *row);
+ _mm_store_si128((__m128i *)(dst + 32), *row);
+ _mm_store_si128((__m128i *)(dst + 48), *row);
+ dst += stride;
+ }
+}
+
+static INLINE __m128i dc_sum_4(const uint8_t *ref) {
+ __m128i x = _mm_loadl_epi64((__m128i const *)ref);
+ const __m128i zero = _mm_setzero_si128();
+ x = _mm_unpacklo_epi8(x, zero);
+ return _mm_sad_epu8(x, zero);
+}
+
+static INLINE __m128i dc_sum_8(const uint8_t *ref) {
+ __m128i x = _mm_loadl_epi64((__m128i const *)ref);
+ const __m128i zero = _mm_setzero_si128();
+ return _mm_sad_epu8(x, zero);
+}
+
+static INLINE __m128i dc_sum_64(const uint8_t *ref) {
+ __m128i x0 = _mm_load_si128((__m128i const *)ref);
+ __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
+ __m128i x2 = _mm_load_si128((__m128i const *)(ref + 32));
+ __m128i x3 = _mm_load_si128((__m128i const *)(ref + 48));
+ const __m128i zero = _mm_setzero_si128();
+ x0 = _mm_sad_epu8(x0, zero);
+ x1 = _mm_sad_epu8(x1, zero);
+ x2 = _mm_sad_epu8(x2, zero);
+ x3 = _mm_sad_epu8(x3, zero);
+ x0 = _mm_add_epi16(x0, x1);
+ x2 = _mm_add_epi16(x2, x3);
+ x0 = _mm_add_epi16(x0, x2);
+ const __m128i high = _mm_unpackhi_epi64(x0, x0);
+ return _mm_add_epi16(x0, high);
+}
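+
+// Editor's note: the dc_sum_* helpers rely on _mm_sad_epu8 against zero, which
+// reduces each 8-byte half of the register to a 16-bit sum in the low word of
+// the corresponding 64-bit lane; dc_sum_64 then folds the four partial sums.
+// Scalar equivalent of dc_sum_8 (illustrative only, not upstream code):
+//   uint32_t s = 0;
+//   for (int i = 0; i < 8; ++i) s += ref[i];  // == |ref[i] - 0| summed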
+
+#define DC_MULTIPLIER_1X2 0x5556
+#define DC_MULTIPLIER_1X4 0x3334
+
+#define DC_SHIFT2 16
+
+static INLINE int divide_using_multiply_shift(int num, int shift1,
+ int multiplier) {
+ const int interm = num >> shift1;
+ return interm * multiplier >> DC_SHIFT2;
+}
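+
+// Editor's note (worked example, not upstream text): rectangular blocks
+// average w + h edge samples, and w + h is not a power of two, so the exact
+// division is split into a power-of-two shift plus a 16-bit reciprocal
+// multiply. For instance:
+//   4x8  -> 12 samples: ((sum + 6)  >> 2) * 0x5556 >> 16 == (sum + 6)  / 12
+//           (0x5556 ~ 2^16 / 3: the shift divides by 4, the multiply by 3)
+//   4x16 -> 20 samples: ((sum + 10) >> 2) * 0x3334 >> 16 == (sum + 10) / 20
+//           (0x3334 ~ 2^16 / 5)
+// A quick self-check sketch (hypothetical, would need <assert.h>, guarded out):
+#if 0
+static void dc_multiplier_self_check(void) {
+  for (int s = 0; s <= 12 * 255; ++s)
+    assert(divide_using_multiply_shift(s + 6, 2, DC_MULTIPLIER_1X2) ==
+           (s + 6) / 12);
+  for (int s = 0; s <= 20 * 255; ++s)
+    assert(divide_using_multiply_shift(s + 10, 2, DC_MULTIPLIER_1X4) ==
+           (s + 10) / 20);
+}
+#endif  // illustrative sketch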
+
+// -----------------------------------------------------------------------------
+// DC_PRED
+
+void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i sum_left = dc_sum_8(left);
+ __m128i sum_above = dc_sum_4(above);
+ sum_above = _mm_add_epi16(sum_left, sum_above);
+
+ uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ sum += 6;
+ sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
+
+ const __m128i row = _mm_set1_epi8((int8_t)sum);
+ const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row);
+ dc_store_4xh(pred, 8, dst, stride);
+}
+
+void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i sum_left = dc_sum_16_sse2(left);
+ __m128i sum_above = dc_sum_4(above);
+ sum_above = _mm_add_epi16(sum_left, sum_above);
+
+ uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ sum += 10;
+ sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
+
+ const __m128i row = _mm_set1_epi8((int8_t)sum);
+ const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row);
+ dc_store_4xh(pred, 16, dst, stride);
+}
+
+void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i sum_left = dc_sum_4(left);
+ __m128i sum_above = dc_sum_8(above);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ sum += 6;
+ sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
+
+ const __m128i row = _mm_set1_epi8((int8_t)sum);
+ dc_store_8xh(&row, 4, dst, stride);
+}
+
+void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i sum_left = dc_sum_16_sse2(left);
+ __m128i sum_above = dc_sum_8(above);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ sum += 12;
+ sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
+ const __m128i row = _mm_set1_epi8((int8_t)sum);
+ dc_store_8xh(&row, 16, dst, stride);
+}
+
+void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i sum_left = dc_sum_32_sse2(left);
+ __m128i sum_above = dc_sum_8(above);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ sum += 20;
+ sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
+ const __m128i row = _mm_set1_epi8((int8_t)sum);
+ dc_store_8xh(&row, 32, dst, stride);
+}
+
+void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i sum_left = dc_sum_4(left);
+ __m128i sum_above = dc_sum_16_sse2(above);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ sum += 10;
+ sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
+ const __m128i row = _mm_set1_epi8((int8_t)sum);
+ dc_store_16xh(&row, 4, dst, stride);
+}
+
+void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i sum_left = dc_sum_8(left);
+ __m128i sum_above = dc_sum_16_sse2(above);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ sum += 12;
+ sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
+ const __m128i row = _mm_set1_epi8((int8_t)sum);
+ dc_store_16xh(&row, 8, dst, stride);
+}
+
+void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i sum_left = dc_sum_32_sse2(left);
+ __m128i sum_above = dc_sum_16_sse2(above);
+ sum_above = _mm_add_epi16(sum_left, sum_above);
+
+ uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ sum += 24;
+ sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
+ const __m128i row = _mm_set1_epi8((int8_t)sum);
+ dc_store_16xh(&row, 32, dst, stride);
+}
+
+void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i sum_left = dc_sum_64(left);
+ __m128i sum_above = dc_sum_16_sse2(above);
+ sum_above = _mm_add_epi16(sum_left, sum_above);
+
+ uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ sum += 40;
+ sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
+ const __m128i row = _mm_set1_epi8((int8_t)sum);
+ dc_store_16xh(&row, 64, dst, stride);
+}
+
+void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i sum_above = dc_sum_32_sse2(above);
+ const __m128i sum_left = dc_sum_8(left);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ sum += 20;
+ sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
+ const __m128i row = _mm_set1_epi8((int8_t)sum);
+ dc_store_32xh(&row, 8, dst, stride);
+}
+
+void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i sum_above = dc_sum_32_sse2(above);
+ const __m128i sum_left = dc_sum_16_sse2(left);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ sum += 24;
+ sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
+ const __m128i row = _mm_set1_epi8((int8_t)sum);
+ dc_store_32xh(&row, 16, dst, stride);
+}
+
+void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i sum_above = dc_sum_32_sse2(above);
+ const __m128i sum_left = dc_sum_64(left);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ sum += 48;
+ sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
+ const __m128i row = _mm_set1_epi8((int8_t)sum);
+ dc_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i sum_above = dc_sum_64(above);
+ const __m128i sum_left = dc_sum_64(left);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ sum += 64;
+ sum /= 128;
+ const __m128i row = _mm_set1_epi8((int8_t)sum);
+ dc_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i sum_above = dc_sum_64(above);
+ const __m128i sum_left = dc_sum_32_sse2(left);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ sum += 48;
+ sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
+ const __m128i row = _mm_set1_epi8((int8_t)sum);
+ dc_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i sum_above = dc_sum_64(above);
+ const __m128i sum_left = dc_sum_16_sse2(left);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ sum += 40;
+ sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
+ const __m128i row = _mm_set1_epi8((int8_t)sum);
+ dc_store_64xh(&row, 16, dst, stride);
+}
+
+// -----------------------------------------------------------------------------
+// DC_TOP
+
+void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_4(above);
+ const __m128i two = _mm_set1_epi16(2);
+ sum_above = _mm_add_epi16(sum_above, two);
+ sum_above = _mm_srai_epi16(sum_above, 2);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ sum_above = _mm_packus_epi16(sum_above, sum_above);
+
+ const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ dc_store_4xh(pred, 8, dst, stride);
+}
+
+void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_4(above);
+ const __m128i two = _mm_set1_epi16(2);
+ sum_above = _mm_add_epi16(sum_above, two);
+ sum_above = _mm_srai_epi16(sum_above, 2);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ sum_above = _mm_packus_epi16(sum_above, sum_above);
+
+ const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ dc_store_4xh(pred, 16, dst, stride);
+}
+
+void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_8(above);
+ const __m128i four = _mm_set1_epi16(4);
+ sum_above = _mm_add_epi16(sum_above, four);
+ sum_above = _mm_srai_epi16(sum_above, 3);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
+ dc_store_8xh(&row, 4, dst, stride);
+}
+
+void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_8(above);
+ const __m128i four = _mm_set1_epi16(4);
+ sum_above = _mm_add_epi16(sum_above, four);
+ sum_above = _mm_srai_epi16(sum_above, 3);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
+ dc_store_8xh(&row, 16, dst, stride);
+}
+
+void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_8(above);
+ const __m128i four = _mm_set1_epi16(4);
+ sum_above = _mm_add_epi16(sum_above, four);
+ sum_above = _mm_srai_epi16(sum_above, 3);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
+ dc_store_8xh(&row, 32, dst, stride);
+}
+
+void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_16_sse2(above);
+ const __m128i eight = _mm_set1_epi16(8);
+ sum_above = _mm_add_epi16(sum_above, eight);
+ sum_above = _mm_srai_epi16(sum_above, 4);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+ dc_store_16xh(&row, 4, dst, stride);
+}
+
+void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_16_sse2(above);
+ const __m128i eight = _mm_set1_epi16(8);
+ sum_above = _mm_add_epi16(sum_above, eight);
+ sum_above = _mm_srai_epi16(sum_above, 4);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+ dc_store_16xh(&row, 8, dst, stride);
+}
+
+void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_16_sse2(above);
+ const __m128i eight = _mm_set1_epi16(8);
+ sum_above = _mm_add_epi16(sum_above, eight);
+ sum_above = _mm_srai_epi16(sum_above, 4);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+ dc_store_16xh(&row, 32, dst, stride);
+}
+
+void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_16_sse2(above);
+ const __m128i eight = _mm_set1_epi16(8);
+ sum_above = _mm_add_epi16(sum_above, eight);
+ sum_above = _mm_srai_epi16(sum_above, 4);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+ dc_store_16xh(&row, 64, dst, stride);
+}
+
+void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_32_sse2(above);
+ const __m128i sixteen = _mm_set1_epi16(16);
+ sum_above = _mm_add_epi16(sum_above, sixteen);
+ sum_above = _mm_srai_epi16(sum_above, 5);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+ dc_store_32xh(&row, 8, dst, stride);
+}
+
+void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_32_sse2(above);
+ const __m128i sixteen = _mm_set1_epi16(16);
+ sum_above = _mm_add_epi16(sum_above, sixteen);
+ sum_above = _mm_srai_epi16(sum_above, 5);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+ dc_store_32xh(&row, 16, dst, stride);
+}
+
+void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_32_sse2(above);
+ const __m128i sixteen = _mm_set1_epi16(16);
+ sum_above = _mm_add_epi16(sum_above, sixteen);
+ sum_above = _mm_srai_epi16(sum_above, 5);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+ dc_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_64(above);
+ const __m128i thirtytwo = _mm_set1_epi16(32);
+ sum_above = _mm_add_epi16(sum_above, thirtytwo);
+ sum_above = _mm_srai_epi16(sum_above, 6);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+ dc_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_64(above);
+ const __m128i thirtytwo = _mm_set1_epi16(32);
+ sum_above = _mm_add_epi16(sum_above, thirtytwo);
+ sum_above = _mm_srai_epi16(sum_above, 6);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+ dc_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_64(above);
+ const __m128i thirtytwo = _mm_set1_epi16(32);
+ sum_above = _mm_add_epi16(sum_above, thirtytwo);
+ sum_above = _mm_srai_epi16(sum_above, 6);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+ dc_store_64xh(&row, 16, dst, stride);
+}
+
+// -----------------------------------------------------------------------------
+// DC_LEFT
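+// Each DC_LEFT predictor fills the block with the rounded average of the
+// left-neighbour column, i.e. (sum(left[0..h-1]) + h/2) >> log2(h) for a
+// block of height h.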
+
+void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_8(left);
+ const __m128i four = _mm_set1_epi16(4);
+ sum_left = _mm_add_epi16(sum_left, four);
+ sum_left = _mm_srai_epi16(sum_left, 3);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ sum_left = _mm_packus_epi16(sum_left, sum_left);
+
+ const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left);
+ dc_store_4xh(pred, 8, dst, stride);
+}
+
+void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_16_sse2(left);
+ const __m128i eight = _mm_set1_epi16(8);
+ sum_left = _mm_add_epi16(sum_left, eight);
+ sum_left = _mm_srai_epi16(sum_left, 4);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ sum_left = _mm_packus_epi16(sum_left, sum_left);
+
+ const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left);
+ dc_store_4xh(pred, 16, dst, stride);
+}
+
+void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_4(left);
+ const __m128i two = _mm_set1_epi16(2);
+ sum_left = _mm_add_epi16(sum_left, two);
+ sum_left = _mm_srai_epi16(sum_left, 2);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
+ dc_store_8xh(&row, 4, dst, stride);
+}
+
+void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_16_sse2(left);
+ const __m128i eight = _mm_set1_epi16(8);
+ sum_left = _mm_add_epi16(sum_left, eight);
+ sum_left = _mm_srai_epi16(sum_left, 4);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
+ dc_store_8xh(&row, 16, dst, stride);
+}
+
+void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_32_sse2(left);
+ const __m128i sixteen = _mm_set1_epi16(16);
+ sum_left = _mm_add_epi16(sum_left, sixteen);
+ sum_left = _mm_srai_epi16(sum_left, 5);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
+ dc_store_8xh(&row, 32, dst, stride);
+}
+
+void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_4(left);
+ const __m128i two = _mm_set1_epi16(2);
+ sum_left = _mm_add_epi16(sum_left, two);
+ sum_left = _mm_srai_epi16(sum_left, 2);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+ dc_store_16xh(&row, 4, dst, stride);
+}
+
+void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_8(left);
+ const __m128i four = _mm_set1_epi16(4);
+ sum_left = _mm_add_epi16(sum_left, four);
+ sum_left = _mm_srai_epi16(sum_left, 3);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+ dc_store_16xh(&row, 8, dst, stride);
+}
+
+void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_32_sse2(left);
+ const __m128i sixteen = _mm_set1_epi16(16);
+ sum_left = _mm_add_epi16(sum_left, sixteen);
+ sum_left = _mm_srai_epi16(sum_left, 5);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+ dc_store_16xh(&row, 32, dst, stride);
+}
+
+void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_64(left);
+ const __m128i thirtytwo = _mm_set1_epi16(32);
+ sum_left = _mm_add_epi16(sum_left, thirtytwo);
+ sum_left = _mm_srai_epi16(sum_left, 6);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+ dc_store_16xh(&row, 64, dst, stride);
+}
+
+void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_8(left);
+ const __m128i four = _mm_set1_epi16(4);
+ sum_left = _mm_add_epi16(sum_left, four);
+ sum_left = _mm_srai_epi16(sum_left, 3);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+ dc_store_32xh(&row, 8, dst, stride);
+}
+
+void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_16_sse2(left);
+ const __m128i eight = _mm_set1_epi16(8);
+ sum_left = _mm_add_epi16(sum_left, eight);
+ sum_left = _mm_srai_epi16(sum_left, 4);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+ dc_store_32xh(&row, 16, dst, stride);
+}
+
+void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_64(left);
+ const __m128i thirtytwo = _mm_set1_epi16(32);
+ sum_left = _mm_add_epi16(sum_left, thirtytwo);
+ sum_left = _mm_srai_epi16(sum_left, 6);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+ dc_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_64(left);
+ const __m128i thirtytwo = _mm_set1_epi16(32);
+ sum_left = _mm_add_epi16(sum_left, thirtytwo);
+ sum_left = _mm_srai_epi16(sum_left, 6);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+ dc_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_32_sse2(left);
+ const __m128i sixteen = _mm_set1_epi16(16);
+ sum_left = _mm_add_epi16(sum_left, sixteen);
+ sum_left = _mm_srai_epi16(sum_left, 5);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+ dc_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_16_sse2(left);
+ const __m128i eight = _mm_set1_epi16(8);
+ sum_left = _mm_add_epi16(sum_left, eight);
+ sum_left = _mm_srai_epi16(sum_left, 4);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+ dc_store_64xh(&row, 16, dst, stride);
+}
+
+// -----------------------------------------------------------------------------
+// DC_128
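+// The DC_128 predictors ignore both neighbour arrays and fill the block with
+// the 8-bit mid-grey value 128.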
+
+void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const uint32_t pred = 0x80808080;
+ dc_store_4xh(pred, 8, dst, stride);
+}
+
+void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const uint32_t pred = 0x80808080;
+ dc_store_4xh(pred, 16, dst, stride);
+}
+
+void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((int8_t)128);
+ dc_store_8xh(&row, 4, dst, stride);
+}
+
+void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((int8_t)128);
+ dc_store_8xh(&row, 16, dst, stride);
+}
+
+void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((int8_t)128);
+ dc_store_8xh(&row, 32, dst, stride);
+}
+
+void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((int8_t)128);
+ dc_store_16xh(&row, 4, dst, stride);
+}
+
+void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((int8_t)128);
+ dc_store_16xh(&row, 8, dst, stride);
+}
+
+void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((int8_t)128);
+ dc_store_16xh(&row, 32, dst, stride);
+}
+
+void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((int8_t)128);
+ dc_store_16xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((int8_t)128);
+ dc_store_32xh(&row, 8, dst, stride);
+}
+
+void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((int8_t)128);
+ dc_store_32xh(&row, 16, dst, stride);
+}
+
+void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((int8_t)128);
+ dc_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((int8_t)128);
+ dc_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((int8_t)128);
+ dc_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((int8_t)128);
+ dc_store_64xh(&row, 16, dst, stride);
+}
+
+// -----------------------------------------------------------------------------
+// V_PRED
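+// The V predictors copy the row of above-neighbour pixels into every row of
+// the block.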
+
+void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint32_t pred = *(uint32_t *)above;
+ (void)left;
+ dc_store_4xh(pred, 8, dst, stride);
+}
+
+void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint32_t pred = *(uint32_t *)above;
+ (void)left;
+ dc_store_4xh(pred, 16, dst, stride);
+}
+
+void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i row = _mm_loadl_epi64((__m128i const *)above);
+ (void)left;
+ dc_store_8xh(&row, 4, dst, stride);
+}
+
+void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i row = _mm_loadl_epi64((__m128i const *)above);
+ (void)left;
+ dc_store_8xh(&row, 16, dst, stride);
+}
+
+void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i row = _mm_loadl_epi64((__m128i const *)above);
+ (void)left;
+ dc_store_8xh(&row, 32, dst, stride);
+}
+
+void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i row = _mm_load_si128((__m128i const *)above);
+ (void)left;
+ dc_store_16xh(&row, 4, dst, stride);
+}
+
+void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i row = _mm_load_si128((__m128i const *)above);
+ (void)left;
+ dc_store_16xh(&row, 8, dst, stride);
+}
+
+void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i row = _mm_load_si128((__m128i const *)above);
+ (void)left;
+ dc_store_16xh(&row, 32, dst, stride);
+}
+
+void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i row = _mm_load_si128((__m128i const *)above);
+ (void)left;
+ dc_store_16xh(&row, 64, dst, stride);
+}
+
+static INLINE void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int height) {
+ const __m128i row0 = _mm_load_si128((__m128i const *)above);
+ const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
+ for (int i = 0; i < height; ++i) {
+ _mm_store_si128((__m128i *)dst, row0);
+ _mm_store_si128((__m128i *)(dst + 16), row1);
+ dst += stride;
+ }
+}
+
+void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_predictor_32xh(dst, stride, above, 8);
+}
+
+void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_predictor_32xh(dst, stride, above, 16);
+}
+
+void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_predictor_32xh(dst, stride, above, 64);
+}
+
+static INLINE void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int height) {
+ const __m128i row0 = _mm_load_si128((__m128i const *)above);
+ const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
+ const __m128i row2 = _mm_load_si128((__m128i const *)(above + 32));
+ const __m128i row3 = _mm_load_si128((__m128i const *)(above + 48));
+ for (int i = 0; i < height; ++i) {
+ _mm_store_si128((__m128i *)dst, row0);
+ _mm_store_si128((__m128i *)(dst + 16), row1);
+ _mm_store_si128((__m128i *)(dst + 32), row2);
+ _mm_store_si128((__m128i *)(dst + 48), row3);
+ dst += stride;
+ }
+}
+
+void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_predictor_64xh(dst, stride, above, 64);
+}
+
+void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_predictor_64xh(dst, stride, above, 32);
+}
+
+void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_predictor_64xh(dst, stride, above, 16);
+}
+
+// -----------------------------------------------------------------------------
+// H_PRED
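+// The H predictors replicate each left-neighbour pixel across its entire row
+// of the block.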
+
+void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
+ left_col = _mm_unpacklo_epi8(left_col, left_col);
+ __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
+ __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
+ __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
+ __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
+ *(int *)dst = _mm_cvtsi128_si32(row0);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row1);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row2);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row3);
+ dst += stride;
+ left_col = _mm_unpackhi_epi64(left_col, left_col);
+ row0 = _mm_shufflelo_epi16(left_col, 0);
+ row1 = _mm_shufflelo_epi16(left_col, 0x55);
+ row2 = _mm_shufflelo_epi16(left_col, 0xaa);
+ row3 = _mm_shufflelo_epi16(left_col, 0xff);
+ *(int *)dst = _mm_cvtsi128_si32(row0);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row1);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row2);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row3);
+}
+
+void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ const __m128i left_col = _mm_load_si128((__m128i const *)left);
+ __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
+ __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);
+
+ __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
+ __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
+ __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
+ __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
+ *(int *)dst = _mm_cvtsi128_si32(row0);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row1);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row2);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row3);
+ dst += stride;
+
+ left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
+ row0 = _mm_shufflelo_epi16(left_col_low, 0);
+ row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
+ row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
+ row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
+ *(int *)dst = _mm_cvtsi128_si32(row0);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row1);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row2);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row3);
+ dst += stride;
+
+ row0 = _mm_shufflelo_epi16(left_col_high, 0);
+ row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
+ row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
+ row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
+ *(int *)dst = _mm_cvtsi128_si32(row0);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row1);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row2);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row3);
+ dst += stride;
+
+ left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
+ row0 = _mm_shufflelo_epi16(left_col_high, 0);
+ row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
+ row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
+ row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
+ *(int *)dst = _mm_cvtsi128_si32(row0);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row1);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row2);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row3);
+}
+
+void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
+ left_col = _mm_unpacklo_epi8(left_col, left_col);
+ __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
+ __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
+ __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
+ __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+}
+
+static INLINE void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left,
+ int count) {
+ (void)above;
+ for (int i = 0; i < count; ++i) {
+ const __m128i left_col = _mm_load_si128((__m128i const *)left);
+ __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
+ __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);
+
+ __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
+ __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
+ __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
+ __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+ dst += stride;
+
+ left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
+ row0 = _mm_shufflelo_epi16(left_col_low, 0);
+ row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
+ row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
+ row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+ dst += stride;
+
+ row0 = _mm_shufflelo_epi16(left_col_high, 0);
+ row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
+ row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
+ row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+ dst += stride;
+
+ left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
+ row0 = _mm_shufflelo_epi16(left_col_high, 0);
+ row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
+ row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
+ row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+ dst += stride;
+ left += 16;
+ }
+}
+
+void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ h_predictor_8x16xc(dst, stride, above, left, 1);
+}
+
+void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ h_predictor_8x16xc(dst, stride, above, left, 2);
+}
+
+static INLINE void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst,
+ ptrdiff_t stride) {
+ int i;
+ for (i = 0; i < h; ++i) {
+ _mm_store_si128((__m128i *)dst, row[i]);
+ dst += stride;
+ }
+}
+
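+// Broadcast each of the four 16-bit lanes in the low half of *x (each lane
+// holds one left pixel duplicated into two bytes) across a full 16-byte row
+// register; repeat_high_4pixels below does the same for the high half.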
+static INLINE void repeat_low_4pixels(const __m128i *x, __m128i *row) {
+ const __m128i u0 = _mm_shufflelo_epi16(*x, 0);
+ const __m128i u1 = _mm_shufflelo_epi16(*x, 0x55);
+ const __m128i u2 = _mm_shufflelo_epi16(*x, 0xaa);
+ const __m128i u3 = _mm_shufflelo_epi16(*x, 0xff);
+
+ row[0] = _mm_unpacklo_epi64(u0, u0);
+ row[1] = _mm_unpacklo_epi64(u1, u1);
+ row[2] = _mm_unpacklo_epi64(u2, u2);
+ row[3] = _mm_unpacklo_epi64(u3, u3);
+}
+
+static INLINE void repeat_high_4pixels(const __m128i *x, __m128i *row) {
+ const __m128i u0 = _mm_shufflehi_epi16(*x, 0);
+ const __m128i u1 = _mm_shufflehi_epi16(*x, 0x55);
+ const __m128i u2 = _mm_shufflehi_epi16(*x, 0xaa);
+ const __m128i u3 = _mm_shufflehi_epi16(*x, 0xff);
+
+ row[0] = _mm_unpackhi_epi64(u0, u0);
+ row[1] = _mm_unpackhi_epi64(u1, u1);
+ row[2] = _mm_unpackhi_epi64(u2, u2);
+ row[3] = _mm_unpackhi_epi64(u3, u3);
+}
+
+// Process 16x8, first 4 rows
+// Use first 8 bytes of left register: xxxxxxxx33221100
+static INLINE void h_prediction_16x8_1(const __m128i *left, uint8_t *dst,
+ ptrdiff_t stride) {
+ __m128i row[4];
+ repeat_low_4pixels(left, row);
+ h_pred_store_16xh(row, 4, dst, stride);
+}
+
+// Process 16x8, second 4 rows
+// Use second 8 bytes of left register: 77665544xxxxxxxx
+static INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst,
+ ptrdiff_t stride) {
+ __m128i row[4];
+ repeat_high_4pixels(left, row);
+ h_pred_store_16xh(row, 4, dst, stride);
+}
+
+void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
+ const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
+ h_prediction_16x8_1(&left_col_8p, dst, stride);
+}
+
+void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
+ const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
+ h_prediction_16x8_1(&left_col_8p, dst, stride);
+ dst += stride << 2;
+ h_prediction_16x8_2(&left_col_8p, dst, stride);
+}
+
+static INLINE void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int count) {
+ int i = 0;
+ do {
+ const __m128i left_col = _mm_load_si128((const __m128i *)left);
+ const __m128i left_col_8p_lo = _mm_unpacklo_epi8(left_col, left_col);
+ h_prediction_16x8_1(&left_col_8p_lo, dst, stride);
+ dst += stride << 2;
+ h_prediction_16x8_2(&left_col_8p_lo, dst, stride);
+ dst += stride << 2;
+
+ const __m128i left_col_8p_hi = _mm_unpackhi_epi8(left_col, left_col);
+ h_prediction_16x8_1(&left_col_8p_hi, dst, stride);
+ dst += stride << 2;
+ h_prediction_16x8_2(&left_col_8p_hi, dst, stride);
+ dst += stride << 2;
+
+ left += 16;
+ i++;
+ } while (i < count);
+}
+
+void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ h_predictor_16xh(dst, stride, left, 2);
+}
+
+void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ h_predictor_16xh(dst, stride, left, 4);
+}
+
+static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst,
+ ptrdiff_t stride) {
+ int i;
+ for (i = 0; i < h; ++i) {
+ _mm_store_si128((__m128i *)dst, row[i]);
+ _mm_store_si128((__m128i *)(dst + 16), row[i]);
+ dst += stride;
+ }
+}
+
+// Process 32x8, first 4 rows
+// Use first 8 bytes of left register: xxxxxxxx33221100
+static INLINE void h_prediction_32x8_1(const __m128i *left, uint8_t *dst,
+ ptrdiff_t stride) {
+ __m128i row[4];
+ repeat_low_4pixels(left, row);
+ h_pred_store_32xh(row, 4, dst, stride);
+}
+
+// Process 32x8, second 4 rows
+// Use second 8 bytes of left register: 77665544xxxxxxxx
+static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst,
+ ptrdiff_t stride) {
+ __m128i row[4];
+ repeat_high_4pixels(left, row);
+ h_pred_store_32xh(row, 4, dst, stride);
+}
+
+void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i left_col, left_col_8p;
+ (void)above;
+
+ left_col = _mm_load_si128((const __m128i *)left);
+
+ left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
+ h_prediction_32x8_1(&left_col_8p, dst, stride);
+ dst += stride << 2;
+ h_prediction_32x8_2(&left_col_8p, dst, stride);
+}
+
+void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i left_col, left_col_8p;
+ (void)above;
+
+ left_col = _mm_load_si128((const __m128i *)left);
+
+ left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
+ h_prediction_32x8_1(&left_col_8p, dst, stride);
+ dst += stride << 2;
+ h_prediction_32x8_2(&left_col_8p, dst, stride);
+ dst += stride << 2;
+
+ left_col_8p = _mm_unpackhi_epi8(left_col, left_col);
+ h_prediction_32x8_1(&left_col_8p, dst, stride);
+ dst += stride << 2;
+ h_prediction_32x8_2(&left_col_8p, dst, stride);
+}
+
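+// Each iteration covers four rows from four left pixels: two byte unpacks
+// expand every pixel to four copies, the epi32 shuffles then broadcast each
+// pixel to a full 16-byte register, which is stored twice (32 bytes) per row.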
+static INLINE void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int height) {
+ int i = height >> 2;
+ do {
+ __m128i left4 = _mm_cvtsi32_si128(((int *)left)[0]);
+ left4 = _mm_unpacklo_epi8(left4, left4);
+ left4 = _mm_unpacklo_epi8(left4, left4);
+ const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
+ const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r0);
+ _mm_store_si128((__m128i *)(dst + stride), r1);
+ _mm_store_si128((__m128i *)(dst + stride + 16), r1);
+ const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
+ const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
+ _mm_store_si128((__m128i *)(dst + stride * 2), r2);
+ _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
+ _mm_store_si128((__m128i *)(dst + stride * 3), r3);
+ _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
+ left += 4;
+ dst += stride * 4;
+ } while (--i);
+}
+
+void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ h_predictor_32xh(dst, stride, left, 64);
+}
+
+static INLINE void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int height) {
+ int i = height >> 2;
+ do {
+ __m128i left4 = _mm_cvtsi32_si128(((int *)left)[0]);
+ left4 = _mm_unpacklo_epi8(left4, left4);
+ left4 = _mm_unpacklo_epi8(left4, left4);
+ const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
+ const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r0);
+ _mm_store_si128((__m128i *)(dst + 32), r0);
+ _mm_store_si128((__m128i *)(dst + 48), r0);
+ _mm_store_si128((__m128i *)(dst + stride), r1);
+ _mm_store_si128((__m128i *)(dst + stride + 16), r1);
+ _mm_store_si128((__m128i *)(dst + stride + 32), r1);
+ _mm_store_si128((__m128i *)(dst + stride + 48), r1);
+ const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
+ const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
+ _mm_store_si128((__m128i *)(dst + stride * 2), r2);
+ _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
+ _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2);
+ _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2);
+ _mm_store_si128((__m128i *)(dst + stride * 3), r3);
+ _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
+ _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3);
+ _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3);
+ left += 4;
+ dst += stride * 4;
+ } while (--i);
+}
+
+void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ h_predictor_64xh(dst, stride, left, 64);
+}
+
+void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ h_predictor_64xh(dst, stride, left, 32);
+}
+
+void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ h_predictor_64xh(dst, stride, left, 16);
+}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse4.c b/third_party/aom/aom_dsp/x86/intrapred_sse4.c
new file mode 100644
index 0000000000..9de8bf3c0f
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/intrapred_sse4.c
@@ -0,0 +1,1307 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>  // SSE2
+#include <smmintrin.h>  // SSE4.1
+
+#include "config/av1_rtcd.h"
+#include "aom_dsp/x86/intrapred_x86.h"
+#include "aom_dsp/x86/intrapred_utils.h"
+#include "aom_dsp/x86/lpf_common_sse2.h"
+
+// Low bit depth functions
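+// Blend masks used below: Mask[0][n] has its first min(n, 16) bytes set to
+// 0xff and Mask[1][n] its first max(n - 16, 0) bytes, so the pair selects the
+// first n of up to 32 byte lanes when blending two candidate results.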
+static DECLARE_ALIGNED(16, uint8_t, Mask[2][33][16]) = {
+ { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0,
+ 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0,
+ 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0,
+ 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
+ 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff } },
+ {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
+ 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0,
+ 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0,
+ 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0,
+ 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ },
+};
+
+/* clang-format on */
+static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_sse4_1(
+ int H, int W, __m128i *dst, const uint8_t *above, int upsample_above,
+ int dx) {
+ const int frac_bits = 6 - upsample_above;
+ const int max_base_x = ((W + H) - 1) << upsample_above;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
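+  // Scalar sketch of the arithmetic above for one output pixel, assuming
+  // upsample_above == 0 (illustrative only, not part of the kernel):
+  //   x     = (r + 1) * dx;
+  //   base  = x >> 6;
+  //   shift = (x & 0x3f) >> 1;  // 0..31
+  //   dst[r][c] = (above[base + c] * 32 + 16 +
+  //                (above[base + c + 1] - above[base + c]) * shift) >> 5;
+  // Columns whose base index reaches max_base_x are replaced with
+  // above[max_base_x] by the Mask blend at the end of each iteration.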
+ __m128i a0, a1, a32, a16;
+ __m128i diff, c3f;
+ __m128i a_mbase_x;
+
+ a16 = _mm_set1_epi16(16);
+ a_mbase_x = _mm_set1_epi8((char)above[max_base_x]);
+ c3f = _mm_set1_epi16(0x3f);
+
+ int x = dx;
+ for (int r = 0; r < W; r++) {
+ __m128i b, res, res1, shift;
+ __m128i a0_above, a1_above;
+
+ int base = x >> frac_bits;
+ int base_max_diff = (max_base_x - base) >> upsample_above;
+ if (base_max_diff <= 0) {
+ for (int i = r; i < W; ++i) {
+        dst[i] = a_mbase_x;  // fill remaining rows
+ }
+ return;
+ }
+ if (base_max_diff > H) base_max_diff = H;
+ a0_above = _mm_loadu_si128((__m128i *)(above + base));
+ a1_above = _mm_loadu_si128((__m128i *)(above + base + 1));
+
+ if (upsample_above) {
+ a0_above = _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[0]);
+ a1_above = _mm_srli_si128(a0_above, 8);
+
+ shift = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(_mm_set1_epi16(x), upsample_above), c3f),
+ 1);
+ } else {
+ shift = _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1);
+ }
+ // lower half
+ a0 = _mm_cvtepu8_epi16(a0_above);
+ a1 = _mm_cvtepu8_epi16(a1_above);
+
+ diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shift);
+ res = _mm_add_epi16(a32, b);
+ res = _mm_srli_epi16(res, 5);
+
+    // upper half
+ a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
+ a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));
+
+ diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shift);
+ res1 = _mm_add_epi16(a32, b);
+ res1 = _mm_srli_epi16(res1, 5);
+
+ res = _mm_packus_epi16(res, res1);
+
+ dst[r] =
+ _mm_blendv_epi8(a_mbase_x, res, *(__m128i *)Mask[0][base_max_diff]);
+ x += dx;
+ }
+}
+
+static void dr_prediction_z1_4xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ int upsample_above, int dx) {
+ __m128i dstvec[16];
+
+ dr_prediction_z1_HxW_internal_sse4_1(4, N, dstvec, above, upsample_above, dx);
+ for (int i = 0; i < N; i++) {
+ *(int *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]);
+ }
+}
+
+static void dr_prediction_z1_8xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ int upsample_above, int dx) {
+ __m128i dstvec[32];
+
+ dr_prediction_z1_HxW_internal_sse4_1(8, N, dstvec, above, upsample_above, dx);
+ for (int i = 0; i < N; i++) {
+ _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
+ }
+}
+
+static void dr_prediction_z1_16xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ int upsample_above, int dx) {
+ __m128i dstvec[64];
+
+ dr_prediction_z1_HxW_internal_sse4_1(16, N, dstvec, above, upsample_above,
+ dx);
+ for (int i = 0; i < N; i++) {
+ _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
+ }
+}
+
+static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_sse4_1(
+ int N, __m128i *dstvec, __m128i *dstvec_h, const uint8_t *above,
+ int upsample_above, int dx) {
+ // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+ (void)upsample_above;
+ const int frac_bits = 6;
+ const int max_base_x = ((32 + N) - 1);
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
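+  // Each 32-pixel row is computed as two 16-byte halves: dstvec[r] holds
+  // pixels 0..15 and dstvec_h[r] pixels 16..31, each blended against the
+  // replicated max-base pixel with the matching Mask table row.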
+ __m128i a0, a1, a32, a16;
+ __m128i a_mbase_x, diff, c3f;
+
+ a16 = _mm_set1_epi16(16);
+ a_mbase_x = _mm_set1_epi8((char)above[max_base_x]);
+ c3f = _mm_set1_epi16(0x3f);
+
+ int x = dx;
+ for (int r = 0; r < N; r++) {
+ __m128i b, res, res1, res16[2];
+ __m128i a0_above, a1_above;
+
+ int base = x >> frac_bits;
+ int base_max_diff = (max_base_x - base);
+ if (base_max_diff <= 0) {
+ for (int i = r; i < N; ++i) {
+ dstvec[i] = a_mbase_x; // save 32 values
+ dstvec_h[i] = a_mbase_x;
+ }
+ return;
+ }
+ if (base_max_diff > 32) base_max_diff = 32;
+ __m128i shift = _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1);
+
+ for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
+ int mdiff = base_max_diff - j;
+ if (mdiff <= 0) {
+ res16[jj] = a_mbase_x;
+ } else {
+ a0_above = _mm_loadu_si128((__m128i *)(above + base + j));
+ a1_above = _mm_loadu_si128((__m128i *)(above + base + j + 1));
+
+ // lower half
+ a0 = _mm_cvtepu8_epi16(a0_above);
+ a1 = _mm_cvtepu8_epi16(a1_above);
+
+ diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+ b = _mm_mullo_epi16(diff, shift);
+
+ res = _mm_add_epi16(a32, b);
+ res = _mm_srli_epi16(res, 5);
+
+        // upper half
+ a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
+ a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));
+
+ diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shift);
+ res1 = _mm_add_epi16(a32, b);
+ res1 = _mm_srli_epi16(res1, 5);
+
+ res16[jj] = _mm_packus_epi16(res, res1); // 16 8bit values
+ }
+ }
+
+ dstvec[r] =
+ _mm_blendv_epi8(a_mbase_x, res16[0],
+ *(__m128i *)Mask[0][base_max_diff]); // 16 8bit values
+
+ dstvec_h[r] =
+ _mm_blendv_epi8(a_mbase_x, res16[1],
+ *(__m128i *)Mask[1][base_max_diff]); // 16 8bit values
+ x += dx;
+ }
+}
+
+static void dr_prediction_z1_32xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ int upsample_above, int dx) {
+ __m128i dstvec[64], dstvec_h[64];
+ dr_prediction_z1_32xN_internal_sse4_1(N, dstvec, dstvec_h, above,
+ upsample_above, dx);
+ for (int i = 0; i < N; i++) {
+ _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
+ _mm_storeu_si128((__m128i *)(dst + stride * i + 16), dstvec_h[i]);
+ }
+}
+
+static void dr_prediction_z1_64xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ int upsample_above, int dx) {
+ // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+ (void)upsample_above;
+ const int frac_bits = 6;
+ const int max_base_x = ((64 + N) - 1);
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
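+  // Instead of the Mask table, each 16-pixel chunk builds a per-lane mask by
+  // comparing base_inc with max_base, so lanes at or beyond max_base_x fall
+  // back to the replicated above[max_base_x] value.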
+ __m128i a0, a1, a32, a16;
+ __m128i a_mbase_x, diff, c3f;
+ __m128i max_base, base_inc, mask;
+
+ a16 = _mm_set1_epi16(16);
+ a_mbase_x = _mm_set1_epi8((char)above[max_base_x]);
+ max_base = _mm_set1_epi8(max_base_x);
+ c3f = _mm_set1_epi16(0x3f);
+
+ int x = dx;
+ for (int r = 0; r < N; r++, dst += stride) {
+ __m128i b, res, res1;
+ int base = x >> frac_bits;
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+        _mm_storeu_si128((__m128i *)dst, a_mbase_x);  // save 64 values
+ _mm_storeu_si128((__m128i *)(dst + 16), a_mbase_x);
+ _mm_storeu_si128((__m128i *)(dst + 32), a_mbase_x);
+ _mm_storeu_si128((__m128i *)(dst + 48), a_mbase_x);
+ dst += stride;
+ }
+ return;
+ }
+
+ __m128i shift =
+        _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1);  // 8 elements
+
+ __m128i a0_above, a1_above, res_val;
+ for (int j = 0; j < 64; j += 16) {
+ int mdif = max_base_x - (base + j);
+ if (mdif <= 0) {
+ _mm_storeu_si128((__m128i *)(dst + j), a_mbase_x);
+ } else {
+ a0_above =
+          _mm_loadu_si128((__m128i *)(above + base + j));  // load 16 elements
+ a1_above = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
+
+ // lower half
+ a0 = _mm_cvtepu8_epi16(a0_above);
+ a1 = _mm_cvtepu8_epi16(a1_above);
+
+ diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+ b = _mm_mullo_epi16(diff, shift);
+
+ res = _mm_add_epi16(a32, b);
+ res = _mm_srli_epi16(res, 5);
+
+        // upper half
+ a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
+ a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));
+
+ diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shift);
+ res1 = _mm_add_epi16(a32, b);
+ res1 = _mm_srli_epi16(res1, 5);
+
+ res = _mm_packus_epi16(res, res1); // 16 8bit values
+
+ base_inc =
+ _mm_setr_epi8((int8_t)(base + j), (int8_t)(base + j + 1),
+ (int8_t)(base + j + 2), (int8_t)(base + j + 3),
+ (int8_t)(base + j + 4), (int8_t)(base + j + 5),
+ (int8_t)(base + j + 6), (int8_t)(base + j + 7),
+ (int8_t)(base + j + 8), (int8_t)(base + j + 9),
+ (int8_t)(base + j + 10), (int8_t)(base + j + 11),
+ (int8_t)(base + j + 12), (int8_t)(base + j + 13),
+ (int8_t)(base + j + 14), (int8_t)(base + j + 15));
+
+ mask = _mm_cmpgt_epi8(_mm_subs_epu8(max_base, base_inc),
+ _mm_setzero_si128());
+ res_val = _mm_blendv_epi8(a_mbase_x, res, mask);
+ _mm_storeu_si128((__m128i *)(dst + j), res_val);
+ }
+ }
+ x += dx;
+ }
+}
+
+// Directional prediction, zone 1: 0 < angle < 90
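+// dx is the per-row step of the above-row reference in 1/64-pel units (before
+// any edge upsampling); the switch dispatches on block width.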
+void av1_dr_prediction_z1_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int dx, int dy) {
+ (void)left;
+ (void)dy;
+ switch (bw) {
+ case 4:
+ dr_prediction_z1_4xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
+ break;
+ case 8:
+ dr_prediction_z1_8xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
+ break;
+ case 16:
+ dr_prediction_z1_16xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
+ break;
+ case 32:
+ dr_prediction_z1_32xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
+ break;
+ case 64:
+ dr_prediction_z1_64xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
+ break;
+ default: assert(0 && "Invalid block size");
+ }
+ return;
+}
+
+static void dr_prediction_z2_Nx4_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left, int upsample_above,
+ int upsample_left, int dx, int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
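+  // For each output row, the first base_min_diff pixels reference positions
+  // left of min_base_x and are therefore predicted from the left column (the
+  // "y" path below); the remaining pixels come from the above row (the "x"
+  // path). Mask[0][base_min_diff] merges the two partial results.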
+ __m128i a0_x, a1_x, a32, diff;
+
+ const __m128i c3f = _mm_set1_epi16(0x3f);
+ const __m128i min_y_base = _mm_set1_epi16(min_base_y);
+ const __m128i c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0);
+ const __m128i dy_reg = _mm_set1_epi16(dy);
+ const __m128i a16 = _mm_set1_epi16(16);
+
+ for (int r = 0; r < N; r++) {
+ __m128i b, res, shift, r6, ydx;
+ __m128i resx, resy, resxy;
+ __m128i a0_above, a1_above;
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ int base_shift = 0;
+ if (base_x < (min_base_x - 1)) {
+ base_shift = (min_base_x - base_x - 1) >> upsample_above;
+ }
+ int base_min_diff =
+ (min_base_x - base_x + upsample_above) >> upsample_above;
+ if (base_min_diff > 4) {
+ base_min_diff = 4;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift > 3) {
+ a0_x = _mm_setzero_si128();
+ a1_x = _mm_setzero_si128();
+ shift = _mm_setzero_si128();
+ } else {
+ a0_above = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ ydx = _mm_set1_epi16(y * dx);
+ r6 = _mm_slli_epi16(c1234, 6);
+
+ if (upsample_above) {
+ a0_above =
+ _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[base_shift]);
+ a1_above = _mm_srli_si128(a0_above, 8);
+
+ shift = _mm_srli_epi16(
+ _mm_and_si128(
+ _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
+ 1);
+ } else {
+ a0_above =
+ _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]);
+ a1_above = _mm_srli_si128(a0_above, 1);
+
+ shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
+ }
+ a0_x = _mm_cvtepu8_epi16(a0_above);
+ a1_x = _mm_cvtepu8_epi16(a1_above);
+ }
+ // y calc
+ __m128i a0_y, a1_y, shifty;
+ if (base_x < min_base_x) {
+ DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
+ __m128i y_c, base_y_c_reg, mask, c1234_;
+ c1234_ = _mm_srli_si128(c1234, 2);
+ r6 = _mm_set1_epi16(r << 6);
+ y_c = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy_reg));
+ base_y_c_reg = _mm_srai_epi16(y_c, frac_bits_y);
+ mask = _mm_cmpgt_epi16(min_y_base, base_y_c_reg);
+ base_y_c_reg = _mm_andnot_si128(mask, base_y_c_reg);
+ _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);
+
+ a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
+ base_y_c_reg = _mm_add_epi16(base_y_c_reg, _mm_srli_epi16(a16, 4));
+ _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);
+ a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
+
+ if (upsample_left) {
+ shifty = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(y_c, upsample_left), c3f), 1);
+ } else {
+ shifty = _mm_srli_epi16(_mm_and_si128(y_c, c3f), 1);
+ }
+ a0_x = _mm_unpacklo_epi64(a0_x, a0_y);
+ a1_x = _mm_unpacklo_epi64(a1_x, a1_y);
+ shift = _mm_unpacklo_epi64(shift, shifty);
+ }
+
+ diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shift);
+ res = _mm_add_epi16(a32, b);
+ res = _mm_srli_epi16(res, 5);
+
+ resx = _mm_packus_epi16(res, res);
+ resy = _mm_srli_si128(resx, 4);
+
+ resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]);
+ *(int *)(dst) = _mm_cvtsi128_si32(resxy);
+ dst += stride;
+ }
+}
+
+static void dr_prediction_z2_Nx8_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left, int upsample_above,
+ int upsample_left, int dx, int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m128i diff, a32;
+ __m128i a0_x, a1_x, a0_y, a1_y;
+ __m128i a0_above, a1_above;
+
+ const __m128i a16 = _mm_set1_epi16(16);
+ const __m128i c3f = _mm_set1_epi16(0x3f);
+ const __m128i min_y_base = _mm_set1_epi16(min_base_y);
+ const __m128i dy_reg = _mm_set1_epi16(dy);
+ const __m128i c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+
+ for (int r = 0; r < N; r++) {
+ __m128i b, res, res1, shift;
+ __m128i resx, resy, resxy, r6, ydx;
+
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ int base_shift = 0;
+ if (base_x < (min_base_x - 1)) {
+ base_shift = (min_base_x - base_x - 1) >> upsample_above;
+ }
+ int base_min_diff =
+ (min_base_x - base_x + upsample_above) >> upsample_above;
+ if (base_min_diff > 8) {
+ base_min_diff = 8;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift > 7) {
+ resx = _mm_setzero_si128();
+ } else {
+ a0_above = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ ydx = _mm_set1_epi16(y * dx);
+ r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6);
+ if (upsample_above) {
+ a0_above =
+ _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[base_shift]);
+ a1_above = _mm_srli_si128(a0_above, 8);
+
+ shift = _mm_srli_epi16(
+ _mm_and_si128(
+ _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
+ 1);
+ } else {
+ a1_above = _mm_srli_si128(a0_above, 1);
+ a0_above =
+ _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]);
+ a1_above =
+ _mm_shuffle_epi8(a1_above, *(__m128i *)LoadMaskx[base_shift]);
+
+ shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
+ }
+ a0_x = _mm_cvtepu8_epi16(a0_above);
+ a1_x = _mm_cvtepu8_epi16(a1_above);
+
+ diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shift);
+ res = _mm_add_epi16(a32, b);
+ res = _mm_srli_epi16(res, 5);
+ resx = _mm_packus_epi16(res, res);
+ }
+
+ // y calc
+ if (base_x < min_base_x) {
+ DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
+ __m128i y_c, base_y_c_reg, mask;
+ r6 = _mm_set1_epi16(r << 6);
+ y_c = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy_reg));
+ base_y_c_reg = _mm_srai_epi16(y_c, frac_bits_y);
+ mask = _mm_cmpgt_epi16(min_y_base, base_y_c_reg);
+ base_y_c_reg = _mm_andnot_si128(mask, base_y_c_reg);
+ _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);
+
+ a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]],
+ left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]]);
+ base_y_c_reg = _mm_add_epi16(base_y_c_reg, _mm_srli_epi16(a16, 4));
+ _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);
+
+ a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]],
+ left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]]);
+
+ if (upsample_left) {
+ shift = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(y_c, upsample_left), c3f), 1);
+ } else {
+ shift = _mm_srli_epi16(_mm_and_si128(y_c, c3f), 1);
+ }
+
+ diff = _mm_sub_epi16(a1_y, a0_y); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0_y, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shift);
+ res1 = _mm_add_epi16(a32, b);
+ res1 = _mm_srli_epi16(res1, 5);
+
+ resy = _mm_packus_epi16(res1, res1);
+ resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]);
+ _mm_storel_epi64((__m128i *)dst, resxy);
+ } else {
+ _mm_storel_epi64((__m128i *)dst, resx);
+ }
+
+ dst += stride;
+ }
+}
+
+static void dr_prediction_z2_HxW_sse4_1(int H, int W, uint8_t *dst,
+ ptrdiff_t stride, const uint8_t *above,
+ const uint8_t *left, int upsample_above,
+ int upsample_left, int dx, int dy) {
+ // here upsample_above and upsample_left are 0 by design of
+ // av1_use_intra_edge_upsample
+ const int min_base_x = -1;
+ const int min_base_y = -1;
+ (void)upsample_above;
+ (void)upsample_left;
+ const int frac_bits_x = 6;
+ const int frac_bits_y = 6;
+
+ __m128i a0_x, a1_x, a0_y, a1_y, a0_y_h, a1_y_h, a32;
+ __m128i diff, shifty, shifty_h;
+ __m128i a0_above, a1_above;
+
+ DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
+ const __m128i a16 = _mm_set1_epi16(16);
+ const __m128i c1 = _mm_srli_epi16(a16, 4);
+ const __m128i min_y_base = _mm_set1_epi16(min_base_y);
+ const __m128i c3f = _mm_set1_epi16(0x3f);
+ const __m128i dy256 = _mm_set1_epi16(dy);
+ const __m128i c0123 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ const __m128i c0123_h = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+ const __m128i c1234 = _mm_add_epi16(c0123, c1);
+ const __m128i c1234_h = _mm_add_epi16(c0123_h, c1);
+
+ for (int r = 0; r < H; r++) {
+ __m128i b, res, res1, shift, reg_j, r6, ydx;
+ __m128i resx, resy;
+ __m128i resxy;
+ int y = r + 1;
+ ydx = _mm_set1_epi16((int16_t)(y * dx));
+
+ int base_x = (-y * dx) >> frac_bits_x;
+ for (int j = 0; j < W; j += 16) {
+ reg_j = _mm_set1_epi16(j);
+ int base_shift = 0;
+ if ((base_x + j) < (min_base_x - 1)) {
+ base_shift = (min_base_x - (base_x + j) - 1);
+ }
+ int base_min_diff = (min_base_x - base_x - j);
+ if (base_min_diff > 16) {
+ base_min_diff = 16;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift < 16) {
+ a0_above =
+ _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
+ a1_above =
+ _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
+ a0_above =
+ _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]);
+ a1_above =
+ _mm_shuffle_epi8(a1_above, *(__m128i *)LoadMaskx[base_shift]);
+
+ a0_x = _mm_cvtepu8_epi16(a0_above);
+ a1_x = _mm_cvtepu8_epi16(a1_above);
+
+ r6 = _mm_slli_epi16(_mm_add_epi16(c0123, reg_j), 6);
+ shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
+
+ diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shift);
+ res = _mm_add_epi16(a32, b);
+ res = _mm_srli_epi16(res, 5); // 8 16-bit values
+
+ a0_x = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
+ a1_x = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));
+
+ r6 = _mm_slli_epi16(_mm_add_epi16(c0123_h, reg_j), 6);
+ shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
+
+ diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shift);
+ res1 = _mm_add_epi16(a32, b);
+ res1 = _mm_srli_epi16(res1, 5); // 8 16-bit values
+
+ resx = _mm_packus_epi16(res, res1);
+ } else {
+ resx = _mm_setzero_si128();
+ }
+
+ // y calc
+ if (base_x < min_base_x) {
+ __m128i c_reg, c_reg_h, y_reg, y_reg_h, base_y, base_y_h;
+ __m128i mask, mask_h, mul16, mul16_h;
+ r6 = _mm_set1_epi16(r << 6);
+ c_reg = _mm_add_epi16(reg_j, c1234);
+ c_reg_h = _mm_add_epi16(reg_j, c1234_h);
+ mul16 = _mm_min_epu16(_mm_mullo_epi16(c_reg, dy256),
+ _mm_srli_epi16(min_y_base, 1));
+ mul16_h = _mm_min_epu16(_mm_mullo_epi16(c_reg_h, dy256),
+ _mm_srli_epi16(min_y_base, 1));
+ y_reg = _mm_sub_epi16(r6, mul16);
+ y_reg_h = _mm_sub_epi16(r6, mul16_h);
+
+ base_y = _mm_srai_epi16(y_reg, frac_bits_y);
+ base_y_h = _mm_srai_epi16(y_reg_h, frac_bits_y);
+ mask = _mm_cmpgt_epi16(min_y_base, base_y);
+ mask_h = _mm_cmpgt_epi16(min_y_base, base_y_h);
+
+ base_y = _mm_blendv_epi8(base_y, min_y_base, mask);
+ base_y_h = _mm_blendv_epi8(base_y_h, min_y_base, mask_h);
+ int16_t min_y = (int16_t)_mm_extract_epi16(base_y_h, 7);
+ int16_t max_y = (int16_t)_mm_extract_epi16(base_y, 0);
+ int16_t offset_diff = max_y - min_y;
+
+ if (offset_diff < 16) {
+ __m128i min_y_reg = _mm_set1_epi16(min_y);
+
+ __m128i base_y_offset = _mm_sub_epi16(base_y, min_y_reg);
+ __m128i base_y_offset_h = _mm_sub_epi16(base_y_h, min_y_reg);
+ __m128i y_offset = _mm_packs_epi16(base_y_offset, base_y_offset_h);
+
+ __m128i a0_mask = _mm_loadu_si128((__m128i *)(left + min_y));
+ __m128i a1_mask = _mm_loadu_si128((__m128i *)(left + min_y + 1));
+ __m128i LoadMask =
+ _mm_loadu_si128((__m128i *)(LoadMaskz2[offset_diff / 4]));
+
+ a0_mask = _mm_and_si128(a0_mask, LoadMask);
+ a1_mask = _mm_and_si128(a1_mask, LoadMask);
+
+ a0_mask = _mm_shuffle_epi8(a0_mask, y_offset);
+ a1_mask = _mm_shuffle_epi8(a1_mask, y_offset);
+ a0_y = _mm_cvtepu8_epi16(a0_mask);
+ a1_y = _mm_cvtepu8_epi16(a1_mask);
+ a0_y_h = _mm_cvtepu8_epi16(_mm_srli_si128(a0_mask, 8));
+ a1_y_h = _mm_cvtepu8_epi16(_mm_srli_si128(a1_mask, 8));
+ } else {
+ base_y = _mm_andnot_si128(mask, base_y);
+ base_y_h = _mm_andnot_si128(mask_h, base_y_h);
+ _mm_store_si128((__m128i *)base_y_c, base_y);
+ _mm_store_si128((__m128i *)&base_y_c[8], base_y_h);
+
+ a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]],
+ left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]]);
+ a0_y_h = _mm_setr_epi16(left[base_y_c[8]], left[base_y_c[9]],
+ left[base_y_c[10]], left[base_y_c[11]],
+ left[base_y_c[12]], left[base_y_c[13]],
+ left[base_y_c[14]], left[base_y_c[15]]);
+ base_y = _mm_add_epi16(base_y, c1);
+ base_y_h = _mm_add_epi16(base_y_h, c1);
+ _mm_store_si128((__m128i *)base_y_c, base_y);
+ _mm_store_si128((__m128i *)&base_y_c[8], base_y_h);
+
+ a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]],
+ left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]]);
+ a1_y_h = _mm_setr_epi16(left[base_y_c[8]], left[base_y_c[9]],
+ left[base_y_c[10]], left[base_y_c[11]],
+ left[base_y_c[12]], left[base_y_c[13]],
+ left[base_y_c[14]], left[base_y_c[15]]);
+ }
+ shifty = _mm_srli_epi16(_mm_and_si128(y_reg, c3f), 1);
+ shifty_h = _mm_srli_epi16(_mm_and_si128(y_reg_h, c3f), 1);
+
+ diff = _mm_sub_epi16(a1_y, a0_y); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0_y, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shifty);
+ res = _mm_add_epi16(a32, b);
+ res = _mm_srli_epi16(res, 5); // 8 16-bit values
+
+ diff = _mm_sub_epi16(a1_y_h, a0_y_h); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0_y_h, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shifty_h);
+ res1 = _mm_add_epi16(a32, b);
+ res1 = _mm_srli_epi16(res1, 5); // 8 16-bit values
+ resy = _mm_packus_epi16(res, res1);
+ } else {
+ resy = _mm_setzero_si128();
+ }
+ resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]);
+ _mm_storeu_si128((__m128i *)(dst + j), resxy);
+ } // for j
+ dst += stride;
+ }
+}
+
+// Directional prediction, zone 2: 90 < angle < 180
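+//
+// Every output pixel is a 2-tap interpolation between adjacent reference
+// samples with a 5-bit fractional weight. A scalar sketch of the rule the z2
+// kernels above implement, for reference only:
+//
+//   int pos = (x << 6) - y * dx;    // 6 fractional bits
+//   int base = pos >> 6;
+//   int frac = (pos & 0x3f) >> 1;   // reduced to 5 bits
+//   dst[x] = (a[base] * (32 - frac) + a[base + 1] * frac + 16) >> 5;
+//
+// Pixels whose projection falls left of the above edge are predicted the
+// same way from the left edge, using dy and the row index.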
+void av1_dr_prediction_z2_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int upsample_left, int dx,
+ int dy) {
+ assert(dx > 0);
+ assert(dy > 0);
+ switch (bw) {
+ case 4:
+ dr_prediction_z2_Nx4_sse4_1(bh, dst, stride, above, left, upsample_above,
+ upsample_left, dx, dy);
+ break;
+ case 8:
+ dr_prediction_z2_Nx8_sse4_1(bh, dst, stride, above, left, upsample_above,
+ upsample_left, dx, dy);
+ break;
+ default:
+ dr_prediction_z2_HxW_sse4_1(bh, bw, dst, stride, above, left,
+ upsample_above, upsample_left, dx, dy);
+ }
+ return;
+}
+
+// z3 functions
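+// Zone 3 (180 < angle < 270) only references the left edge. Rather than
+// predicting column by column, each kernel below runs the zone-1 predictor
+// on the left edge into a temporary (transposed) block and then transposes
+// the result into dst.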
+static void dr_prediction_z3_4x4_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[4], d[4];
+
+ dr_prediction_z1_HxW_internal_sse4_1(4, 4, dstvec, left, upsample_left, dy);
+ transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
+ &d[0], &d[1], &d[2], &d[3]);
+
+ *(int *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]);
+ *(int *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]);
+ *(int *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]);
+ *(int *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]);
+ return;
+}
+
+static void dr_prediction_z3_8x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[8], d[8];
+
+ dr_prediction_z1_HxW_internal_sse4_1(8, 8, dstvec, left, upsample_left, dy);
+ transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4],
+ &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2],
+ &d[3]);
+
+ _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
+ _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8));
+ _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]);
+ _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8));
+ _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]);
+ _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8));
+ _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]);
+ _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8));
+}
+
+static void dr_prediction_z3_4x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[4], d[8];
+
+ dr_prediction_z1_HxW_internal_sse4_1(8, 4, dstvec, left, upsample_left, dy);
+ transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0],
+ &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
+ for (int i = 0; i < 8; i++) {
+ *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
+ }
+}
+
+static void dr_prediction_z3_8x4_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[8], d[4];
+
+ dr_prediction_z1_HxW_internal_sse4_1(4, 8, dstvec, left, upsample_left, dy);
+ transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
+ &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0],
+ &d[1], &d[2], &d[3]);
+ _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
+ _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
+ _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
+ _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
+}
+
+static void dr_prediction_z3_8x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[8], d[8];
+
+ dr_prediction_z1_HxW_internal_sse4_1(16, 8, dstvec, left, upsample_left, dy);
+ transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3,
+ dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d,
+ d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7);
+ for (int i = 0; i < 8; i++) {
+ _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
+ _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
+ _mm_srli_si128(d[i], 8));
+ }
+}
+
+static void dr_prediction_z3_16x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[16], d[16];
+
+ dr_prediction_z1_HxW_internal_sse4_1(8, 16, dstvec, left, upsample_left, dy);
+ transpose16x8_8x16_sse2(
+ &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
+ &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
+ &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
+ &d[3], &d[4], &d[5], &d[6], &d[7]);
+
+ for (int i = 0; i < 8; i++) {
+ _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+ }
+}
+
+static void dr_prediction_z3_4x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[4], d[16];
+
+ dr_prediction_z1_HxW_internal_sse4_1(16, 4, dstvec, left, upsample_left, dy);
+ transpose4x16_sse2(dstvec, d);
+ for (int i = 0; i < 16; i++) {
+ *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
+ }
+}
+
+static void dr_prediction_z3_16x4_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[16], d[8];
+
+ dr_prediction_z1_HxW_internal_sse4_1(4, 16, dstvec, left, upsample_left, dy);
+ for (int i = 4; i < 8; i++) {
+ d[i] = _mm_setzero_si128();
+ }
+ transpose16x8_8x16_sse2(
+ &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
+ &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
+ &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
+ &d[3], &d[4], &d[5], &d[6], &d[7]);
+
+ for (int i = 0; i < 4; i++) {
+ _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+ }
+}
+
+static void dr_prediction_z3_8x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[16], d[16], dstvec_h[16], d_h[16];
+
+ dr_prediction_z1_32xN_internal_sse4_1(8, dstvec, dstvec_h, left,
+ upsample_left, dy);
+ for (int i = 8; i < 16; i++) {
+ dstvec[i] = _mm_setzero_si128();
+ dstvec_h[i] = _mm_setzero_si128();
+ }
+ transpose16x16_sse2(dstvec, d);
+ transpose16x16_sse2(dstvec_h, d_h);
+
+ for (int i = 0; i < 16; i++) {
+ _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
+ }
+ for (int i = 0; i < 16; i++) {
+ _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride), d_h[i]);
+ }
+}
+
+static void dr_prediction_z3_32x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[32], d[16];
+
+ dr_prediction_z1_HxW_internal_sse4_1(8, 32, dstvec, left, upsample_left, dy);
+
+ transpose16x8_8x16_sse2(
+ &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
+ &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
+ &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
+ &d[3], &d[4], &d[5], &d[6], &d[7]);
+ transpose16x8_8x16_sse2(
+ &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16],
+ &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16],
+ &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16],
+ &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16],
+ &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8],
+ &d[6 + 8], &d[7 + 8]);
+
+ for (int i = 0; i < 8; i++) {
+ _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+ _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]);
+ }
+}
+
+static void dr_prediction_z3_16x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left,
+ int upsample_left, int dy) {
+ __m128i dstvec[16], d[16];
+
+ dr_prediction_z1_HxW_internal_sse4_1(16, 16, dstvec, left, upsample_left, dy);
+ transpose16x16_sse2(dstvec, d);
+
+ for (int i = 0; i < 16; i++) {
+ _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+ }
+}
+
+static void dr_prediction_z3_32x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left,
+ int upsample_left, int dy) {
+ __m128i dstvec[32], d[32], dstvec_h[32], d_h[32];
+
+ dr_prediction_z1_32xN_internal_sse4_1(32, dstvec, dstvec_h, left,
+ upsample_left, dy);
+ transpose16x16_sse2(dstvec, d);
+ transpose16x16_sse2(dstvec_h, d_h);
+ transpose16x16_sse2(dstvec + 16, d + 16);
+ transpose16x16_sse2(dstvec_h + 16, d_h + 16);
+ for (int j = 0; j < 16; j++) {
+ _mm_storeu_si128((__m128i *)(dst + j * stride), d[j]);
+ _mm_storeu_si128((__m128i *)(dst + j * stride + 16), d[j + 16]);
+ }
+ for (int j = 0; j < 16; j++) {
+ _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), d_h[j]);
+ _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16), d_h[j + 16]);
+ }
+}
+
+static void dr_prediction_z3_64x64_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left,
+ int upsample_left, int dy) {
+ uint8_t dstT[64 * 64];
+ dr_prediction_z1_64xN_sse4_1(64, dstT, 64, left, upsample_left, dy);
+ transpose(dstT, 64, dst, stride, 64, 64);
+}
+
+static void dr_prediction_z3_16x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left,
+ int upsample_left, int dy) {
+ __m128i dstvec[16], d[16], dstvec_h[16], d_h[16];
+
+ dr_prediction_z1_32xN_internal_sse4_1(16, dstvec, dstvec_h, left,
+ upsample_left, dy);
+ transpose16x16_sse2(dstvec, d);
+ transpose16x16_sse2(dstvec_h, d_h);
+ // store
+ for (int j = 0; j < 16; j++) {
+ _mm_storeu_si128((__m128i *)(dst + j * stride), d[j]);
+ _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), d_h[j]);
+ }
+}
+
+static void dr_prediction_z3_32x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left,
+ int upsample_left, int dy) {
+ __m128i dstvec[32], d[16];
+
+ dr_prediction_z1_HxW_internal_sse4_1(16, 32, dstvec, left, upsample_left, dy);
+ for (int i = 0; i < 32; i += 16) {
+ transpose16x16_sse2((dstvec + i), d);
+ for (int j = 0; j < 16; j++) {
+ _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
+ }
+ }
+}
+
+static void dr_prediction_z3_32x64_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left,
+ int upsample_left, int dy) {
+ uint8_t dstT[64 * 32];
+ dr_prediction_z1_64xN_sse4_1(32, dstT, 64, left, upsample_left, dy);
+ transpose(dstT, 64, dst, stride, 32, 64);
+}
+
+static void dr_prediction_z3_64x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left,
+ int upsample_left, int dy) {
+ uint8_t dstT[32 * 64];
+ dr_prediction_z1_32xN_sse4_1(64, dstT, 32, left, upsample_left, dy);
+ transpose(dstT, 32, dst, stride, 64, 32);
+ return;
+}
+
+static void dr_prediction_z3_16x64_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left,
+ int upsample_left, int dy) {
+ uint8_t dstT[64 * 16];
+ dr_prediction_z1_64xN_sse4_1(16, dstT, 64, left, upsample_left, dy);
+ transpose(dstT, 64, dst, stride, 16, 64);
+}
+
+static void dr_prediction_z3_64x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left,
+ int upsample_left, int dy) {
+ __m128i dstvec[64], d[16];
+
+ dr_prediction_z1_HxW_internal_sse4_1(16, 64, dstvec, left, upsample_left, dy);
+ for (int i = 0; i < 64; i += 16) {
+ transpose16x16_sse2(dstvec + i, d);
+ for (int j = 0; j < 16; j++) {
+ _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
+ }
+ }
+}
+
+void av1_dr_prediction_z3_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_left, int dx, int dy) {
+ (void)above;
+ (void)dx;
+ assert(dx == 1);
+ assert(dy > 0);
+
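+ // Dispatch on block shape: square sizes first, then 1:2 / 2:1 rectangles
+ // (bw + bw == bh or bh + bh == bw), then 1:4 / 4:1 rectangles.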
+ if (bw == bh) {
+ switch (bw) {
+ case 4:
+ dr_prediction_z3_4x4_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ case 8:
+ dr_prediction_z3_8x8_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ case 16:
+ dr_prediction_z3_16x16_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ case 32:
+ dr_prediction_z3_32x32_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ case 64:
+ dr_prediction_z3_64x64_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ default: assert(0 && "Invalid block size");
+ }
+ } else {
+ if (bw < bh) {
+ if (bw + bw == bh) {
+ switch (bw) {
+ case 4:
+ dr_prediction_z3_4x8_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ case 8:
+ dr_prediction_z3_8x16_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ case 16:
+ dr_prediction_z3_16x32_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ case 32:
+ dr_prediction_z3_32x64_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ default: assert(0 && "Invalid block size");
+ }
+ } else {
+ switch (bw) {
+ case 4:
+ dr_prediction_z3_4x16_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ case 8:
+ dr_prediction_z3_8x32_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ case 16:
+ dr_prediction_z3_16x64_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ default: assert(0 && "Invalid block size");
+ }
+ }
+ } else {
+ if (bh + bh == bw) {
+ switch (bh) {
+ case 4:
+ dr_prediction_z3_8x4_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ case 8:
+ dr_prediction_z3_16x8_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ case 16:
+ dr_prediction_z3_32x16_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ case 32:
+ dr_prediction_z3_64x32_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ default: assert(0 && "Invalid block size");
+ }
+ } else {
+ switch (bh) {
+ case 4:
+ dr_prediction_z3_16x4_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ case 8:
+ dr_prediction_z3_32x8_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ case 16:
+ dr_prediction_z3_64x16_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ default: assert(0 && "Invalid block size");
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c
new file mode 100644
index 0000000000..fd48260c6f
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c
@@ -0,0 +1,2997 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/intrapred_common.h"
+
+// -----------------------------------------------------------------------------
+// PAETH_PRED
+
+// Return 8 16-bit pixels in one row
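+// This is the 8-wide Paeth selection. A scalar sketch of the rule, for
+// reference:
+//
+//   int base = left + top - topleft;
+//   int pl = abs(base - left), pt = abs(base - top);
+//   int ptl = abs(base - topleft);
+//   pred = (pl <= pt && pl <= ptl) ? left : (pt <= ptl) ? top : topleft;
+//
+// mask1 below encodes "left is not the closest"; mask2 picks topleft over
+// top.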
+static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top,
+ const __m128i *topleft) {
+ const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft);
+
+ __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left));
+ __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top));
+ __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft));
+
+ __m128i mask1 = _mm_cmpgt_epi16(pl, pt);
+ mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl));
+ __m128i mask2 = _mm_cmpgt_epi16(pt, ptl);
+
+ pl = _mm_andnot_si128(mask1, *left);
+
+ ptl = _mm_and_si128(mask2, *topleft);
+ pt = _mm_andnot_si128(mask2, *top);
+ pt = _mm_or_si128(pt, ptl);
+ pt = _mm_and_si128(mask1, pt);
+
+ return _mm_or_si128(pl, pt);
+}
+
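+// In the predictors below, `rep` (0x8000, 0x8001, ...) is used as a pshufb
+// index to broadcast left[i] as zero-extended 16-bit lanes: the low byte of
+// each lane selects byte i of the left vector and the 0x80 high byte forces
+// that byte to zero.
+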
+void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i l = _mm_loadl_epi64((const __m128i *)left);
+ const __m128i t = _mm_loadl_epi64((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i t16 = _mm_unpacklo_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 4; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+ *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i l = _mm_loadl_epi64((const __m128i *)left);
+ const __m128i t = _mm_loadl_epi64((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i t16 = _mm_unpacklo_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 8; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+ *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i l = _mm_load_si128((const __m128i *)left);
+ const __m128i t = _mm_cvtsi32_si128(((const int *)above)[0]);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i t16 = _mm_unpacklo_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (int i = 0; i < 16; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+ *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i l = _mm_loadl_epi64((const __m128i *)left);
+ const __m128i t = _mm_loadl_epi64((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i t16 = _mm_unpacklo_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 4; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+ _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i l = _mm_loadl_epi64((const __m128i *)left);
+ const __m128i t = _mm_loadl_epi64((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i t16 = _mm_unpacklo_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 8; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+ _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i l = _mm_load_si128((const __m128i *)left);
+ const __m128i t = _mm_loadl_epi64((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i t16 = _mm_unpacklo_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+ _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i t = _mm_loadl_epi64((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i t16 = _mm_unpacklo_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (int j = 0; j < 2; ++j) {
+ const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ for (int i = 0; i < 16; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+ _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+ }
+}
+
+// Return 16 8-bit pixels in one row
+static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
+ const __m128i *top1,
+ const __m128i *topleft) {
+ const __m128i p0 = paeth_8x1_pred(left, top0, topleft);
+ const __m128i p1 = paeth_8x1_pred(left, top1, topleft);
+ return _mm_packus_epi16(p0, p1);
+}
+
+void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i l = _mm_cvtsi32_si128(((const int *)left)[0]);
+ const __m128i t = _mm_load_si128((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top0 = _mm_unpacklo_epi8(t, zero);
+ const __m128i top1 = _mm_unpackhi_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (int i = 0; i < 4; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i l = _mm_loadl_epi64((const __m128i *)left);
+ const __m128i t = _mm_load_si128((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top0 = _mm_unpacklo_epi8(t, zero);
+ const __m128i top1 = _mm_unpackhi_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 8; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i l = _mm_load_si128((const __m128i *)left);
+ const __m128i t = _mm_load_si128((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top0 = _mm_unpacklo_epi8(t, zero);
+ const __m128i top1 = _mm_unpackhi_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i l = _mm_load_si128((const __m128i *)left);
+ const __m128i t = _mm_load_si128((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top0 = _mm_unpacklo_epi8(t, zero);
+ const __m128i top1 = _mm_unpackhi_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i l16;
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+
+ l = _mm_load_si128((const __m128i *)(left + 16));
+ rep = _mm_set1_epi16((short)0x8000);
+ for (i = 0; i < 16; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const __m128i t = _mm_load_si128((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top0 = _mm_unpacklo_epi8(t, zero);
+ const __m128i top1 = _mm_unpackhi_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (int j = 0; j < 4; ++j) {
+ const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ for (int i = 0; i < 16; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+ }
+}
+
+void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i a = _mm_load_si128((const __m128i *)above);
+ const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i al = _mm_unpacklo_epi8(a, zero);
+ const __m128i ah = _mm_unpackhi_epi8(a, zero);
+ const __m128i bl = _mm_unpacklo_epi8(b, zero);
+ const __m128i bh = _mm_unpackhi_epi8(b, zero);
+
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i l = _mm_loadl_epi64((const __m128i *)left);
+ __m128i l16;
+
+ for (int i = 0; i < 8; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+ const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+
+ _mm_store_si128((__m128i *)dst, r32l);
+ _mm_store_si128((__m128i *)(dst + 16), r32h);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const __m128i a = _mm_load_si128((const __m128i *)above);
+ const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i al = _mm_unpacklo_epi8(a, zero);
+ const __m128i ah = _mm_unpackhi_epi8(a, zero);
+ const __m128i bl = _mm_unpacklo_epi8(b, zero);
+ const __m128i bh = _mm_unpackhi_epi8(b, zero);
+
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i l = _mm_load_si128((const __m128i *)left);
+ __m128i l16;
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+ const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+
+ _mm_store_si128((__m128i *)dst, r32l);
+ _mm_store_si128((__m128i *)(dst + 16), r32h);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const __m128i a = _mm_load_si128((const __m128i *)above);
+ const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i al = _mm_unpacklo_epi8(a, zero);
+ const __m128i ah = _mm_unpackhi_epi8(a, zero);
+ const __m128i bl = _mm_unpacklo_epi8(b, zero);
+ const __m128i bh = _mm_unpackhi_epi8(b, zero);
+
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i l = _mm_load_si128((const __m128i *)left);
+ __m128i l16;
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+ const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+
+ _mm_store_si128((__m128i *)dst, r32l);
+ _mm_store_si128((__m128i *)(dst + 16), r32h);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+
+ rep = _mm_set1_epi16((short)0x8000);
+ l = _mm_load_si128((const __m128i *)(left + 16));
+ for (i = 0; i < 16; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+ const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+
+ _mm_store_si128((__m128i *)dst, r32l);
+ _mm_store_si128((__m128i *)(dst + 16), r32h);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const __m128i a = _mm_load_si128((const __m128i *)above);
+ const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i al = _mm_unpacklo_epi8(a, zero);
+ const __m128i ah = _mm_unpackhi_epi8(a, zero);
+ const __m128i bl = _mm_unpacklo_epi8(b, zero);
+ const __m128i bh = _mm_unpackhi_epi8(b, zero);
+
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i l16;
+
+ int i, j;
+ for (j = 0; j < 4; ++j) {
+ const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ for (i = 0; i < 16; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+ const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+
+ _mm_store_si128((__m128i *)dst, r32l);
+ _mm_store_si128((__m128i *)(dst + 16), r32h);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+ }
+}
+
+void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const __m128i a = _mm_load_si128((const __m128i *)above);
+ const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
+ const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i al = _mm_unpacklo_epi8(a, zero);
+ const __m128i ah = _mm_unpackhi_epi8(a, zero);
+ const __m128i bl = _mm_unpacklo_epi8(b, zero);
+ const __m128i bh = _mm_unpackhi_epi8(b, zero);
+ const __m128i cl = _mm_unpacklo_epi8(c, zero);
+ const __m128i ch = _mm_unpackhi_epi8(c, zero);
+ const __m128i dl = _mm_unpacklo_epi8(d, zero);
+ const __m128i dh = _mm_unpackhi_epi8(d, zero);
+
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i l16;
+
+ int i, j;
+ for (j = 0; j < 2; ++j) {
+ const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ for (i = 0; i < 16; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+ const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+ const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
+ const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
+
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r1);
+ _mm_store_si128((__m128i *)(dst + 32), r2);
+ _mm_store_si128((__m128i *)(dst + 48), r3);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+ }
+}
+
+void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const __m128i a = _mm_load_si128((const __m128i *)above);
+ const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
+ const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i al = _mm_unpacklo_epi8(a, zero);
+ const __m128i ah = _mm_unpackhi_epi8(a, zero);
+ const __m128i bl = _mm_unpacklo_epi8(b, zero);
+ const __m128i bh = _mm_unpackhi_epi8(b, zero);
+ const __m128i cl = _mm_unpacklo_epi8(c, zero);
+ const __m128i ch = _mm_unpackhi_epi8(c, zero);
+ const __m128i dl = _mm_unpacklo_epi8(d, zero);
+ const __m128i dh = _mm_unpackhi_epi8(d, zero);
+
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i l16;
+
+ int i, j;
+ for (j = 0; j < 4; ++j) {
+ const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ for (i = 0; i < 16; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+ const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+ const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
+ const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
+
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r1);
+ _mm_store_si128((__m128i *)(dst + 32), r2);
+ _mm_store_si128((__m128i *)(dst + 48), r3);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+ }
+}
+
+void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const __m128i a = _mm_load_si128((const __m128i *)above);
+ const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
+ const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i al = _mm_unpacklo_epi8(a, zero);
+ const __m128i ah = _mm_unpackhi_epi8(a, zero);
+ const __m128i bl = _mm_unpacklo_epi8(b, zero);
+ const __m128i bh = _mm_unpackhi_epi8(b, zero);
+ const __m128i cl = _mm_unpacklo_epi8(c, zero);
+ const __m128i ch = _mm_unpackhi_epi8(c, zero);
+ const __m128i dl = _mm_unpacklo_epi8(d, zero);
+ const __m128i dh = _mm_unpackhi_epi8(d, zero);
+
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i l16;
+
+ int i;
+ const __m128i l = _mm_load_si128((const __m128i *)left);
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ for (i = 0; i < 16; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+ const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+ const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
+ const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
+
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r1);
+ _mm_store_si128((__m128i *)(dst + 32), r2);
+ _mm_store_si128((__m128i *)(dst + 48), r3);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+// -----------------------------------------------------------------------------
+// SMOOTH_PRED
+
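+// SMOOTH_PRED blends a vertical and a horizontal interpolation. A scalar
+// sketch of the per-pixel rule the kernels below implement (assuming the
+// usual SMOOTH_WEIGHT_LOG2_SCALE of 8, i.e. a weight scale of 256):
+//
+//   pred(x, y) = (w_h[y] * top[x] + (256 - w_h[y]) * bottom_left +
+//                 w_w[x] * left[y] + (256 - w_w[x]) * top_right + 256) >> 9;
+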
+// pixels[0]: above and below_pred interleave vector
+// pixels[1]: left vector
+// pixels[2]: right_pred vector
+static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
+ int height, __m128i *pixels) {
+ __m128i d = _mm_cvtsi32_si128(((const int *)above)[0]);
+ if (height == 4)
+ pixels[1] = _mm_cvtsi32_si128(((const int *)left)[0]);
+ else if (height == 8)
+ pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
+ else
+ pixels[1] = _mm_loadu_si128(((const __m128i *)left));
+
+ pixels[2] = _mm_set1_epi16((int16_t)above[3]);
+
+ const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]);
+ const __m128i zero = _mm_setzero_si128();
+ d = _mm_unpacklo_epi8(d, zero);
+ pixels[0] = _mm_unpacklo_epi16(d, bp);
+}
+
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], second half for height = 16 only
+// weight_h[3]: same as [1], second half for height = 16 only
+// weight_w[0]: weights_w and scale - weights_w interleave vector
+static INLINE void load_weight_w4(int height, __m128i *weight_h,
+ __m128i *weight_w) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+ const __m128i t = _mm_cvtsi32_si128(((const int *)smooth_weights)[0]);
+ weight_h[0] = _mm_unpacklo_epi8(t, zero);
+ weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+ weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
+
+ if (height == 8) {
+ const __m128i weight = _mm_loadl_epi64((const __m128i *)&smooth_weights[4]);
+ weight_h[0] = _mm_unpacklo_epi8(weight, zero);
+ weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+ } else if (height == 16) {
+ const __m128i weight =
+ _mm_loadu_si128((const __m128i *)&smooth_weights[12]);
+ weight_h[0] = _mm_unpacklo_epi8(weight, zero);
+ weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+ weight_h[2] = _mm_unpackhi_epi8(weight, zero);
+ weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+ }
+}
+
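+// With the layouts above, each _mm_madd_epi16 in smooth_pred_4xh evaluates
+// one half of the blend directly: pixels[0] (top interleaved with
+// bottom_left) against (w_h[y], 256 - w_h[y]) gives the vertical term, and
+// the broadcast left[y] interleaved with top_right against weight_w[0]
+// gives the horizontal term.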
+static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh,
+ const __m128i *ww, int h, uint8_t *dst,
+ ptrdiff_t stride, int second_half) {
+ const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE));
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i inc = _mm_set1_epi16(0x202);
+ const __m128i gat = _mm_set1_epi32(0xc080400);
+ __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
+ : _mm_set1_epi16((short)0x8000);
+ __m128i d = _mm_set1_epi16(0x100);
+
+ for (int i = 0; i < h; ++i) {
+ const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
+ const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
+ const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
+ __m128i s = _mm_madd_epi16(pixel[0], wh_sc);
+
+ __m128i b = _mm_shuffle_epi8(pixel[1], rep);
+ b = _mm_unpacklo_epi16(b, pixel[2]);
+ __m128i sum = _mm_madd_epi16(b, ww[0]);
+
+ sum = _mm_add_epi32(s, sum);
+ sum = _mm_add_epi32(sum, round);
+ sum = _mm_srai_epi32(sum, 1 + SMOOTH_WEIGHT_LOG2_SCALE);
+
+ sum = _mm_shuffle_epi8(sum, gat);
+ *(int *)dst = _mm_cvtsi128_si32(sum);
+ dst += stride;
+
+ rep = _mm_add_epi16(rep, one);
+ d = _mm_add_epi16(d, inc);
+ }
+}
+
+void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i pixels[3];
+ load_pixel_w4(above, left, 4, pixels);
+
+ __m128i wh[4], ww[2];
+ load_weight_w4(4, wh, ww);
+
+ smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
+}
+
+void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i pixels[3];
+ load_pixel_w4(above, left, 8, pixels);
+
+ __m128i wh[4], ww[2];
+ load_weight_w4(8, wh, ww);
+
+ smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
+}
+
+void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i pixels[3];
+ load_pixel_w4(above, left, 16, pixels);
+
+ __m128i wh[4], ww[2];
+ load_weight_w4(16, wh, ww);
+
+ smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
+ dst += stride << 3;
+ smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
+}
+
+// pixels[0]: above and below_pred interleave vector, first half
+// pixels[1]: above and below_pred interleave vector, second half
+// pixels[2]: left vector
+// pixels[3]: right_pred vector
+// pixels[4]: above and below_pred interleave vector, first half
+// pixels[5]: above and below_pred interleave vector, second half
+// pixels[6]: left vector + 16
+// pixels[7]: right_pred vector
+static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
+ int height, __m128i *pixels) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]);
+ __m128i d = _mm_loadl_epi64((const __m128i *)above);
+ d = _mm_unpacklo_epi8(d, zero);
+ pixels[0] = _mm_unpacklo_epi16(d, bp);
+ pixels[1] = _mm_unpackhi_epi16(d, bp);
+
+ pixels[3] = _mm_set1_epi16((int16_t)above[7]);
+
+ if (height == 4) {
+ pixels[2] = _mm_cvtsi32_si128(((const int *)left)[0]);
+ } else if (height == 8) {
+ pixels[2] = _mm_loadl_epi64((const __m128i *)left);
+ } else if (height == 16) {
+ pixels[2] = _mm_load_si128((const __m128i *)left);
+ } else {
+ pixels[2] = _mm_load_si128((const __m128i *)left);
+ pixels[4] = pixels[0];
+ pixels[5] = pixels[1];
+ pixels[6] = _mm_load_si128((const __m128i *)(left + 16));
+ pixels[7] = pixels[3];
+ }
+}
+
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], offset 8
+// weight_h[3]: same as [1], offset 8
+// weight_h[4]: same as [0], offset 16
+// weight_h[5]: same as [1], offset 16
+// weight_h[6]: same as [0], offset 24
+// weight_h[7]: same as [1], offset 24
+// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
+// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
+static INLINE void load_weight_w8(int height, __m128i *weight_h,
+ __m128i *weight_w) {
+ const __m128i zero = _mm_setzero_si128();
+ const int we_offset = height < 8 ? 0 : 4;
+ __m128i we = _mm_loadu_si128((const __m128i *)&smooth_weights[we_offset]);
+ weight_h[0] = _mm_unpacklo_epi8(we, zero);
+ const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+ weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+
+ if (height == 4) {
+ we = _mm_srli_si128(we, 4);
+ __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
+ __m128i tmp2 = _mm_sub_epi16(d, tmp1);
+ weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
+ weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
+ } else {
+ weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
+ weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
+ }
+
+ if (height == 16) {
+ we = _mm_loadu_si128((const __m128i *)&smooth_weights[12]);
+ weight_h[0] = _mm_unpacklo_epi8(we, zero);
+ weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+ weight_h[2] = _mm_unpackhi_epi8(we, zero);
+ weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+ } else if (height == 32) {
+ const __m128i weight_lo =
+ _mm_loadu_si128((const __m128i *)&smooth_weights[28]);
+ weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
+ weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+ weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
+ weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+ const __m128i weight_hi =
+ _mm_loadu_si128((const __m128i *)&smooth_weights[28 + 16]);
+ weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
+ weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
+ weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
+ weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
+ }
+}
+
+static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh,
+ const __m128i *ww, int h, uint8_t *dst,
+ ptrdiff_t stride, int second_half) {
+ const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE));
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i inc = _mm_set1_epi16(0x202);
+ const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
+
+ __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
+ : _mm_set1_epi16((short)0x8000);
+ __m128i d = _mm_set1_epi16(0x100);
+
+ int i;
+ for (i = 0; i < h; ++i) {
+ const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
+ const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
+ const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
+ __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
+ __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
+
+ __m128i b = _mm_shuffle_epi8(pixels[2], rep);
+ b = _mm_unpacklo_epi16(b, pixels[3]);
+ __m128i sum0 = _mm_madd_epi16(b, ww[0]);
+ __m128i sum1 = _mm_madd_epi16(b, ww[1]);
+
+ s0 = _mm_add_epi32(s0, sum0);
+ s0 = _mm_add_epi32(s0, round);
+ s0 = _mm_srai_epi32(s0, 1 + SMOOTH_WEIGHT_LOG2_SCALE);
+
+ s1 = _mm_add_epi32(s1, sum1);
+ s1 = _mm_add_epi32(s1, round);
+ s1 = _mm_srai_epi32(s1, 1 + SMOOTH_WEIGHT_LOG2_SCALE);
+
+ sum0 = _mm_packus_epi16(s0, s1);
+ sum0 = _mm_shuffle_epi8(sum0, gat);
+ _mm_storel_epi64((__m128i *)dst, sum0);
+ dst += stride;
+
+ rep = _mm_add_epi16(rep, one);
+ d = _mm_add_epi16(d, inc);
+ }
+}
+
+void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i pixels[4];
+ load_pixel_w8(above, left, 4, pixels);
+
+ __m128i wh[4], ww[2];
+ load_weight_w8(4, wh, ww);
+
+ smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
+}
+
+void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i pixels[4];
+ load_pixel_w8(above, left, 8, pixels);
+
+ __m128i wh[4], ww[2];
+ load_weight_w8(8, wh, ww);
+
+ smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
+}
+
+void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i pixels[4];
+ load_pixel_w8(above, left, 16, pixels);
+
+ __m128i wh[4], ww[2];
+ load_weight_w8(16, wh, ww);
+
+ smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
+ dst += stride << 3;
+ smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
+}
+
+void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i pixels[8];
+ load_pixel_w8(above, left, 32, pixels);
+
+ __m128i wh[8], ww[2];
+ load_weight_w8(32, wh, ww);
+
+ smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
+ dst += stride << 3;
+ smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
+ dst += stride << 3;
+ smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
+ dst += stride << 3;
+ smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
+}
+
+// TODO(slavarnway): Visual Studio only supports restrict when /std:c11
+// (available in 2019+) or greater is specified; __restrict can be used in that
+// case. This should be moved to rtcd and used consistently between the
+// function declarations and definitions to avoid warnings in Visual Studio
+// when defining LIBAOM_RESTRICT to restrict or __restrict.
+#if defined(_MSC_VER)
+#define LIBAOM_RESTRICT
+#else
+#define LIBAOM_RESTRICT restrict
+#endif
+
+static AOM_FORCE_INLINE __m128i Load4(const void *src) {
+ // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
+ // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
+ // movss instruction.
+ //
+ // Until compiler support of _mm_loadu_si32 is widespread, use of
+ // _mm_loadu_si32 is banned.
+ int val;
+ memcpy(&val, src, sizeof(val));
+ return _mm_cvtsi32_si128(val);
+}
+
+static AOM_FORCE_INLINE __m128i LoadLo8(const void *a) {
+ return _mm_loadl_epi64((const __m128i *)(a));
+}
+
+static AOM_FORCE_INLINE __m128i LoadUnaligned16(const void *a) {
+ return _mm_loadu_si128((const __m128i *)(a));
+}
+
+static AOM_FORCE_INLINE void Store4(void *dst, const __m128i x) {
+ const int val = _mm_cvtsi128_si32(x);
+ memcpy(dst, &val, sizeof(val));
+}
+
+static AOM_FORCE_INLINE void StoreLo8(void *a, const __m128i v) {
+ _mm_storel_epi64((__m128i *)(a), v);
+}
+
+static AOM_FORCE_INLINE void StoreUnaligned16(void *a, const __m128i v) {
+ _mm_storeu_si128((__m128i *)(a), v);
+}
+
+static AOM_FORCE_INLINE __m128i cvtepu8_epi16(__m128i x) {
+ return _mm_unpacklo_epi8((x), _mm_setzero_si128());
+}
+
+static AOM_FORCE_INLINE __m128i cvtepu8_epi32(__m128i x) {
+ const __m128i tmp = _mm_unpacklo_epi8((x), _mm_setzero_si128());
+ return _mm_unpacklo_epi16(tmp, _mm_setzero_si128());
+}
+
+static AOM_FORCE_INLINE __m128i cvtepu16_epi32(__m128i x) {
+ return _mm_unpacklo_epi16((x), _mm_setzero_si128());
+}
+
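+// Generic width x height smooth predictor. Each inner-loop madd pairs a
+// pixel with the weight of the opposite direction, so a single
+// _mm_madd_epi16 yields w_h[y] * top[x] + w_w[x] * left[y] per 32-bit lane;
+// the two scaled corner terms are added separately before the final shift.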
+void smooth_predictor_wxh(uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column, int width,
+ int height) {
+ const uint8_t *const sm_weights_h = smooth_weights + height - 4;
+ const uint8_t *const sm_weights_w = smooth_weights + width - 4;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i scale_value = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i bottom_left = _mm_cvtsi32_si128(left_column[height - 1]);
+ const __m128i top_right = _mm_set1_epi16(top_row[width - 1]);
+ const __m128i round = _mm_set1_epi32(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ for (int y = 0; y < height; ++y) {
+ const __m128i weights_y = _mm_cvtsi32_si128(sm_weights_h[y]);
+ const __m128i left_y = _mm_cvtsi32_si128(left_column[y]);
+ const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
+ __m128i scaled_bottom_left =
+ _mm_mullo_epi16(scale_m_weights_y, bottom_left);
+ const __m128i weight_left_y =
+ _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
+ scaled_bottom_left = _mm_add_epi32(scaled_bottom_left, round);
+ scaled_bottom_left = _mm_shuffle_epi32(scaled_bottom_left, 0);
+ for (int x = 0; x < width; x += 8) {
+ const __m128i top_x = LoadLo8(top_row + x);
+ const __m128i weights_x = LoadLo8(sm_weights_w + x);
+ const __m128i top_weights_x = _mm_unpacklo_epi8(top_x, weights_x);
+ const __m128i top_weights_x_lo = cvtepu8_epi16(top_weights_x);
+ const __m128i top_weights_x_hi = _mm_unpackhi_epi8(top_weights_x, zero);
+
+ // Here opposite weights and pixels are multiplied, where the order of
+ // interleaving is indicated in the names.
+ __m128i pred_lo = _mm_madd_epi16(top_weights_x_lo, weight_left_y);
+ __m128i pred_hi = _mm_madd_epi16(top_weights_x_hi, weight_left_y);
+
+ // |scaled_bottom_left| is always scaled by the same weight each row, so
+ // we only derive |scaled_top_right| values here.
+ const __m128i inverted_weights_x =
+ _mm_sub_epi16(scale_value, cvtepu8_epi16(weights_x));
+ const __m128i scaled_top_right =
+ _mm_mullo_epi16(inverted_weights_x, top_right);
+ const __m128i scaled_top_right_lo = cvtepu16_epi32(scaled_top_right);
+ const __m128i scaled_top_right_hi =
+ _mm_unpackhi_epi16(scaled_top_right, zero);
+ pred_lo = _mm_add_epi32(pred_lo, scaled_bottom_left);
+ pred_hi = _mm_add_epi32(pred_hi, scaled_bottom_left);
+ pred_lo = _mm_add_epi32(pred_lo, scaled_top_right_lo);
+ pred_hi = _mm_add_epi32(pred_hi, scaled_top_right_hi);
+
+ // The round value for RightShiftWithRounding was added with
+ // |scaled_bottom_left|.
+ pred_lo = _mm_srli_epi32(pred_lo, (1 + SMOOTH_WEIGHT_LOG2_SCALE));
+ pred_hi = _mm_srli_epi32(pred_hi, (1 + SMOOTH_WEIGHT_LOG2_SCALE));
+ const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
+ StoreLo8(dst + x, _mm_packus_epi16(pred, pred));
+ }
+ dst += stride;
+ }
+}
+
+void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 16, 4);
+}
+
+void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 16, 8);
+}
+
+void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 16, 16);
+}
+
+void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 16, 32);
+}
+
+void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 16, 64);
+}
+
+void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 32, 8);
+}
+
+void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 32, 16);
+}
+
+void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 32, 32);
+}
+
+void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 32, 64);
+}
+
+void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 64, 16);
+}
+
+void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 64, 32);
+}
+
+void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 64, 64);
+}
+
+// -----------------------------------------------------------------------------
+// Smooth horizontal/vertical helper functions.
+
+// For Horizontal, pixels1 and pixels2 are the same repeated value. For
+// Vertical, weights1 and weights2 are the same, and scaled_corner1 and
+// scaled_corner2 are the same.
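+// Per 16-bit lane, the result is (pixels * weights + scaled_corner + round)
+// >> 8, packed to 16 unsigned bytes.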
+static AOM_FORCE_INLINE void write_smooth_directional_sum16(
+ uint8_t *LIBAOM_RESTRICT dst, const __m128i pixels1, const __m128i pixels2,
+ const __m128i weights1, const __m128i weights2,
+ const __m128i scaled_corner1, const __m128i scaled_corner2,
+ const __m128i round) {
+ const __m128i weighted_px1 = _mm_mullo_epi16(pixels1, weights1);
+ const __m128i weighted_px2 = _mm_mullo_epi16(pixels2, weights2);
+ const __m128i pred_sum1 = _mm_add_epi16(scaled_corner1, weighted_px1);
+ const __m128i pred_sum2 = _mm_add_epi16(scaled_corner2, weighted_px2);
+ // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+ const __m128i pred1 = _mm_srli_epi16(_mm_add_epi16(pred_sum1, round), 8);
+ const __m128i pred2 = _mm_srli_epi16(_mm_add_epi16(pred_sum2, round), 8);
+ StoreUnaligned16(dst, _mm_packus_epi16(pred1, pred2));
+}
+
+static AOM_FORCE_INLINE __m128i smooth_directional_sum8(
+ const __m128i pixels, const __m128i weights, const __m128i scaled_corner) {
+ const __m128i weighted_px = _mm_mullo_epi16(pixels, weights);
+ return _mm_add_epi16(scaled_corner, weighted_px);
+}
+
+static AOM_FORCE_INLINE void write_smooth_directional_sum8(
+ uint8_t *LIBAOM_RESTRICT dst, const __m128i *pixels, const __m128i *weights,
+ const __m128i *scaled_corner, const __m128i *round) {
+ const __m128i pred_sum =
+ smooth_directional_sum8(*pixels, *weights, *scaled_corner);
+ // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+ const __m128i pred = _mm_srli_epi16(_mm_add_epi16(pred_sum, *round), 8);
+ StoreLo8(dst, _mm_packus_epi16(pred, pred));
+}
+
+// -----------------------------------------------------------------------------
+// SMOOTH_V_PRED
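+//
+// Scalar sketch of the vertical-only blend computed below, where w is the
+// vertical weight for row y:
+//   pred[y][x] = (w * top_row[x] + (256 - w) * bottom_left + 128) >> 8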
+
+static AOM_FORCE_INLINE void load_smooth_vertical_pixels4(
+ const uint8_t *LIBAOM_RESTRICT above, const uint8_t *LIBAOM_RESTRICT left,
+ const int height, __m128i *pixels) {
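+ // Produces 16-bit lanes [top[0], bl, top[1], bl, top[2], bl, top[3], bl],
+ // where bl is the bottom-left pixel, ready for madd against interleaved
+ // (w[y], 256 - w[y]) weight pairs.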
+ __m128i top = Load4(above);
+ const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
+ top = cvtepu8_epi16(top);
+ pixels[0] = _mm_unpacklo_epi16(top, bottom_left);
+}
+
+// The output |weights| alternates weight vectors from the table with their
+// inverted (256 - w) counterparts. The inversions are precomputed by the
+// compiler when the weights table is visible to this module. Removing that
+// visibility can cut speed by up to half in both the 4xH and 8xH predictors.
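+// For example, with height == 16 the output is weights[0..3] =
+// { w[0..7], 256 - w[0..7], w[8..15], 256 - w[8..15] } as 16-bit lanes, where
+// w[] is the 16-entry row of the weights table.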
+static AOM_FORCE_INLINE void load_smooth_vertical_weights4(
+ const uint8_t *LIBAOM_RESTRICT weight_array, const int height,
+ __m128i *weights) {
+ const __m128i inverter = _mm_set1_epi16(256);
+
+ if (height == 4) {
+ const __m128i weight = Load4(weight_array);
+ weights[0] = cvtepu8_epi16(weight);
+ weights[1] = _mm_sub_epi16(inverter, weights[0]);
+ } else if (height == 8) {
+ const __m128i weight = LoadLo8(weight_array + 4);
+ weights[0] = cvtepu8_epi16(weight);
+ weights[1] = _mm_sub_epi16(inverter, weights[0]);
+ } else {
+ const __m128i weight = LoadUnaligned16(weight_array + 12);
+ const __m128i zero = _mm_setzero_si128();
+ weights[0] = cvtepu8_epi16(weight);
+ weights[1] = _mm_sub_epi16(inverter, weights[0]);
+ weights[2] = _mm_unpackhi_epi8(weight, zero);
+ weights[3] = _mm_sub_epi16(inverter, weights[2]);
+ }
+}
+
+static AOM_FORCE_INLINE void write_smooth_vertical4xh(
+ const __m128i *pixel, const __m128i *weight, const int height,
+ uint8_t *LIBAOM_RESTRICT dst, const ptrdiff_t stride) {
+ const __m128i pred_round = _mm_set1_epi32(128);
+ const __m128i mask_increment = _mm_set1_epi16(0x0202);
+ const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+ __m128i y_select = _mm_set1_epi16(0x0100);
+
+ for (int y = 0; y < height; ++y) {
+ const __m128i weight_y = _mm_shuffle_epi8(weight[0], y_select);
+ const __m128i inverted_weight_y = _mm_shuffle_epi8(weight[1], y_select);
+ const __m128i alternate_weights =
+ _mm_unpacklo_epi16(weight_y, inverted_weight_y);
+ // Here the pixel vector is top_row[0], corner, top_row[1], corner, ...
+ // The madd instruction yields four results of the form:
+ // (top_row[x] * weight[y] + corner * inverted_weight[y])
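+ // After |pred_round| is added and the shift is applied, each lane holds
+ //   (top_row[x] * weight[y] + corner * inverted_weight[y] + 128) >> 8
+ // and the byte shuffle gathers the low byte of each lane (sketch).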
+ __m128i sum = _mm_madd_epi16(pixel[0], alternate_weights);
+ sum = _mm_add_epi32(sum, pred_round);
+ sum = _mm_srai_epi32(sum, 8);
+ sum = _mm_shuffle_epi8(sum, cvtepi32_epi8);
+ Store4(dst, sum);
+ dst += stride;
+ y_select = _mm_add_epi16(y_select, mask_increment);
+ }
+}
+
+void aom_smooth_v_predictor_4x4_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ __m128i pixels;
+ load_smooth_vertical_pixels4(top_row, left_column, 4, &pixels);
+
+ __m128i weights[2];
+ load_smooth_vertical_weights4(smooth_weights, 4, weights);
+
+ write_smooth_vertical4xh(&pixels, weights, 4, dst, stride);
+}
+
+void aom_smooth_v_predictor_4x8_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ __m128i pixels;
+ load_smooth_vertical_pixels4(top_row, left_column, 8, &pixels);
+
+ __m128i weights[2];
+ load_smooth_vertical_weights4(smooth_weights, 8, weights);
+
+ write_smooth_vertical4xh(&pixels, weights, 8, dst, stride);
+}
+
+void aom_smooth_v_predictor_4x16_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ __m128i pixels;
+ load_smooth_vertical_pixels4(top_row, left_column, 16, &pixels);
+
+ __m128i weights[4];
+ load_smooth_vertical_weights4(smooth_weights, 16, weights);
+
+ write_smooth_vertical4xh(&pixels, weights, 8, dst, stride);
+ dst += stride << 3;
+ write_smooth_vertical4xh(&pixels, &weights[2], 8, dst, stride);
+}
+
+void aom_smooth_v_predictor_8x4_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i bottom_left = _mm_set1_epi16(left_column[3]);
+ const __m128i weights = cvtepu8_epi16(Load4(smooth_weights));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i y_select = _mm_set1_epi32(0x01000100);
+ const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
+ __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x03020302);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x05040504);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x07060706);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
+}
+
+void aom_smooth_v_predictor_8x8_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
+ const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
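+ // |y_mask| steps through the shuffle masks 0x01000100, 0x03020302, ...,
+ // 0x0F0E0F0E: each one broadcasts 16-bit element y of |weights| and
+ // |scaled_bottom_left| to every lane, producing one row per iteration.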
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_v_predictor_8x16_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
+ const __m128i weights = LoadUnaligned16(smooth_weights + 12);
+
+ const __m128i weights1 = cvtepu8_epi16(weights);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_v_predictor_8x32_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
+ const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
+ const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
+ const __m128i weights1 = cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+ const __m128i weights3 = cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i scaled_bottom_left3 =
+ _mm_mullo_epi16(inverted_weights3, bottom_left);
+ const __m128i scaled_bottom_left4 =
+ _mm_mullo_epi16(inverted_weights4, bottom_left);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_v_predictor_16x4_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i bottom_left = _mm_set1_epi16(left_column[3]);
+ const __m128i weights = cvtepu8_epi16(Load4(smooth_weights));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ const __m128i round = _mm_set1_epi16(128);
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = cvtepu8_epi16(top);
+ const __m128i top_hi = cvtepu8_epi16(_mm_srli_si128(top, 8));
+
+ __m128i y_select = _mm_set1_epi32(0x01000100);
+ __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x03020302);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x05040504);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x07060706);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+}
+
+void aom_smooth_v_predictor_16x8_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
+ const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ const __m128i round = _mm_set1_epi16(128);
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = cvtepu8_epi16(top);
+ const __m128i top_hi = cvtepu8_epi16(_mm_srli_si128(top, 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_v_predictor_16x16_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights = LoadUnaligned16(smooth_weights + 12);
+ const __m128i weights_lo = cvtepu8_epi16(weights);
+ const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+ const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+ const __m128i scaled_bottom_left_lo =
+ _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+ const __m128i scaled_bottom_left_hi =
+ _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+ const __m128i round = _mm_set1_epi16(128);
+
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_v_predictor_16x32_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
+ const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
+ const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i weights1 = cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+ const __m128i weights3 = cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i scaled_bottom_left3 =
+ _mm_mullo_epi16(inverted_weights3, bottom_left);
+ const __m128i scaled_bottom_left4 =
+ _mm_mullo_epi16(inverted_weights4, bottom_left);
+ const __m128i round = _mm_set1_epi16(128);
+
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_v_predictor_16x64_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i round = _mm_set1_epi16(128);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+ const uint8_t *weights_base_ptr = smooth_weights + 60;
+ for (int left_offset = 0; left_offset < 64; left_offset += 16) {
+ const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
+ const __m128i weights_lo = cvtepu8_epi16(weights);
+ const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+ const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+ const __m128i scaled_bottom_left_lo =
+ _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+ const __m128i scaled_bottom_left_hi =
+ _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ }
+}
+
+void aom_smooth_v_predictor_32x8_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
+ const __m128i top_lo = LoadUnaligned16(top_row);
+ const __m128i top_hi = LoadUnaligned16(top_row + 16);
+ const __m128i top1 = cvtepu8_epi16(top_lo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+ const __m128i top3 = cvtepu8_epi16(top_hi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_v_predictor_32x16_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
+ const __m128i top_lo = LoadUnaligned16(top_row);
+ const __m128i top_hi = LoadUnaligned16(top_row + 16);
+ const __m128i top1 = cvtepu8_epi16(top_lo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+ const __m128i top3 = cvtepu8_epi16(top_hi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+ const __m128i weights = LoadUnaligned16(smooth_weights + 12);
+ const __m128i weights1 = cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_v_predictor_32x32_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
+ const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
+ const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i top_lo = LoadUnaligned16(top_row);
+ const __m128i top_hi = LoadUnaligned16(top_row + 16);
+ const __m128i top1 = cvtepu8_epi16(top_lo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+ const __m128i top3 = cvtepu8_epi16(top_hi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+ const __m128i weights1 = cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+ const __m128i weights3 = cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i scaled_bottom_left3 =
+ _mm_mullo_epi16(inverted_weights3, bottom_left);
+ const __m128i scaled_bottom_left4 =
+ _mm_mullo_epi16(inverted_weights4, bottom_left);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_v_predictor_32x64_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
+ const __m128i top_lo = LoadUnaligned16(top_row);
+ const __m128i top_hi = LoadUnaligned16(top_row + 16);
+ const __m128i top1 = cvtepu8_epi16(top_lo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+ const __m128i top3 = cvtepu8_epi16(top_hi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ const uint8_t *weights_base_ptr = smooth_weights + 60;
+ for (int left_offset = 0; left_offset < 64; left_offset += 16) {
+ const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
+ const __m128i weights_lo = cvtepu8_epi16(weights);
+ const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+ const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+ const __m128i scaled_bottom_left_lo =
+ _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+ const __m128i scaled_bottom_left_hi =
+ _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ }
+}
+
+void aom_smooth_v_predictor_64x16_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top_lolo = LoadUnaligned16(top_row);
+ const __m128i top_lohi = LoadUnaligned16(top_row + 16);
+ const __m128i top1 = cvtepu8_epi16(top_lolo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
+ const __m128i top3 = cvtepu8_epi16(top_lohi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
+
+ const __m128i weights = LoadUnaligned16(smooth_weights + 12);
+ const __m128i weights1 = cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i top_hilo = LoadUnaligned16(top_row + 32);
+ const __m128i top_hihi = LoadUnaligned16(top_row + 48);
+ const __m128i top5 = cvtepu8_epi16(top_hilo);
+ const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
+ const __m128i top7 = cvtepu8_epi16(top_hihi);
+ const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_v_predictor_64x32_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
+ const __m128i top_lolo = LoadUnaligned16(top_row);
+ const __m128i top_lohi = LoadUnaligned16(top_row + 16);
+ const __m128i top1 = cvtepu8_epi16(top_lolo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
+ const __m128i top3 = cvtepu8_epi16(top_lohi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
+ const __m128i top_hilo = LoadUnaligned16(top_row + 32);
+ const __m128i top_hihi = LoadUnaligned16(top_row + 48);
+ const __m128i top5 = cvtepu8_epi16(top_hilo);
+ const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
+ const __m128i top7 = cvtepu8_epi16(top_hihi);
+ const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
+ const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
+ const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
+ const __m128i weights1 = cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+ const __m128i weights3 = cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i scaled_bottom_left3 =
+ _mm_mullo_epi16(inverted_weights3, bottom_left);
+ const __m128i scaled_bottom_left4 =
+ _mm_mullo_epi16(inverted_weights4, bottom_left);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_v_predictor_64x64_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
+ const __m128i top_lolo = LoadUnaligned16(top_row);
+ const __m128i top_lohi = LoadUnaligned16(top_row + 16);
+ const __m128i top1 = cvtepu8_epi16(top_lolo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
+ const __m128i top3 = cvtepu8_epi16(top_lohi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
+ const __m128i top_hilo = LoadUnaligned16(top_row + 32);
+ const __m128i top_hihi = LoadUnaligned16(top_row + 48);
+ const __m128i top5 = cvtepu8_epi16(top_hilo);
+ const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
+ const __m128i top7 = cvtepu8_epi16(top_hihi);
+ const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i round = _mm_set1_epi16(128);
+ const uint8_t *weights_base_ptr = smooth_weights + 60;
+ for (int left_offset = 0; left_offset < 64; left_offset += 16) {
+ const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
+ const __m128i weights_lo = cvtepu8_epi16(weights);
+ const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+ const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+ const __m128i scaled_bottom_left_lo =
+ _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+ const __m128i scaled_bottom_left_hi =
+ _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ }
+}
+
+// -----------------------------------------------------------------------------
+// SMOOTH_H_PRED
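+//
+// Scalar sketch of the horizontal-only blend computed below, where w is the
+// horizontal weight for column x:
+//   pred[y][x] = (w * left_column[y] + (256 - w) * top_right + 128) >> 8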
+static AOM_FORCE_INLINE void write_smooth_horizontal_sum4(
+ uint8_t *LIBAOM_RESTRICT dst, const __m128i *left_y, const __m128i *weights,
+ const __m128i *scaled_top_right, const __m128i *round) {
+ const __m128i weighted_left_y = _mm_mullo_epi16(*left_y, *weights);
+ const __m128i pred_sum = _mm_add_epi32(*scaled_top_right, weighted_left_y);
+ // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+ const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, *round), 8);
+ const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+ Store4(dst, _mm_shuffle_epi8(pred, cvtepi32_epi8));
+}
+
+void aom_smooth_h_predictor_4x4_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi32(top_row[3]);
+ const __m128i left = cvtepu8_epi32(Load4(left_column));
+ const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i left_y = _mm_shuffle_epi32(left, 0);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0x55);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xaa);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xff);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+}
+
+void aom_smooth_h_predictor_4x8_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi32(top_row[3]);
+ const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i left = cvtepu8_epi32(Load4(left_column));
+ __m128i left_y = _mm_shuffle_epi32(left, 0);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0x55);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xaa);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xff);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+
+ left = cvtepu8_epi32(Load4(left_column + 4));
+ left_y = _mm_shuffle_epi32(left, 0);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0x55);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xaa);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xff);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+}
+
+void aom_smooth_h_predictor_4x16_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi32(top_row[3]);
+ const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i left = cvtepu8_epi32(Load4(left_column));
+ __m128i left_y = _mm_shuffle_epi32(left, 0);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0x55);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xaa);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xff);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+
+ left = cvtepu8_epi32(Load4(left_column + 4));
+ left_y = _mm_shuffle_epi32(left, 0);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0x55);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xaa);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xff);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+
+ left = cvtepu8_epi32(Load4(left_column + 8));
+ left_y = _mm_shuffle_epi32(left, 0);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0x55);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xaa);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xff);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+
+ left = cvtepu8_epi32(Load4(left_column + 12));
+ left_y = _mm_shuffle_epi32(left, 0);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0x55);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xaa);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xff);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+}
+
+// In write_smooth_directional_sum8/16: for SMOOTH_H, |pixels| is the left
+// value repeated across the row. For SMOOTH_V, |pixels| is a segment of the
+// top row (or the whole top row) and |weights| is the repeated row weight.
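+// The _mm_shuffle_epi8 masks used below (0x01000100, 0x03020302, ...)
+// broadcast 16-bit element y of |left| so that a single call writes one row.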
+void aom_smooth_h_predictor_8x4_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[7]);
+ const __m128i left = cvtepu8_epi16(Load4(left_column));
+ const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i y_select = _mm_set1_epi32(0x01000100);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x03020302);
+ left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x05040504);
+ left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x07060706);
+ left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+}
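+
+// For reference, a minimal scalar sketch of the SMOOTH_H computation that the
+// write_smooth_directional_sum* helpers vectorize. It relies on the
+// smooth_weights[] table and SMOOTH_WEIGHT_LOG2_SCALE used throughout this
+// file; the helper itself is illustrative and is not called by the SIMD paths.
+static INLINE void smooth_h_scalar_sketch(uint8_t *dst, ptrdiff_t stride,
+                                          const uint8_t *top_row,
+                                          const uint8_t *left_column,
+                                          int width, int height) {
+  // Weights for a block of width w start at smooth_weights[w - 4].
+  const uint8_t *const weights = smooth_weights + width - 4;
+  const int scale = 1 << SMOOTH_WEIGHT_LOG2_SCALE;
+  const int top_right = top_row[width - 1];
+  for (int y = 0; y < height; ++y) {
+    const int left = left_column[y];
+    for (int x = 0; x < width; ++x) {
+      // Blend the row's left value with the block's top-right value.
+      const int pred = weights[x] * left + (scale - weights[x]) * top_right;
+      dst[x] = (uint8_t)((pred + (scale >> 1)) >> SMOOTH_WEIGHT_LOG2_SCALE);
+    }
+    dst += stride;
+  }
+}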
+
+void aom_smooth_h_predictor_8x8_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[7]);
+ const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_h_predictor_8x16_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[7]);
+ const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i left = cvtepu8_epi16(LoadLo8(left_column));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_h_predictor_8x32_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[7]);
+ const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i left = cvtepu8_epi16(LoadLo8(left_column));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 16));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 24));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_h_predictor_16x4_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[15]);
+ const __m128i left = cvtepu8_epi16(Load4(left_column));
+ const __m128i weights = LoadUnaligned16(smooth_weights + 12);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i y_mask = _mm_set1_epi32(0x01000100);
+ __m128i left_y = _mm_shuffle_epi8(left, y_mask);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ dst += stride;
+ y_mask = _mm_set1_epi32(0x03020302);
+ left_y = _mm_shuffle_epi8(left, y_mask);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ dst += stride;
+ y_mask = _mm_set1_epi32(0x05040504);
+ left_y = _mm_shuffle_epi8(left, y_mask);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ dst += stride;
+ y_mask = _mm_set1_epi32(0x07060706);
+ left_y = _mm_shuffle_epi8(left, y_mask);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+}
+
+void aom_smooth_h_predictor_16x8_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[15]);
+ const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights = LoadUnaligned16(smooth_weights + 12);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_h_predictor_16x16_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[15]);
+ const __m128i weights = LoadUnaligned16(smooth_weights + 12);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i left = cvtepu8_epi16(LoadLo8(left_column));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_h_predictor_16x32_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[15]);
+ const __m128i weights = LoadUnaligned16(smooth_weights + 12);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i left = cvtepu8_epi16(LoadLo8(left_column));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 16));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 24));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_h_predictor_16x64_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[15]);
+ const __m128i weights = LoadUnaligned16(smooth_weights + 12);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int left_offset = 0; left_offset < 64; left_offset += 8) {
+ const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2,
+ round);
+ dst += stride;
+ }
+ }
+}
+
+void aom_smooth_h_predictor_32x8_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[31]);
+ const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
+ const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+ const __m128i weights3 = cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_h_predictor_32x16_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[31]);
+ const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
+ const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+ const __m128i weights3 = cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left1, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ dst += stride;
+ }
+  const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left2, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_h_predictor_32x32_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[31]);
+ const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
+ const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+ const __m128i weights3 = cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i left = cvtepu8_epi16(LoadLo8(left_column));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 16));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 24));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_h_predictor_32x64_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[31]);
+ const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
+ const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+ const __m128i weights3 = cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int left_offset = 0; left_offset < 64; left_offset += 8) {
+ const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2,
+ round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3,
+ weights4, scaled_top_right3,
+ scaled_top_right4, round);
+ dst += stride;
+ }
+ }
+}
+
+void aom_smooth_h_predictor_64x16_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[63]);
+ const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
+ const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights_lolo);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
+ const __m128i weights3 = cvtepu8_epi16(weights_lohi);
+ const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
+ const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
+ const __m128i weights5 = cvtepu8_epi16(weights_hilo);
+ const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
+ const __m128i weights7 = cvtepu8_epi16(weights_hihi);
+ const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
+ const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
+ const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
+ const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
+ const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
+ const __m128i scaled_top_right5 =
+ _mm_mullo_epi16(inverted_weights5, top_right);
+ const __m128i scaled_top_right6 =
+ _mm_mullo_epi16(inverted_weights6, top_right);
+ const __m128i scaled_top_right7 =
+ _mm_mullo_epi16(inverted_weights7, top_right);
+ const __m128i scaled_top_right8 =
+ _mm_mullo_epi16(inverted_weights8, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left1, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, round);
+ write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, round);
+ dst += stride;
+ }
+ const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left2, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, round);
+ write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_h_predictor_64x32_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[63]);
+ const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
+ const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights_lolo);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
+ const __m128i weights3 = cvtepu8_epi16(weights_lohi);
+ const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
+ const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
+ const __m128i weights5 = cvtepu8_epi16(weights_hilo);
+ const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
+ const __m128i weights7 = cvtepu8_epi16(weights_hihi);
+ const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
+ const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
+ const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
+ const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
+ const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
+ const __m128i scaled_top_right5 =
+ _mm_mullo_epi16(inverted_weights5, top_right);
+ const __m128i scaled_top_right6 =
+ _mm_mullo_epi16(inverted_weights6, top_right);
+ const __m128i scaled_top_right7 =
+ _mm_mullo_epi16(inverted_weights7, top_right);
+ const __m128i scaled_top_right8 =
+ _mm_mullo_epi16(inverted_weights8, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left1, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, round);
+ write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, round);
+ dst += stride;
+ }
+ const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left2, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, round);
+ write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, round);
+ dst += stride;
+ }
+ const __m128i left3 = cvtepu8_epi16(LoadLo8(left_column + 16));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left3, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, round);
+ write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, round);
+ dst += stride;
+ }
+ const __m128i left4 = cvtepu8_epi16(LoadLo8(left_column + 24));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left4, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, round);
+ write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_h_predictor_64x64_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[63]);
+ const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
+ const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights_lolo);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
+ const __m128i weights3 = cvtepu8_epi16(weights_lohi);
+ const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
+ const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
+ const __m128i weights5 = cvtepu8_epi16(weights_hilo);
+ const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
+ const __m128i weights7 = cvtepu8_epi16(weights_hihi);
+ const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
+ const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
+ const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
+ const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
+ const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
+ const __m128i scaled_top_right5 =
+ _mm_mullo_epi16(inverted_weights5, top_right);
+ const __m128i scaled_top_right6 =
+ _mm_mullo_epi16(inverted_weights6, top_right);
+ const __m128i scaled_top_right7 =
+ _mm_mullo_epi16(inverted_weights7, top_right);
+ const __m128i scaled_top_right8 =
+ _mm_mullo_epi16(inverted_weights8, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int left_offset = 0; left_offset < 64; left_offset += 8) {
+ const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2,
+ round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3,
+ weights4, scaled_top_right3,
+ scaled_top_right4, round);
+ write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5,
+ weights6, scaled_top_right5,
+ scaled_top_right6, round);
+ write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7,
+ weights8, scaled_top_right7,
+ scaled_top_right8, round);
+ dst += stride;
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_utils.h b/third_party/aom/aom_dsp/x86/intrapred_utils.h
new file mode 100644
index 0000000000..502574673e
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/intrapred_utils.h
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_
+#define AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_
+
+#include <emmintrin.h> // SSE2
+#include "aom/aom_integer.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+static DECLARE_ALIGNED(16, uint8_t, EvenOddMaskx[8][16]) = {
+ { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
+ { 0, 1, 3, 5, 7, 9, 11, 13, 0, 2, 4, 6, 8, 10, 12, 14 },
+ { 0, 0, 2, 4, 6, 8, 10, 12, 0, 0, 3, 5, 7, 9, 11, 13 },
+ { 0, 0, 0, 3, 5, 7, 9, 11, 0, 0, 0, 4, 6, 8, 10, 12 },
+ { 0, 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 0, 5, 7, 9, 11 },
+ { 0, 0, 0, 0, 0, 5, 7, 9, 0, 0, 0, 0, 0, 6, 8, 10 },
+ { 0, 0, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 0, 7, 9 },
+ { 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8 }
+};
+
+static DECLARE_ALIGNED(16, uint8_t, LoadMaskx[16][16]) = {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
+ { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
+ { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 },
+ { 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
+ { 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 },
+ { 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+};
+
+static DECLARE_ALIGNED(32, int, LoadMaskz2[8][8]) = {
+ { -1, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, 0, 0, 0, 0, 0, 0 },
+ { -1, -1, -1, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, 0, 0, 0, 0 },
+ { -1, -1, -1, -1, -1, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, 0, 0 },
+ { -1, -1, -1, -1, -1, -1, -1, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1 },
+};
+
+static INLINE void transpose4x16_sse2(__m128i *x, __m128i *d) {
+ __m128i w0, w1, w2, w3, ww0, ww1, ww2, ww3;
+ w0 = _mm_unpacklo_epi8(x[0], x[1]);
+ w1 = _mm_unpacklo_epi8(x[2], x[3]);
+ w2 = _mm_unpackhi_epi8(x[0], x[1]);
+ w3 = _mm_unpackhi_epi8(x[2], x[3]);
+
+ ww0 = _mm_unpacklo_epi16(w0, w1);
+ ww1 = _mm_unpacklo_epi16(w2, w3);
+ ww2 = _mm_unpackhi_epi16(w0, w1);
+ ww3 = _mm_unpackhi_epi16(w2, w3);
+
+ w0 = _mm_unpacklo_epi32(ww0, ww1);
+ w2 = _mm_unpacklo_epi32(ww2, ww3);
+ w1 = _mm_unpackhi_epi32(ww0, ww1);
+ w3 = _mm_unpackhi_epi32(ww2, ww3);
+
+ d[0] = _mm_unpacklo_epi64(w0, w2);
+ d[1] = _mm_unpackhi_epi64(w0, w2);
+ d[2] = _mm_unpacklo_epi64(w1, w3);
+ d[3] = _mm_unpackhi_epi64(w1, w3);
+
+ d[4] = _mm_srli_si128(d[0], 8);
+ d[5] = _mm_srli_si128(d[1], 8);
+ d[6] = _mm_srli_si128(d[2], 8);
+ d[7] = _mm_srli_si128(d[3], 8);
+
+ d[8] = _mm_srli_si128(d[0], 4);
+ d[9] = _mm_srli_si128(d[1], 4);
+ d[10] = _mm_srli_si128(d[2], 4);
+ d[11] = _mm_srli_si128(d[3], 4);
+
+ d[12] = _mm_srli_si128(d[0], 12);
+ d[13] = _mm_srli_si128(d[1], 12);
+ d[14] = _mm_srli_si128(d[2], 12);
+ d[15] = _mm_srli_si128(d[3], 12);
+}
+
+static INLINE void transpose16x16_sse2(__m128i *x, __m128i *d) {
+ __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
+ __m128i w10, w11, w12, w13, w14, w15;
+
+ w0 = _mm_unpacklo_epi8(x[0], x[1]);
+ w1 = _mm_unpacklo_epi8(x[2], x[3]);
+ w2 = _mm_unpacklo_epi8(x[4], x[5]);
+ w3 = _mm_unpacklo_epi8(x[6], x[7]);
+
+ w8 = _mm_unpacklo_epi8(x[8], x[9]);
+ w9 = _mm_unpacklo_epi8(x[10], x[11]);
+ w10 = _mm_unpacklo_epi8(x[12], x[13]);
+ w11 = _mm_unpacklo_epi8(x[14], x[15]);
+
+ w4 = _mm_unpacklo_epi16(w0, w1);
+ w5 = _mm_unpacklo_epi16(w2, w3);
+ w12 = _mm_unpacklo_epi16(w8, w9);
+ w13 = _mm_unpacklo_epi16(w10, w11);
+
+ w6 = _mm_unpacklo_epi32(w4, w5);
+ w7 = _mm_unpackhi_epi32(w4, w5);
+ w14 = _mm_unpacklo_epi32(w12, w13);
+ w15 = _mm_unpackhi_epi32(w12, w13);
+
+ // Store first 4-line result
+ d[0] = _mm_unpacklo_epi64(w6, w14);
+ d[1] = _mm_unpackhi_epi64(w6, w14);
+ d[2] = _mm_unpacklo_epi64(w7, w15);
+ d[3] = _mm_unpackhi_epi64(w7, w15);
+
+ w4 = _mm_unpackhi_epi16(w0, w1);
+ w5 = _mm_unpackhi_epi16(w2, w3);
+ w12 = _mm_unpackhi_epi16(w8, w9);
+ w13 = _mm_unpackhi_epi16(w10, w11);
+
+ w6 = _mm_unpacklo_epi32(w4, w5);
+ w7 = _mm_unpackhi_epi32(w4, w5);
+ w14 = _mm_unpacklo_epi32(w12, w13);
+ w15 = _mm_unpackhi_epi32(w12, w13);
+
+ // Store second 4-line result
+ d[4] = _mm_unpacklo_epi64(w6, w14);
+ d[5] = _mm_unpackhi_epi64(w6, w14);
+ d[6] = _mm_unpacklo_epi64(w7, w15);
+ d[7] = _mm_unpackhi_epi64(w7, w15);
+
+ // upper half
+ w0 = _mm_unpackhi_epi8(x[0], x[1]);
+ w1 = _mm_unpackhi_epi8(x[2], x[3]);
+ w2 = _mm_unpackhi_epi8(x[4], x[5]);
+ w3 = _mm_unpackhi_epi8(x[6], x[7]);
+
+ w8 = _mm_unpackhi_epi8(x[8], x[9]);
+ w9 = _mm_unpackhi_epi8(x[10], x[11]);
+ w10 = _mm_unpackhi_epi8(x[12], x[13]);
+ w11 = _mm_unpackhi_epi8(x[14], x[15]);
+
+ w4 = _mm_unpacklo_epi16(w0, w1);
+ w5 = _mm_unpacklo_epi16(w2, w3);
+ w12 = _mm_unpacklo_epi16(w8, w9);
+ w13 = _mm_unpacklo_epi16(w10, w11);
+
+ w6 = _mm_unpacklo_epi32(w4, w5);
+ w7 = _mm_unpackhi_epi32(w4, w5);
+ w14 = _mm_unpacklo_epi32(w12, w13);
+ w15 = _mm_unpackhi_epi32(w12, w13);
+
+ // Store first 4-line result
+ d[8] = _mm_unpacklo_epi64(w6, w14);
+ d[9] = _mm_unpackhi_epi64(w6, w14);
+ d[10] = _mm_unpacklo_epi64(w7, w15);
+ d[11] = _mm_unpackhi_epi64(w7, w15);
+
+ w4 = _mm_unpackhi_epi16(w0, w1);
+ w5 = _mm_unpackhi_epi16(w2, w3);
+ w12 = _mm_unpackhi_epi16(w8, w9);
+ w13 = _mm_unpackhi_epi16(w10, w11);
+
+ w6 = _mm_unpacklo_epi32(w4, w5);
+ w7 = _mm_unpackhi_epi32(w4, w5);
+ w14 = _mm_unpacklo_epi32(w12, w13);
+ w15 = _mm_unpackhi_epi32(w12, w13);
+
+ // Store second 4-line result
+ d[12] = _mm_unpacklo_epi64(w6, w14);
+ d[13] = _mm_unpackhi_epi64(w6, w14);
+ d[14] = _mm_unpacklo_epi64(w7, w15);
+ d[15] = _mm_unpackhi_epi64(w7, w15);
+}
+
+static void transpose_TX_16X16(const uint8_t *src, ptrdiff_t pitchSrc,
+ uint8_t *dst, ptrdiff_t pitchDst) {
+ __m128i r[16];
+ __m128i d[16];
+ for (int j = 0; j < 16; j++) {
+ r[j] = _mm_loadu_si128((__m128i *)(src + j * pitchSrc));
+ }
+ transpose16x16_sse2(r, d);
+ for (int j = 0; j < 16; j++) {
+ _mm_storeu_si128((__m128i *)(dst + j * pitchDst), d[j]);
+ }
+}
+
+static void transpose(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst,
+ ptrdiff_t pitchDst, int width, int height) {
+ for (int j = 0; j < height; j += 16)
+ for (int i = 0; i < width; i += 16)
+ transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
+ dst + j * pitchDst + i, pitchDst);
+}
+
+#endif // AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_
diff --git a/third_party/aom/aom_dsp/x86/intrapred_x86.h b/third_party/aom/aom_dsp/x86/intrapred_x86.h
new file mode 100644
index 0000000000..b13f575a76
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/intrapred_x86.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_INTRAPRED_X86_H_
+#define AOM_AOM_DSP_X86_INTRAPRED_X86_H_
+
+#include <emmintrin.h> // SSE2
+#include "aom/aom_integer.h"
+#include "config/aom_config.h"
+
+static INLINE __m128i dc_sum_16_sse2(const uint8_t *ref) {
+ __m128i x = _mm_load_si128((__m128i const *)ref);
+ const __m128i zero = _mm_setzero_si128();
+ x = _mm_sad_epu8(x, zero);
+ const __m128i high = _mm_unpackhi_epi64(x, x);
+ return _mm_add_epi16(x, high);
+}
+
+static INLINE __m128i dc_sum_32_sse2(const uint8_t *ref) {
+ __m128i x0 = _mm_load_si128((__m128i const *)ref);
+ __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
+ const __m128i zero = _mm_setzero_si128();
+ x0 = _mm_sad_epu8(x0, zero);
+ x1 = _mm_sad_epu8(x1, zero);
+ x0 = _mm_add_epi16(x0, x1);
+ const __m128i high = _mm_unpackhi_epi64(x0, x0);
+ return _mm_add_epi16(x0, high);
+}
+
+#endif // AOM_AOM_DSP_X86_INTRAPRED_X86_H_
diff --git a/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm b/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm
new file mode 100644
index 0000000000..0bc841a7a4
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm
@@ -0,0 +1,107 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro REORDER_INPUTS 0
+ ; a c d b to a b c d
+ SWAP 1, 3, 2
+%endmacro
+
+%macro TRANSFORM_COLS 0
+ ; input:
+ ; m0 a
+ ; m1 b
+ ; m2 c
+ ; m3 d
+ paddw m0, m2
+ psubw m3, m1
+
+ ; wide subtract
+ punpcklwd m4, m0
+ punpcklwd m5, m3
+ psrad m4, 16
+ psrad m5, 16
+ psubd m4, m5
+ psrad m4, 1
+ packssdw m4, m4 ; e
+
+ psubw m5, m4, m1 ; b
+ psubw m4, m2 ; c
+ psubw m0, m5
+ paddw m3, m4
+ ; m0 a
+ SWAP 1, 5 ; m1 b
+ SWAP 2, 4 ; m2 c
+ ; m3 d
+%endmacro
+
+%macro TRANSPOSE_4X4 0
+ punpcklwd m0, m2
+ punpcklwd m1, m3
+ mova m2, m0
+ punpcklwd m0, m1
+ punpckhwd m2, m1
+ pshufd m1, m0, 0x0e
+ pshufd m3, m2, 0x0e
+%endmacro
+
+; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3
+%macro TRANSPOSE_4X4_WIDE 0
+ mova m3, m0
+ punpcklwd m0, m1
+ punpckhwd m3, m1
+ mova m2, m0
+ punpcklwd m0, m3
+ punpckhwd m2, m3
+ pshufd m1, m0, 0x0e
+ pshufd m3, m2, 0x0e
+%endmacro
+
+%macro ADD_STORE_4P_2X 5 ; src1, src2, tmp1, tmp2, zero
+ movd m%3, [outputq]
+ movd m%4, [outputq + strideq]
+ punpcklbw m%3, m%5
+ punpcklbw m%4, m%5
+ paddw m%1, m%3
+ paddw m%2, m%4
+ packuswb m%1, m%5
+ packuswb m%2, m%5
+ movd [outputq], m%1
+ movd [outputq + strideq], m%2
+%endmacro
+
+INIT_XMM sse2
+cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
+ mova m0, [inputq + 0]
+ packssdw m0, [inputq + 16]
+ mova m1, [inputq + 32]
+ packssdw m1, [inputq + 48]
+ psraw m0, 2
+ psraw m1, 2
+
+ TRANSPOSE_4X4_WIDE
+ REORDER_INPUTS
+ TRANSFORM_COLS
+ TRANSPOSE_4X4
+ REORDER_INPUTS
+ TRANSFORM_COLS
+
+ pxor m4, m4
+ ADD_STORE_4P_2X 0, 1, 5, 6, 4
+ lea outputq, [outputq + 2 * strideq]
+ ADD_STORE_4P_2X 2, 3, 5, 6, 4
+
+ RET
diff --git a/third_party/aom/aom_dsp/x86/jnt_sad_sse2.c b/third_party/aom/aom_dsp/x86/jnt_sad_sse2.c
new file mode 100644
index 0000000000..16d2f4be7f
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/jnt_sad_sse2.c
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+static unsigned int sad4xh_sse2(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, int width,
+ int height) {
+ int i;
+ assert(width == 4);
+ (void)width;
+
+ __m128i sad = _mm_setzero_si128();
+ for (i = 0; i < height; i += 4) {
+ __m128i x0 = xx_loadl_32(a + 0 * a_stride);
+ __m128i x1 = xx_loadl_32(a + 1 * a_stride);
+ __m128i x2 = xx_loadl_32(a + 2 * a_stride);
+ __m128i x3 = xx_loadl_32(a + 3 * a_stride);
+ __m128i x_lo = _mm_unpacklo_epi32(x0, x1);
+ __m128i x_hi = _mm_unpacklo_epi32(x2, x3);
+
+ __m128i x = _mm_unpacklo_epi64(x_lo, x_hi);
+
+ x0 = xx_loadl_32(b + 0 * b_stride);
+ x1 = xx_loadl_32(b + 1 * b_stride);
+ x2 = xx_loadl_32(b + 2 * b_stride);
+ x3 = xx_loadl_32(b + 3 * b_stride);
+ x_lo = _mm_unpacklo_epi32(x0, x1);
+ x_hi = _mm_unpacklo_epi32(x2, x3);
+
+ __m128i y = _mm_unpacklo_epi64(x_lo, x_hi);
+
+ __m128i sad4x4 = _mm_sad_epu8(x, y);
+ sad = _mm_add_epi32(sad, sad4x4);
+
+ a += 4 * a_stride;
+ b += 4 * b_stride;
+ }
+
+ // At this point, we have two 32-bit partial SADs at bit[0:31] and [64:95].
+ const unsigned int res =
+ (unsigned int)(_mm_cvtsi128_si32(sad) +
+ _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)));
+
+ return res;
+}
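+
+// For reference, a scalar sketch of what every sad<w>xh_sse2 kernel in this
+// file computes: the sum of absolute differences over a width x height block.
+// The SSE2 kernels obtain the same value with _mm_sad_epu8, which leaves one
+// partial sum per 64-bit lane; the two lanes are added at the end. This
+// helper is illustrative and is not used by the SIMD paths.
+static INLINE unsigned int sad_wxh_scalar_sketch(const uint8_t *a,
+                                                 int a_stride,
+                                                 const uint8_t *b,
+                                                 int b_stride, int width,
+                                                 int height) {
+  unsigned int sad = 0;
+  for (int y = 0; y < height; ++y) {
+    for (int x = 0; x < width; ++x) {
+      const int diff = a[x] - b[x];
+      sad += (unsigned int)(diff < 0 ? -diff : diff);
+    }
+    a += a_stride;
+    b += b_stride;
+  }
+  return sad;
+}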
+
+static unsigned int sad8xh_sse2(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, int width,
+ int height) {
+ int i;
+ assert(width == 8);
+ (void)width;
+
+ __m128i sad = _mm_setzero_si128();
+ for (i = 0; i < height; i += 2) {
+ __m128i x0 = xx_loadl_64(a + 0 * a_stride);
+ __m128i x1 = xx_loadl_64(a + 1 * a_stride);
+
+ __m128i x = _mm_unpacklo_epi64(x0, x1);
+
+ x0 = xx_loadl_64(b + 0 * b_stride);
+ x1 = xx_loadl_64(b + 1 * b_stride);
+
+ __m128i y = _mm_unpacklo_epi64(x0, x1);
+
+ __m128i sad8x2 = _mm_sad_epu8(x, y);
+ sad = _mm_add_epi32(sad, sad8x2);
+
+ a += 2 * a_stride;
+ b += 2 * b_stride;
+ }
+
+ const unsigned int res =
+ (unsigned int)(_mm_cvtsi128_si32(sad) +
+ _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)));
+
+ return res;
+}
+
+static unsigned int sad16xh_sse2(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, int width,
+ int height) {
+ int i;
+ assert(width == 16);
+ (void)width;
+
+ __m128i sad = _mm_setzero_si128();
+ for (i = 0; i < height; ++i) {
+ __m128i x = xx_loadu_128(a);
+ __m128i y = xx_loadu_128(b);
+
+ __m128i sad16x1 = _mm_sad_epu8(x, y);
+ sad = _mm_add_epi32(sad, sad16x1);
+
+ a += a_stride;
+ b += b_stride;
+ }
+
+ const unsigned int res =
+ (unsigned int)(_mm_cvtsi128_si32(sad) +
+ _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)));
+
+ return res;
+}
+
+static unsigned int sad32xh_sse2(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, int width,
+ int height) {
+ int i, j;
+ assert(width == 32);
+ (void)width;
+
+ __m128i sad = _mm_setzero_si128();
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < 2; ++j) {
+ __m128i x = xx_loadu_128(a + j * 16);
+ __m128i y = xx_loadu_128(b + j * 16);
+
+ __m128i sad32_half = _mm_sad_epu8(x, y);
+ sad = _mm_add_epi32(sad, sad32_half);
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+
+ const unsigned int res =
+ (unsigned int)(_mm_cvtsi128_si32(sad) +
+ _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)));
+
+ return res;
+}
+
+static unsigned int sad64xh_sse2(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, int width,
+ int height) {
+ int i, j;
+ assert(width == 64);
+ (void)width;
+
+ __m128i sad = _mm_setzero_si128();
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < 4; ++j) {
+ __m128i x = xx_loadu_128(a + j * 16);
+ __m128i y = xx_loadu_128(b + j * 16);
+
+ __m128i sad64_quarter = _mm_sad_epu8(x, y);
+ sad = _mm_add_epi32(sad, sad64_quarter);
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+
+ const unsigned int res =
+ (unsigned int)(_mm_cvtsi128_si32(sad) +
+ _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)));
+
+ return res;
+}
+
+static unsigned int sad128xh_sse2(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, int width,
+ int height) {
+ int i, j;
+ assert(width == 128);
+ (void)width;
+
+ __m128i sad = _mm_setzero_si128();
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < 8; ++j) {
+ __m128i x = xx_loadu_128(a + j * 16);
+ __m128i y = xx_loadu_128(b + j * 16);
+
+      __m128i sad128_eighth = _mm_sad_epu8(x, y);
+      sad = _mm_add_epi32(sad, sad128_eighth);
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+
+ const unsigned int res =
+ (unsigned int)(_mm_cvtsi128_si32(sad) +
+ _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)));
+
+ return res;
+}
+
+#define DIST_WTD_SADMXN_SSE2(m, n) \
+ unsigned int aom_dist_wtd_sad##m##x##n##_avg_sse2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint8_t comp_pred[m * n]; \
+ aom_dist_wtd_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \
+ jcp_param); \
+ return sad##m##xh_sse2(src, src_stride, comp_pred, m, m, n); \
+ }
+
+DIST_WTD_SADMXN_SSE2(128, 128)
+DIST_WTD_SADMXN_SSE2(128, 64)
+DIST_WTD_SADMXN_SSE2(64, 128)
+DIST_WTD_SADMXN_SSE2(64, 64)
+DIST_WTD_SADMXN_SSE2(64, 32)
+DIST_WTD_SADMXN_SSE2(32, 64)
+DIST_WTD_SADMXN_SSE2(32, 32)
+DIST_WTD_SADMXN_SSE2(32, 16)
+DIST_WTD_SADMXN_SSE2(16, 32)
+DIST_WTD_SADMXN_SSE2(16, 16)
+DIST_WTD_SADMXN_SSE2(16, 8)
+DIST_WTD_SADMXN_SSE2(8, 16)
+DIST_WTD_SADMXN_SSE2(8, 8)
+DIST_WTD_SADMXN_SSE2(8, 4)
+DIST_WTD_SADMXN_SSE2(4, 8)
+DIST_WTD_SADMXN_SSE2(4, 4)
+#if !CONFIG_REALTIME_ONLY
+DIST_WTD_SADMXN_SSE2(4, 16)
+DIST_WTD_SADMXN_SSE2(16, 4)
+DIST_WTD_SADMXN_SSE2(8, 32)
+DIST_WTD_SADMXN_SSE2(32, 8)
+DIST_WTD_SADMXN_SSE2(16, 64)
+DIST_WTD_SADMXN_SSE2(64, 16)
+#endif
diff --git a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
new file mode 100644
index 0000000000..dd798ca54a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+void aom_var_filter_block2d_bil_first_pass_ssse3(
+ const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
+ unsigned int pixel_step, unsigned int output_height,
+ unsigned int output_width, const uint8_t *filter);
+
+void aom_var_filter_block2d_bil_second_pass_ssse3(
+ const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
+ unsigned int pixel_step, unsigned int output_height,
+ unsigned int output_width, const uint8_t *filter);
+
+static INLINE void compute_dist_wtd_avg(__m128i *p0, __m128i *p1,
+ const __m128i *w, const __m128i *r,
+ void *const result) {
+ __m128i p_lo = _mm_unpacklo_epi8(*p0, *p1);
+ __m128i mult_lo = _mm_maddubs_epi16(p_lo, *w);
+ __m128i round_lo = _mm_add_epi16(mult_lo, *r);
+ __m128i shift_lo = _mm_srai_epi16(round_lo, DIST_PRECISION_BITS);
+
+ __m128i p_hi = _mm_unpackhi_epi8(*p0, *p1);
+ __m128i mult_hi = _mm_maddubs_epi16(p_hi, *w);
+ __m128i round_hi = _mm_add_epi16(mult_hi, *r);
+ __m128i shift_hi = _mm_srai_epi16(round_hi, DIST_PRECISION_BITS);
+
+ xx_storeu_128(result, _mm_packus_epi16(shift_lo, shift_hi));
+}
+
+void aom_dist_wtd_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, const uint8_t *ref,
+ int ref_stride,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ int i;
+ const int8_t w0 = (int8_t)jcp_param->fwd_offset;
+ const int8_t w1 = (int8_t)jcp_param->bck_offset;
+ const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
+ w1, w0, w1, w0);
+ const int16_t round = (int16_t)((1 << DIST_PRECISION_BITS) >> 1);
+ const __m128i r = _mm_set1_epi16(round);
+
+ if (width >= 16) {
+ // Read 16 pixels one row at a time
+ assert(!(width & 15));
+ for (i = 0; i < height; ++i) {
+ int j;
+ for (j = 0; j < width; j += 16) {
+ __m128i p0 = xx_loadu_128(ref);
+ __m128i p1 = xx_loadu_128(pred);
+
+ compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred);
+
+ comp_pred += 16;
+ pred += 16;
+ ref += 16;
+ }
+ ref += ref_stride - width;
+ }
+ } else if (width >= 8) {
+    // Read 8 pixels, two rows at a time
+ assert(!(width & 7));
+ assert(!(width & 1));
+ for (i = 0; i < height; i += 2) {
+ __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride);
+ __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride);
+ __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1);
+ __m128i p1 = xx_loadu_128(pred);
+
+ compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred);
+
+ comp_pred += 16;
+ pred += 16;
+ ref += 2 * ref_stride;
+ }
+ } else {
+    // Read 4 pixels, four rows at a time
+ assert(!(width & 3));
+ assert(!(height & 3));
+ for (i = 0; i < height; i += 4) {
+ const int8_t *row0 = (const int8_t *)ref + 0 * ref_stride;
+ const int8_t *row1 = (const int8_t *)ref + 1 * ref_stride;
+ const int8_t *row2 = (const int8_t *)ref + 2 * ref_stride;
+ const int8_t *row3 = (const int8_t *)ref + 3 * ref_stride;
+
+ __m128i p0 =
+ _mm_setr_epi8(row0[0], row0[1], row0[2], row0[3], row1[0], row1[1],
+ row1[2], row1[3], row2[0], row2[1], row2[2], row2[3],
+ row3[0], row3[1], row3[2], row3[3]);
+ __m128i p1 = xx_loadu_128(pred);
+
+ compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred);
+
+ comp_pred += 16;
+ pred += 16;
+ ref += 4 * ref_stride;
+ }
+ }
+}
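
For reference, the per-pixel arithmetic that compute_dist_wtd_avg() vectorizes with the maddubs/add/srai/packus chain reduces to a weighted average followed by a saturating clamp. The sketch below is illustrative only: the sketch_ names are not libaom API, and the value of DIST_PRECISION_BITS (defined elsewhere in libaom) is assumed to be 4 here.

#include <stdint.h>

#define SKETCH_DIST_PRECISION_BITS 4 /* assumed; the real macro lives elsewhere */

/* Distance-weighted average of one reference and one predicted pixel. */
static uint8_t sketch_dist_wtd_avg(uint8_t ref, uint8_t pred, int fwd_offset,
                                   int bck_offset) {
  const int rounding = (1 << SKETCH_DIST_PRECISION_BITS) >> 1;
  int v = (ref * fwd_offset + pred * bck_offset + rounding) >>
          SKETCH_DIST_PRECISION_BITS;
  if (v < 0) v = 0; /* packus saturates the packed result to [0, 255] */
  if (v > 255) v = 255;
  return (uint8_t)v;
}
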
+
+#define DIST_WTD_SUBPIX_AVG_VAR(W, H) \
+ uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_ssse3( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
+ \
+ aom_var_filter_block2d_bil_first_pass_ssse3( \
+ a, fdata3, a_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_var_filter_block2d_bil_second_pass_ssse3( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_dist_wtd_comp_avg_pred_ssse3(temp3, second_pred, W, H, temp2, W, \
+ jcp_param); \
+ \
+ return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \
+ }
+
+DIST_WTD_SUBPIX_AVG_VAR(128, 128)
+DIST_WTD_SUBPIX_AVG_VAR(128, 64)
+DIST_WTD_SUBPIX_AVG_VAR(64, 128)
+DIST_WTD_SUBPIX_AVG_VAR(64, 64)
+DIST_WTD_SUBPIX_AVG_VAR(64, 32)
+DIST_WTD_SUBPIX_AVG_VAR(32, 64)
+DIST_WTD_SUBPIX_AVG_VAR(32, 32)
+DIST_WTD_SUBPIX_AVG_VAR(32, 16)
+DIST_WTD_SUBPIX_AVG_VAR(16, 32)
+DIST_WTD_SUBPIX_AVG_VAR(16, 16)
+DIST_WTD_SUBPIX_AVG_VAR(16, 8)
+DIST_WTD_SUBPIX_AVG_VAR(8, 16)
+DIST_WTD_SUBPIX_AVG_VAR(8, 8)
+DIST_WTD_SUBPIX_AVG_VAR(8, 4)
+DIST_WTD_SUBPIX_AVG_VAR(4, 8)
+DIST_WTD_SUBPIX_AVG_VAR(4, 4)
+
+#if !CONFIG_REALTIME_ONLY
+DIST_WTD_SUBPIX_AVG_VAR(4, 16)
+DIST_WTD_SUBPIX_AVG_VAR(16, 4)
+DIST_WTD_SUBPIX_AVG_VAR(8, 32)
+DIST_WTD_SUBPIX_AVG_VAR(32, 8)
+DIST_WTD_SUBPIX_AVG_VAR(16, 64)
+DIST_WTD_SUBPIX_AVG_VAR(64, 16)
+#endif
diff --git a/third_party/aom/aom_dsp/x86/loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/loopfilter_avx2.c
new file mode 100644
index 0000000000..6e77742e3c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/loopfilter_avx2.c
@@ -0,0 +1,1016 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> /* AVX2 */
+
+#include "config/aom_dsp_rtcd.h"
+
+DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
+ 0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128,
+ 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
+};
+
+void aom_lpf_horizontal_6_quad_avx2(unsigned char *s, int p,
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0) {
+ __m256i p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;
+ __m128i p2, p1, p0, q0, q1, q2;
+ __m128i mask, flat;
+
+ const __m128i thresh_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh0[0]));
+ const __m128i limit_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit0[0]));
+ const __m128i blimit_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit0[0]));
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+
+ p256_2 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
+ p256_1 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
+ p256_0 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
+ q256_0 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
+ q256_1 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
+ q256_2 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
+
+ p2 = _mm256_castsi256_si128(p256_2);
+ p1 = _mm256_castsi256_si128(p256_1);
+ p0 = _mm256_castsi256_si128(p256_0);
+ q0 = _mm256_castsi256_si128(q256_0);
+ q1 = _mm256_castsi256_si128(q256_1);
+ q2 = _mm256_castsi256_si128(q256_2);
+
+ {
+ __m128i work;
+ const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+ const __m128i abs_p1p0 =
+ _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
+ const __m128i abs_q1q0 =
+ _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
+ __m128i abs_p0q0 =
+ _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
+ __m128i abs_p1q1 =
+ _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(flat, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+ _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit_v);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
+
+ // loop filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i hev;
+
+ hev = _mm_subs_epu8(flat, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ __m128i ps1 = _mm_xor_si128(p1, t80);
+ __m128i ps0 = _mm_xor_si128(p0, t80);
+ __m128i qs0 = _mm_xor_si128(q0, t80);
+ __m128i qs1 = _mm_xor_si128(q1, t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+ __m128i flat_p1, flat_p0, flat_q0, flat_q1;
+
+ filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+ qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+
+ __m128i work;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
+ _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
+ flat = _mm_max_epu8(work, flat);
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ const __m256i four = _mm256_set1_epi16(4);
+ __m256i pixetFilter, add, res;
+
+ const __m256i filter =
+ _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);
+
+ p256_2 = _mm256_shuffle_epi8(p256_2, filter);
+ p256_1 = _mm256_shuffle_epi8(p256_1, filter);
+ p256_0 = _mm256_shuffle_epi8(p256_0, filter);
+ q256_0 = _mm256_shuffle_epi8(q256_0, filter);
+ q256_1 = _mm256_shuffle_epi8(q256_1, filter);
+ q256_2 = _mm256_shuffle_epi8(q256_2, filter);
+
+ pixetFilter = _mm256_slli_epi16(
+ _mm256_add_epi16(p256_2, _mm256_add_epi16(p256_1, p256_0)), 1);
+ pixetFilter =
+ _mm256_add_epi16(pixetFilter, _mm256_add_epi16(p256_2, q256_0));
+ pixetFilter = _mm256_add_epi16(four, pixetFilter);
+ res = _mm256_srli_epi16(pixetFilter, 3);
+ flat_p1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res, res), 168));
+ p1 = _mm_andnot_si128(flat, ps1);
+ flat_p1 = _mm_and_si128(flat, flat_p1);
+ p1 = _mm_or_si128(flat_p1, p1);
+
+ add = _mm256_add_epi16(_mm256_sub_epi16(q256_1, p256_2),
+ _mm256_sub_epi16(q256_0, p256_2));
+ pixetFilter = _mm256_add_epi16(pixetFilter, add);
+ res = _mm256_srli_epi16(pixetFilter, 3);
+ flat_p0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res, res), 168));
+ p0 = _mm_andnot_si128(flat, ps0);
+ flat_p0 = _mm_and_si128(flat, flat_p0);
+ p0 = _mm_or_si128(flat_p0, p0);
+
+ add = _mm256_add_epi16(_mm256_sub_epi16(q256_2, p256_2),
+ _mm256_sub_epi16(q256_1, p256_1));
+ pixetFilter = _mm256_add_epi16(pixetFilter, add);
+ res = _mm256_srli_epi16(pixetFilter, 3);
+ flat_q0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res, res), 168));
+ q0 = _mm_andnot_si128(flat, qs0);
+ flat_q0 = _mm_and_si128(flat, flat_q0);
+ q0 = _mm_or_si128(flat_q0, q0);
+
+ add = _mm256_add_epi16(_mm256_sub_epi16(q256_2, p256_1),
+ _mm256_sub_epi16(q256_2, p256_0));
+ pixetFilter = _mm256_add_epi16(pixetFilter, add);
+ res = _mm256_srli_epi16(pixetFilter, 3);
+ flat_q1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res, res), 168));
+ q1 = _mm_andnot_si128(flat, qs1);
+ flat_q1 = _mm_and_si128(flat, flat_q1);
+ q1 = _mm_or_si128(flat_q1, q1);
+
+ _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+ _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+ } else {
+ _mm_storeu_si128((__m128i *)(s - 2 * p), ps1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), ps0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), qs0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), qs1);
+ }
+ }
+}
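
The mask built above follows the scalar decision quoted in the inline comments: a column is filtered only when the blimit and limit tests all pass. A per-column sketch for this 6-tap variant, ignoring the saturating arithmetic used by the SIMD path (the helper name is illustrative, not libaom API):

#include <stdint.h>
#include <stdlib.h>

/* Returns nonzero when the column passes every edge/limit test above. */
static int sketch_filter6_mask(uint8_t blimit, uint8_t limit, uint8_t p2,
                               uint8_t p1, uint8_t p0, uint8_t q0, uint8_t q1,
                               uint8_t q2) {
  int skip = abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit;
  skip |= abs(p1 - p0) > limit;
  skip |= abs(q1 - q0) > limit;
  skip |= abs(p2 - p1) > limit;
  skip |= abs(q2 - q1) > limit;
  return !skip;
}
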
+
+void aom_lpf_horizontal_8_quad_avx2(unsigned char *s, int p,
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0) {
+ __m256i p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;
+ __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+ __m128i mask, flat;
+
+ const __m128i thresh_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh0[0]));
+ const __m128i limit_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit0[0]));
+ const __m128i blimit_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit0[0]));
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+
+ p256_3 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p)));
+ p256_2 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
+ p256_1 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
+ p256_0 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
+ q256_0 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
+ q256_1 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
+ q256_2 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
+ q256_3 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p)));
+
+ p3 = _mm256_castsi256_si128(p256_3);
+ p2 = _mm256_castsi256_si128(p256_2);
+ p1 = _mm256_castsi256_si128(p256_1);
+ p0 = _mm256_castsi256_si128(p256_0);
+ q0 = _mm256_castsi256_si128(q256_0);
+ q1 = _mm256_castsi256_si128(q256_1);
+ q2 = _mm256_castsi256_si128(q256_2);
+ q3 = _mm256_castsi256_si128(q256_3);
+
+ {
+ __m128i work;
+ const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+ const __m128i abs_p1p0 =
+ _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
+ const __m128i abs_q1q0 =
+ _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
+ __m128i abs_p0q0 =
+ _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
+ __m128i abs_p1q1 =
+ _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(flat, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+ _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
+ _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit_v);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
+
+ // loop filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i hev;
+
+ hev = _mm_subs_epu8(flat, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ __m128i ps1 = _mm_xor_si128(p1, t80);
+ __m128i ps0 = _mm_xor_si128(p0, t80);
+ __m128i qs0 = _mm_xor_si128(q0, t80);
+ __m128i qs1 = _mm_xor_si128(q1, t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+ __m128i flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, flat_q2;
+
+ filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+ qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+
+ __m128i work;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
+ _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
+ _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
+ flat = _mm_max_epu8(work, flat);
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ const __m256i four = _mm256_set1_epi16(4);
+ __m256i pixetFilter_p2p1p0, p2p1p0, q2q1q0, pixetFilter_q2q1q0, sum_p,
+ sum_q, res_p, res_q;
+
+ const __m256i filter =
+ _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);
+
+ p256_3 = _mm256_shuffle_epi8(p256_3, filter);
+ p256_2 = _mm256_shuffle_epi8(p256_2, filter);
+ p256_1 = _mm256_shuffle_epi8(p256_1, filter);
+ p256_0 = _mm256_shuffle_epi8(p256_0, filter);
+ q256_0 = _mm256_shuffle_epi8(q256_0, filter);
+ q256_1 = _mm256_shuffle_epi8(q256_1, filter);
+ q256_2 = _mm256_shuffle_epi8(q256_2, filter);
+ q256_3 = _mm256_shuffle_epi8(q256_3, filter);
+
+ p2p1p0 = _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1));
+ q2q1q0 = _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1));
+
+ pixetFilter_p2p1p0 =
+ _mm256_add_epi16(four, _mm256_add_epi16(p2p1p0, q2q1q0));
+ pixetFilter_q2q1q0 = pixetFilter_p2p1p0;
+
+ pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, p256_3);
+ res_p =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_0), 3);
+ flat_p0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+ p0 = _mm_andnot_si128(flat, ps0);
+ flat_p0 = _mm_and_si128(flat, flat_p0);
+ p0 = _mm_or_si128(flat_p0, p0);
+
+ pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, q256_3);
+ res_q =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_0), 3);
+ flat_q0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+ q0 = _mm_andnot_si128(flat, qs0);
+ flat_q0 = _mm_and_si128(flat, flat_q0);
+ q0 = _mm_or_si128(flat_q0, q0);
+
+ sum_p = _mm256_sub_epi16(p256_3, q256_2);
+ pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p);
+ res_p =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_1), 3);
+ flat_p1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+ p1 = _mm_andnot_si128(flat, ps1);
+ flat_p1 = _mm_and_si128(flat, flat_p1);
+ p1 = _mm_or_si128(flat_p1, p1);
+
+ sum_q = _mm256_sub_epi16(q256_3, p256_2);
+ pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q);
+ res_q =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_1), 3);
+ flat_q1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+ q1 = _mm_andnot_si128(flat, qs1);
+ flat_q1 = _mm_and_si128(flat, flat_q1);
+ q1 = _mm_or_si128(flat_q1, q1);
+
+ sum_p = _mm256_sub_epi16(p256_3, q256_1);
+ pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p);
+ res_p =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_2), 3);
+ flat_p2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+ p2 = _mm_andnot_si128(flat, p2);
+ flat_p2 = _mm_and_si128(flat, flat_p2);
+ p2 = _mm_or_si128(flat_p2, p2);
+
+ sum_q = _mm256_sub_epi16(q256_3, p256_1);
+ pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q);
+ res_q =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_2), 3);
+ flat_q2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+ q2 = _mm_andnot_si128(flat, q2);
+ flat_q2 = _mm_and_si128(flat, flat_q2);
+ q2 = _mm_or_si128(flat_q2, q2);
+
+ _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+ _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+ } else {
+ _mm_storeu_si128((__m128i *)(s - 2 * p), ps1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), ps0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), qs0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), qs1);
+ }
+ }
+}
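
When the flat mask fires in the function above, p2..q2 are replaced by weighted averages over the p3..q3 window (weights summing to 8). Expanding the running pixetFilter sums gives the scalar formulas below; the sketch_ name and the ROUND3 shorthand are illustrative, not libaom identifiers.

#include <stdint.h>

#define ROUND3(x) (((x) + 4) >> 3) /* the add-four / shift-by-3 done above */

/* Flat-path output taps, updated in place. */
static void sketch_filter8_flat(uint8_t p3, uint8_t *p2, uint8_t *p1,
                                uint8_t *p0, uint8_t *q0, uint8_t *q1,
                                uint8_t *q2, uint8_t q3) {
  const int P3 = p3, P2 = *p2, P1 = *p1, P0 = *p0;
  const int Q0 = *q0, Q1 = *q1, Q2 = *q2, Q3 = q3;
  *p2 = (uint8_t)ROUND3(3 * P3 + 2 * P2 + P1 + P0 + Q0);
  *p1 = (uint8_t)ROUND3(2 * P3 + P2 + 2 * P1 + P0 + Q0 + Q1);
  *p0 = (uint8_t)ROUND3(P3 + P2 + P1 + 2 * P0 + Q0 + Q1 + Q2);
  *q0 = (uint8_t)ROUND3(P2 + P1 + P0 + 2 * Q0 + Q1 + Q2 + Q3);
  *q1 = (uint8_t)ROUND3(P1 + P0 + Q0 + 2 * Q1 + Q2 + 2 * Q3);
  *q2 = (uint8_t)ROUND3(P0 + Q0 + Q1 + 2 * Q2 + 3 * Q3);
}
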
+
+static INLINE void trans_store_16x16_lpf_vert14(unsigned char *in0, int in_p,
+ unsigned char *out, int out_p,
+ int is_store_avx2) {
+ const __m128i x0 = _mm_loadu_si128((__m128i *)in0);
+ const __m128i x1 = _mm_loadu_si128((__m128i *)(in0 + in_p * 1));
+ const __m128i x2 = _mm_loadu_si128((__m128i *)(in0 + in_p * 2));
+ const __m128i x3 = _mm_loadu_si128((__m128i *)(in0 + in_p * 3));
+ const __m128i x4 = _mm_loadu_si128((__m128i *)(in0 + in_p * 4));
+ const __m128i x5 = _mm_loadu_si128((__m128i *)(in0 + in_p * 5));
+ const __m128i x6 = _mm_loadu_si128((__m128i *)(in0 + in_p * 6));
+ const __m128i x7 = _mm_loadu_si128((__m128i *)(in0 + in_p * 7));
+
+ const __m256i y0 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(x0), _mm_loadu_si128((__m128i *)(in0 + in_p * 8)),
+ 0x1);
+ const __m256i y1 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(x1), _mm_loadu_si128((__m128i *)(in0 + in_p * 9)),
+ 0x1);
+ const __m256i y2 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(x2), _mm_loadu_si128((__m128i *)(in0 + in_p * 10)),
+ 0x1);
+ const __m256i y3 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(x3), _mm_loadu_si128((__m128i *)(in0 + in_p * 11)),
+ 0x1);
+ const __m256i y4 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(x4), _mm_loadu_si128((__m128i *)(in0 + in_p * 12)),
+ 0x1);
+ const __m256i y5 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(x5), _mm_loadu_si128((__m128i *)(in0 + in_p * 13)),
+ 0x1);
+ const __m256i y6 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(x6), _mm_loadu_si128((__m128i *)(in0 + in_p * 14)),
+ 0x1);
+ const __m256i y7 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(x7), _mm_loadu_si128((__m128i *)(in0 + in_p * 15)),
+ 0x1);
+
+ const __m256i y_s00 = _mm256_unpacklo_epi8(y0, y1);
+ const __m256i y_s01 = _mm256_unpackhi_epi8(y0, y1);
+ const __m256i y_s02 = _mm256_unpacklo_epi8(y2, y3);
+ const __m256i y_s03 = _mm256_unpackhi_epi8(y2, y3);
+ const __m256i y_s04 = _mm256_unpacklo_epi8(y4, y5);
+ const __m256i y_s05 = _mm256_unpackhi_epi8(y4, y5);
+ const __m256i y_s06 = _mm256_unpacklo_epi8(y6, y7);
+ const __m256i y_s07 = _mm256_unpackhi_epi8(y6, y7);
+
+ const __m256i y_s10 = _mm256_unpacklo_epi16(y_s00, y_s02);
+ const __m256i y_s11 = _mm256_unpackhi_epi16(y_s00, y_s02);
+ const __m256i y_s12 = _mm256_unpacklo_epi16(y_s01, y_s03);
+ const __m256i y_s13 = _mm256_unpackhi_epi16(y_s01, y_s03);
+ const __m256i y_s14 = _mm256_unpacklo_epi16(y_s04, y_s06);
+ const __m256i y_s15 = _mm256_unpackhi_epi16(y_s04, y_s06);
+ const __m256i y_s16 = _mm256_unpacklo_epi16(y_s05, y_s07);
+ const __m256i y_s17 = _mm256_unpackhi_epi16(y_s05, y_s07);
+
+ const __m256i y_s20 = _mm256_unpacklo_epi32(y_s10, y_s14);
+ const __m256i y_s21 = _mm256_unpackhi_epi32(y_s10, y_s14);
+ const __m256i y_s22 = _mm256_unpacklo_epi32(y_s11, y_s15);
+ const __m256i y_s23 = _mm256_unpackhi_epi32(y_s11, y_s15);
+ const __m256i y_s24 = _mm256_unpacklo_epi32(y_s12, y_s16);
+ const __m256i y_s25 = _mm256_unpackhi_epi32(y_s12, y_s16);
+ const __m256i y_s26 = _mm256_unpacklo_epi32(y_s13, y_s17);
+ const __m256i y_s27 = _mm256_unpackhi_epi32(y_s13, y_s17);
+
+ const __m256i row_s01 = _mm256_permute4x64_epi64(y_s20, 0xd8);
+ const __m256i row_s23 = _mm256_permute4x64_epi64(y_s21, 0xd8);
+ const __m256i row_s45 = _mm256_permute4x64_epi64(y_s22, 0xd8);
+ const __m256i row_s67 = _mm256_permute4x64_epi64(y_s23, 0xd8);
+ const __m256i row_s89 = _mm256_permute4x64_epi64(y_s24, 0xd8);
+ const __m256i row_s1011 = _mm256_permute4x64_epi64(y_s25, 0xd8);
+ const __m256i row_s1213 = _mm256_permute4x64_epi64(y_s26, 0xd8);
+ const __m256i row_s1415 = _mm256_permute4x64_epi64(y_s27, 0xd8);
+
+ if (is_store_avx2) {
+ _mm256_storeu_si256((__m256i *)(out), row_s01);
+ _mm256_storeu_si256((__m256i *)(out + (2 * out_p)), row_s23);
+ _mm256_storeu_si256((__m256i *)(out + (4 * out_p)), row_s45);
+ _mm256_storeu_si256((__m256i *)(out + (6 * out_p)), row_s67);
+ _mm256_storeu_si256((__m256i *)(out + (8 * out_p)), row_s89);
+ _mm256_storeu_si256((__m256i *)(out + (10 * out_p)), row_s1011);
+ _mm256_storeu_si256((__m256i *)(out + (12 * out_p)), row_s1213);
+ _mm256_storeu_si256((__m256i *)(out + (14 * out_p)), row_s1415);
+ } else {
+ _mm_storeu_si128((__m128i *)(out), _mm256_castsi256_si128(row_s01));
+ _mm_storeu_si128((__m128i *)(out + (2 * out_p)),
+ _mm256_castsi256_si128(row_s23));
+ _mm_storeu_si128((__m128i *)(out + (4 * out_p)),
+ _mm256_castsi256_si128(row_s45));
+ _mm_storeu_si128((__m128i *)(out + (6 * out_p)),
+ _mm256_castsi256_si128(row_s67));
+ _mm_storeu_si128((__m128i *)(out + (8 * out_p)),
+ _mm256_castsi256_si128(row_s89));
+ _mm_storeu_si128((__m128i *)(out + (10 * out_p)),
+ _mm256_castsi256_si128(row_s1011));
+ _mm_storeu_si128((__m128i *)(out + (12 * out_p)),
+ _mm256_castsi256_si128(row_s1213));
+ _mm_storeu_si128((__m128i *)(out + (14 * out_p)),
+ _mm256_castsi256_si128(row_s1415));
+ _mm_storeu_si128((__m128i *)(out + (1 * out_p)),
+ _mm256_extracti128_si256(row_s01, 1));
+ _mm_storeu_si128((__m128i *)(out + (3 * out_p)),
+ _mm256_extracti128_si256(row_s23, 1));
+ _mm_storeu_si128((__m128i *)(out + (5 * out_p)),
+ _mm256_extracti128_si256(row_s45, 1));
+ _mm_storeu_si128((__m128i *)(out + (7 * out_p)),
+ _mm256_extracti128_si256(row_s67, 1));
+ _mm_storeu_si128((__m128i *)(out + (9 * out_p)),
+ _mm256_extracti128_si256(row_s89, 1));
+ _mm_storeu_si128((__m128i *)(out + (11 * out_p)),
+ _mm256_extracti128_si256(row_s1011, 1));
+ _mm_storeu_si128((__m128i *)(out + (13 * out_p)),
+ _mm256_extracti128_si256(row_s1213, 1));
+ _mm_storeu_si128((__m128i *)(out + (15 * out_p)),
+ _mm256_extracti128_si256(row_s1415, 1));
+ }
+}
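
trans_store_16x16_lpf_vert14() is a 16x16 byte transpose so that vertical edges can be run through the horizontal filter. Ignoring the interleave ordering of the intrinsics, the data movement is just the following scalar loop (hypothetical helper name):

#include <stdint.h>

/* out[j][i] = in[i][j] for a 16x16 block of bytes. */
static void sketch_transpose_16x16(const uint8_t *in, int in_stride,
                                   uint8_t *out, int out_stride) {
  for (int i = 0; i < 16; ++i) {
    for (int j = 0; j < 16; ++j) {
      out[j * out_stride + i] = in[i * in_stride + j];
    }
  }
}
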
+
+void aom_lpf_horizontal_14_quad_avx2(unsigned char *s, int p,
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0) {
+ __m128i mask, flat;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+
+ __m256i p256_3 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p)));
+ __m256i p256_2 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
+ __m256i p256_1 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
+ __m256i p256_0 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
+ __m256i q256_0 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
+ __m256i q256_1 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
+ __m256i q256_2 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
+ __m256i q256_3 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p)));
+
+ __m128i p3 = _mm256_castsi256_si128(p256_3);
+ __m128i p2 = _mm256_castsi256_si128(p256_2);
+ __m128i p1 = _mm256_castsi256_si128(p256_1);
+ __m128i p0 = _mm256_castsi256_si128(p256_0);
+ __m128i q0 = _mm256_castsi256_si128(q256_0);
+ __m128i q1 = _mm256_castsi256_si128(q256_1);
+ __m128i q2 = _mm256_castsi256_si128(q256_2);
+ __m128i q3 = _mm256_castsi256_si128(q256_3);
+
+ {
+ const __m128i limit_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit0[0]));
+ const __m128i blimit_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit0[0]));
+ const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+ const __m128i abs_p1p0 =
+ _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
+ const __m128i abs_q1q0 =
+ _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
+ __m128i abs_p0q0 =
+ _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
+ __m128i abs_p1q1 =
+ _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(flat, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ __m128i work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+ _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
+ _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit_v);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
+
+ // loop filter
+ {
+ const __m128i thresh_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh0[0]));
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t4 = _mm_add_epi8(one, t3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t7f = _mm_sub_epi8(t80, one);
+
+ __m128i hev = _mm_subs_epu8(flat, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ __m128i ps1 = _mm_xor_si128(p1, t80);
+ __m128i ps0 = _mm_xor_si128(p0, t80);
+ __m128i qs0 = _mm_xor_si128(q0, t80);
+ __m128i qs1 = _mm_xor_si128(q1, t80);
+
+ __m128i filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ __m128i work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_and_si128(filt, mask);
+
+ __m128i filter1 = _mm_adds_epi8(filt, t4);
+ __m128i filter2 = _mm_adds_epi8(filt, t3);
+
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+
+ filt = _mm_adds_epi8(filter1, one);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+ qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+
+ // Derive flat
+ __m256i p0q0256 = _mm256_blend_epi32(p256_0, q256_0, 0xf0);
+ __m256i p2q2256 = _mm256_blend_epi32(p256_2, q256_2, 0xf0);
+ __m256i p3q3256 = _mm256_blend_epi32(p256_3, q256_3, 0xf0);
+ const __m256i ps0qs0256 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(ps0), qs0, 0x1);
+ const __m256i ps1qs1256 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(ps1), qs1, 0x1);
+ const __m256i work01 = _mm256_or_si256(_mm256_subs_epu8(p2q2256, p0q0256),
+ _mm256_subs_epu8(p0q0256, p2q2256));
+ const __m256i work02 = _mm256_or_si256(_mm256_subs_epu8(p3q3256, p0q0256),
+ _mm256_subs_epu8(p0q0256, p3q3256));
+ const __m256i max0_256 = _mm256_max_epu8(work01, work02);
+ const __m128i max1_256 =
+ _mm_max_epu8(_mm256_castsi256_si128(max0_256),
+ _mm256_extractf128_si256(max0_256, 1));
+ flat = _mm_max_epu8(max1_256, flat);
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ const __m256i flat256 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(flat), flat, 0x1);
+ const __m256i eight = _mm256_set1_epi16(8);
+ const __m256i four = _mm256_set1_epi16(4);
+
+ __m256i p256_4 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 5 * p)));
+ __m256i q256_4 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 4 * p)));
+ __m256i p256_5 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 6 * p)));
+ __m256i q256_5 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 5 * p)));
+ __m256i p256_6 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 7 * p)));
+ __m256i q256_6 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 6 * p)));
+
+ // Derive flat2
+ __m256i p4q4256 = _mm256_blend_epi32(p256_4, q256_4, 0xf0);
+ __m256i p5q5256 = _mm256_blend_epi32(p256_5, q256_5, 0xf0);
+ const __m256i p6q6256 = _mm256_blend_epi32(p256_6, q256_6, 0xf0);
+ const __m256i work1 = _mm256_or_si256(_mm256_subs_epu8(p4q4256, p0q0256),
+ _mm256_subs_epu8(p0q0256, p4q4256));
+ const __m256i work2 = _mm256_or_si256(_mm256_subs_epu8(p5q5256, p0q0256),
+ _mm256_subs_epu8(p0q0256, p5q5256));
+ const __m256i work3 = _mm256_or_si256(_mm256_subs_epu8(p6q6256, p0q0256),
+ _mm256_subs_epu8(p0q0256, p6q6256));
+ __m256i flat2_256 = _mm256_max_epu8(work1, work2);
+ flat2_256 = _mm256_max_epu8(flat2_256, work3);
+ __m128i flat2 = _mm_max_epu8(_mm256_castsi256_si128(flat2_256),
+ _mm256_extractf128_si256(flat2_256, 1));
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+
+ const __m256i filter =
+ _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);
+
+ p256_3 = _mm256_shuffle_epi8(p256_3, filter);
+ p256_2 = _mm256_shuffle_epi8(p256_2, filter);
+ p256_1 = _mm256_shuffle_epi8(p256_1, filter);
+ p256_0 = _mm256_shuffle_epi8(p256_0, filter);
+ q256_0 = _mm256_shuffle_epi8(q256_0, filter);
+ q256_1 = _mm256_shuffle_epi8(q256_1, filter);
+ q256_2 = _mm256_shuffle_epi8(q256_2, filter);
+ q256_3 = _mm256_shuffle_epi8(q256_3, filter);
+
+ const __m256i p2p1p0 =
+ _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1));
+ const __m256i q2q1q0 =
+ _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1));
+
+ __m256i pixetFilter_p2p1p0 =
+ _mm256_add_epi16(four, _mm256_add_epi16(p2p1p0, q2q1q0));
+ __m256i pixetFilter_q2q1q0 = pixetFilter_p2p1p0;
+
+ // Derive p0 and q0
+ pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, p256_3);
+ __m256i res_p =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_0), 3);
+ pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, q256_3);
+ __m256i res_q =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_0), 3);
+ __m256i flat_p0q0 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ p0q0256 = _mm256_andnot_si256(flat256, ps0qs0256);
+ flat_p0q0 = _mm256_and_si256(flat256, flat_p0q0);
+ p0q0256 = _mm256_or_si256(flat_p0q0, p0q0256);
+ p0 = _mm256_castsi256_si128(p0q0256);
+ q0 = _mm256_extractf128_si256(p0q0256, 1);
+
+ // Derive p1 and q1
+ __m256i sum_p = _mm256_sub_epi16(p256_3, q256_2);
+ pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p);
+ res_p =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_1), 3);
+ __m256i sum_q = _mm256_sub_epi16(q256_3, p256_2);
+ pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q);
+ res_q =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_1), 3);
+ __m256i flat_p1q1 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ __m256i p1q1256 = _mm256_andnot_si256(flat256, ps1qs1256);
+ flat_p1q1 = _mm256_and_si256(flat256, flat_p1q1);
+ p1q1256 = _mm256_or_si256(flat_p1q1, p1q1256);
+ p1 = _mm256_castsi256_si128(p1q1256);
+ q1 = _mm256_extractf128_si256(p1q1256, 1);
+
+ // Derive p2 and q2
+ sum_p = _mm256_sub_epi16(p256_3, q256_1);
+ pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p);
+ res_p =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_2), 3);
+ sum_q = _mm256_sub_epi16(q256_3, p256_1);
+ pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q);
+ res_q =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_2), 3);
+ __m256i flat_p2q2 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ p2q2256 = _mm256_andnot_si256(flat256, p2q2256);
+ flat_p2q2 = _mm256_and_si256(flat256, flat_p2q2);
+ p2q2256 = _mm256_or_si256(flat_p2q2, p2q2256);
+ p2 = _mm256_castsi256_si128(p2q2256);
+ q2 = _mm256_extractf128_si256(p2q2256, 1);
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
+ flat2_256 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(flat2), flat2, 0x1);
+ p256_6 = _mm256_shuffle_epi8(p256_6, filter);
+ p256_5 = _mm256_shuffle_epi8(p256_5, filter);
+ p256_4 = _mm256_shuffle_epi8(p256_4, filter);
+ q256_4 = _mm256_shuffle_epi8(q256_4, filter);
+ q256_5 = _mm256_shuffle_epi8(q256_5, filter);
+ q256_6 = _mm256_shuffle_epi8(q256_6, filter);
+
+ __m256i pixelFilter_p =
+ _mm256_add_epi16(p256_5, _mm256_add_epi16(p256_4, p256_3));
+ __m256i pixelFilter_q =
+ _mm256_add_epi16(q256_5, _mm256_add_epi16(q256_4, q256_3));
+
+ pixelFilter_p = _mm256_add_epi16(pixelFilter_p, p2p1p0);
+ pixelFilter_q = _mm256_add_epi16(pixelFilter_q, q2q1q0);
+
+ pixelFilter_p = _mm256_add_epi16(pixelFilter_p, p256_0);
+ pixelFilter_q = _mm256_add_epi16(pixelFilter_q, q256_0);
+ pixelFilter_p = _mm256_add_epi16(
+ eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q));
+ pixelFilter_q = pixelFilter_p;
+
+ // Derive p0 and q0
+ pixelFilter_p =
+ _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_1), pixelFilter_p);
+ res_p = _mm256_srli_epi16(pixelFilter_p, 4);
+ pixelFilter_q =
+ _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_1), pixelFilter_q);
+ res_q = _mm256_srli_epi16(pixelFilter_q, 4);
+ __m256i flat2_p0q0 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ p0q0256 = _mm256_andnot_si256(flat2_256, p0q0256);
+ flat2_p0q0 = _mm256_and_si256(flat2_256, flat2_p0q0);
+ p0q0256 = _mm256_or_si256(flat2_p0q0, p0q0256);
+
+ p0 = _mm256_castsi256_si128(p0q0256);
+ q0 = _mm256_extractf128_si256(p0q0256, 1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
+
+ // Derive p1 and q1
+ sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_5),
+ _mm256_sub_epi16(p256_2, q256_0));
+ pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
+ res_p = _mm256_srli_epi16(pixelFilter_p, 4);
+ sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_5),
+ _mm256_sub_epi16(q256_2, p256_0));
+ pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
+ res_q = _mm256_srli_epi16(pixelFilter_q, 4);
+ __m256i flat2_p1q1 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ p1q1256 = _mm256_andnot_si256(flat2_256, p1q1256);
+ flat2_p1q1 = _mm256_and_si256(flat2_256, flat2_p1q1);
+ p1q1256 = _mm256_or_si256(flat2_p1q1, p1q1256);
+ p1 = _mm256_castsi256_si128(p1q1256);
+ q1 = _mm256_extractf128_si256(p1q1256, 1);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+
+ // Derive p2 and q2
+ sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_4),
+ _mm256_sub_epi16(p256_3, p256_0));
+ pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
+ res_p = _mm256_srli_epi16(pixelFilter_p, 4);
+ sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_4),
+ _mm256_sub_epi16(q256_3, q256_0));
+ pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
+ res_q = _mm256_srli_epi16(pixelFilter_q, 4);
+ __m256i flat2_p2q2 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ p2q2256 = _mm256_andnot_si256(flat2_256, p2q2256);
+ flat2_p2q2 = _mm256_and_si256(flat2_256, flat2_p2q2);
+ p2q2256 = _mm256_or_si256(flat2_p2q2, p2q2256);
+ p2 = _mm256_castsi256_si128(p2q2256);
+ q2 = _mm256_extractf128_si256(p2q2256, 1);
+ _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+ _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+
+ // Derive p3 and q3
+ sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_3),
+ _mm256_sub_epi16(p256_4, p256_1));
+ pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
+ res_p = _mm256_srli_epi16(pixelFilter_p, 4);
+ sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_3),
+ _mm256_sub_epi16(q256_4, q256_1));
+ pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
+ res_q = _mm256_srli_epi16(pixelFilter_q, 4);
+ __m256i flat2_p3q3 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ p3q3256 = _mm256_andnot_si256(flat2_256, p3q3256);
+ flat2_p3q3 = _mm256_and_si256(flat2_256, flat2_p3q3);
+ p3q3256 = _mm256_or_si256(flat2_p3q3, p3q3256);
+ p3 = _mm256_castsi256_si128(p3q3256);
+ q3 = _mm256_extractf128_si256(p3q3256, 1);
+ _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
+ _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
+
+ // Derive p4 and q4
+ sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_2),
+ _mm256_sub_epi16(p256_5, p256_2));
+ pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
+ res_p = _mm256_srli_epi16(pixelFilter_p, 4);
+ sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_2),
+ _mm256_sub_epi16(q256_5, q256_2));
+ pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
+ res_q = _mm256_srli_epi16(pixelFilter_q, 4);
+ __m256i flat2_p4q4 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ p4q4256 = _mm256_andnot_si256(flat2_256, p4q4256);
+ flat2_p4q4 = _mm256_and_si256(flat2_256, flat2_p4q4);
+ p4q4256 = _mm256_or_si256(flat2_p4q4, p4q4256);
+ _mm_storeu_si128((__m128i *)(s - 5 * p),
+ _mm256_castsi256_si128(p4q4256));
+ _mm_storeu_si128((__m128i *)(s + 4 * p),
+ _mm256_extractf128_si256(p4q4256, 1));
+
+ // Derive p5 and q5
+ sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_1),
+ _mm256_sub_epi16(p256_6, p256_3));
+ pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
+ res_p = _mm256_srli_epi16(pixelFilter_p, 4);
+ sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_1),
+ _mm256_sub_epi16(q256_6, q256_3));
+ pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
+ res_q = _mm256_srli_epi16(pixelFilter_q, 4);
+ __m256i flat2_p5q5 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ p5q5256 = _mm256_andnot_si256(flat2_256, p5q5256);
+ flat2_p5q5 = _mm256_and_si256(flat2_256, flat2_p5q5);
+ p5q5256 = _mm256_or_si256(flat2_p5q5, p5q5256);
+ _mm_storeu_si128((__m128i *)(s - 6 * p),
+ _mm256_castsi256_si128(p5q5256));
+ _mm_storeu_si128((__m128i *)(s + 5 * p),
+ _mm256_extractf128_si256(p5q5256, 1));
+ } else {
+ _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+ _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+ }
+ } else {
+ _mm_storeu_si128((__m128i *)(s - 2 * p), ps1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), ps0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), qs0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), qs1);
+ }
+ }
+}
+
+void aom_lpf_vertical_14_quad_avx2(unsigned char *s, int pitch,
+ const uint8_t *_blimit0,
+ const uint8_t *_limit0,
+ const uint8_t *_thresh0) {
+ DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
+
+ // Transpose 16x16
+ trans_store_16x16_lpf_vert14(s - 8, pitch, t_dst, 16, 1);
+
+ // Loop filtering
+ aom_lpf_horizontal_14_quad_avx2(t_dst + 8 * 16, 16, _blimit0, _limit0,
+ _thresh0);
+
+ // Transpose back
+ trans_store_16x16_lpf_vert14(t_dst, 16, s - 8, pitch, 0);
+}
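
A hypothetical call site for the vertical quad filter above: s points at the edge column itself (the implementation reaches back with s - 8 and filters 16 rows), and the threshold values below are placeholders rather than values derived from the AV1 loop-filter parameters.

#include <stdint.h>

void aom_lpf_vertical_14_quad_avx2(unsigned char *s, int pitch,
                                   const uint8_t *blimit0,
                                   const uint8_t *limit0,
                                   const uint8_t *thresh0);

/* Filter the vertical edge at column x over 16 rows starting at row y. */
static void sketch_filter_vertical_edge(uint8_t *frame, int stride, int x,
                                        int y) {
  const uint8_t blimit = 16, limit = 8, thresh = 4; /* placeholder thresholds */
  aom_lpf_vertical_14_quad_avx2(frame + y * stride + x, stride, &blimit, &limit,
                                &thresh);
}
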
diff --git a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
new file mode 100644
index 0000000000..cdf24c332a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
@@ -0,0 +1,2973 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/emmintrin_compat.h"
+#include "aom_dsp/x86/lpf_common_sse2.h"
+
+static INLINE __m128i abs_diff(__m128i a, __m128i b) {
+ return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
+}
+
+// This function treats its input as 2 parallel 8x4 matrices, transposes each of
+// them to 4x8 independently while flipping the second matrix horizontally.
+// Used to create the pq pairs for the 14-tap filter.
+static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+ __m128i *x3, __m128i *q0p0,
+ __m128i *q1p1, __m128i *q2p2,
+ __m128i *q3p3, __m128i *q4p4,
+ __m128i *q5p5, __m128i *q6p6,
+ __m128i *q7p7) {
+ __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3;
+ w0 = _mm_unpacklo_epi8(
+ *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ w2 = _mm_unpackhi_epi8(
+ *x0, *x1); // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115
+ w3 = _mm_unpackhi_epi8(
+ *x2, *x3); // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315
+
+ ww0 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ww1 = _mm_unpackhi_epi16(
+ w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ ww2 = _mm_unpacklo_epi16(
+ w2, w3); // 08 18 28 38 09 19 29 39 010 110 210 310 011 111 211 311
+ ww3 = _mm_unpackhi_epi16(
+ w2,
+ w3); // 012 112 212 312 013 113 213 313 014 114 214 314 015 115 215 315
+
+ *q7p7 = _mm_unpacklo_epi32(
+ ww0,
+ _mm_srli_si128(
+ ww3, 12)); // 00 10 20 30 015 115 215 315 xx xx xx xx xx xx xx xx
+ *q6p6 = _mm_unpackhi_epi32(
+ _mm_slli_si128(ww0, 4),
+ ww3); // 01 11 21 31 014 114 214 314 xx xx xx xx xx xx xx xx
+ *q5p5 = _mm_unpackhi_epi32(
+ ww0,
+ _mm_slli_si128(
+ ww3, 4)); // 02 12 22 32 013 113 213 313 xx xx xx xx xx xx xx xx
+ *q4p4 = _mm_unpacklo_epi32(
+ _mm_srli_si128(ww0, 12),
+ ww3); // 03 13 23 33 012 112 212 312 xx xx xx xx xx xx xx xx
+ *q3p3 = _mm_unpacklo_epi32(
+ ww1,
+ _mm_srli_si128(
+ ww2, 12)); // 04 14 24 34 011 111 211 311 xx xx xx xx xx xx xx xx
+ *q2p2 = _mm_unpackhi_epi32(
+ _mm_slli_si128(ww1, 4),
+ ww2); // 05 15 25 35 010 110 210 310 xx xx xx xx xx xx xx xx
+ *q1p1 = _mm_unpackhi_epi32(
+ ww1,
+ _mm_slli_si128(
+ ww2, 4)); // 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx
+ *q0p0 = _mm_unpacklo_epi32(
+ _mm_srli_si128(ww1, 12),
+ ww2); // 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx
+}
+
+// This function treats its input as 2 parallel 8x4 matrices, transposes each of
+// them independently while flipping the second matrix horizontally.
+// Used to invert the pq pairs of the 14-tap filter.
+static INLINE void transpose_pq_14_inv_sse2(__m128i *x0, __m128i *x1,
+ __m128i *x2, __m128i *x3,
+ __m128i *x4, __m128i *x5,
+ __m128i *x6, __m128i *x7,
+ __m128i *pq0, __m128i *pq1,
+ __m128i *pq2, __m128i *pq3) {
+ __m128i w10, w11, w12, w13;
+ __m128i w0, w1, w2, w3, w4, w5;
+ __m128i d0, d1, d2, d3;
+
+ w0 = _mm_unpacklo_epi8(
+ *x0, *x1); // p 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // p 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ w2 = _mm_unpacklo_epi8(
+ *x4, *x5); // p 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ w3 = _mm_unpacklo_epi8(
+ *x6, *x7); // p 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+
+ w4 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ w5 = _mm_unpacklo_epi16(
+ w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+
+ d0 = _mm_unpacklo_epi32(
+ w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ d2 = _mm_unpackhi_epi32(
+ w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+
+ w10 = _mm_unpacklo_epi8(
+ *x7, *x6); // q xx xx xx xx xx xx xx xx 00 10 01 11 02 12 03 13
+ w11 = _mm_unpacklo_epi8(
+ *x5, *x4); // q xx xx xx xx xx xx xx xx 20 30 21 31 22 32 23 33
+ w12 = _mm_unpacklo_epi8(
+ *x3, *x2); // q xx xx xx xx xx xx xx xx 40 50 41 51 42 52 43 53
+ w13 = _mm_unpacklo_epi8(
+ *x1, *x0); // q xx xx xx xx xx xx xx xx 60 70 61 71 62 72 63 73
+
+ w4 = _mm_unpackhi_epi16(
+ w10, w11); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ w5 = _mm_unpackhi_epi16(
+ w12, w13); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+
+ d1 = _mm_unpacklo_epi32(
+ w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ d3 = _mm_unpackhi_epi32(
+ w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+
+ *pq0 = _mm_unpacklo_epi64(d0, d1); // pq
+ *pq1 = _mm_unpackhi_epi64(d0, d1); // pq
+ *pq2 = _mm_unpacklo_epi64(d2, d3); // pq
+ *pq3 = _mm_unpackhi_epi64(d2, d3); // pq
+}
+
+static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0,
+ __m128i *hev, __m128i *mask,
+ __m128i *qs1qs0, __m128i *ps1ps0) {
+ __m128i filter, filter2filter1, work;
+ __m128i ps1ps0_work, qs1qs0_work;
+ __m128i hev1;
+ const __m128i t3t4 =
+ _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 4, 4, 4, 4);
+ const __m128i t80 = _mm_set1_epi8((char)0x80);
+ const __m128i ff = _mm_cmpeq_epi8(t80, t80);
+
+ ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */
+ qs1qs0_work = _mm_xor_si128(*q1q0, t80);
+
+ /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */
+ work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work);
+ filter = _mm_and_si128(_mm_srli_si128(work, 4), *hev);
+ /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */
+ filter = _mm_subs_epi8(filter, work);
+ filter = _mm_subs_epi8(filter, work);
+ filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */
+ filter = _mm_and_si128(filter, *mask); /* & mask */
+ filter = _mm_unpacklo_epi32(filter, filter);
+
+ /* filter1 = signed_char_clamp(filter + 4) >> 3; */
+ /* filter2 = signed_char_clamp(filter + 3) >> 3; */
+ filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */
+ filter2filter1 =
+ _mm_unpacklo_epi8(filter2filter1, filter2filter1); // goto 16 bit
+ filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */
+ filter2filter1 = _mm_packs_epi16(filter2filter1, filter2filter1);
+
+ /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */
+ filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */
+ filter = _mm_unpacklo_epi8(filter, filter); // goto 16 bit
+ filter = _mm_srai_epi16(filter, 9); /* round */
+ filter = _mm_packs_epi16(filter, filter);
+ filter = _mm_andnot_si128(*hev, filter);
+ filter = _mm_unpacklo_epi32(filter, filter);
+
+ filter2filter1 = _mm_unpacklo_epi32(filter2filter1, filter);
+ hev1 = _mm_srli_si128(filter2filter1, 8);
+ /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */
+ qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1);
+ /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */
+ ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1);
+
+ *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */
+ *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */
+}
+
+static AOM_FORCE_INLINE void filter4_dual_sse2(__m128i *p1p0, __m128i *q1q0,
+ __m128i *hev, __m128i *mask,
+ __m128i *qs1qs0,
+ __m128i *ps1ps0) {
+ const __m128i t3t4 =
+ _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4);
+ const __m128i t80 = _mm_set1_epi8((char)0x80);
+ __m128i filter, filter2filter1, work;
+ __m128i ps1ps0_work, qs1qs0_work;
+ __m128i hev1;
+ const __m128i ff = _mm_cmpeq_epi8(t80, t80);
+
+ ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */
+ qs1qs0_work = _mm_xor_si128(*q1q0, t80);
+
+ /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */
+ work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work);
+ filter = _mm_and_si128(_mm_srli_si128(work, 8), *hev);
+ /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */
+ filter = _mm_subs_epi8(filter, work);
+ filter = _mm_subs_epi8(filter, work);
+ filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */
+ filter = _mm_and_si128(filter, *mask); /* & mask */
+ filter = _mm_unpacklo_epi64(filter, filter);
+
+ /* filter1 = signed_char_clamp(filter + 4) >> 3; */
+ /* filter2 = signed_char_clamp(filter + 3) >> 3; */
+ filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */
+ filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1);
+ filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1);
+ filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */
+ filter = _mm_srai_epi16(filter, 11); /* >> 3 */
+ filter2filter1 = _mm_packs_epi16(filter2filter1, filter);
+
+ /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */
+ filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */
+ filter = _mm_unpacklo_epi8(filter, filter);
+ filter = _mm_srai_epi16(filter, 9); /* round */
+ filter = _mm_packs_epi16(filter, filter);
+ filter = _mm_andnot_si128(*hev, filter);
+
+ hev1 = _mm_unpackhi_epi64(filter2filter1, filter);
+ filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter);
+
+ /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */
+ qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1);
+ /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */
+ ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1);
+ *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */
+ *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */
+}
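
Both filter4 variants above implement the narrow filter spelled out in their signed_char_clamp comments. A scalar sketch, operating on pixels already biased into signed range by the ^0x80 step, with hev and mask holding 0 or -1 per column (names are illustrative, not libaom API):

#include <stdint.h>

static int8_t sketch_clamp8(int v) {
  return (int8_t)(v < -128 ? -128 : (v > 127 ? 127 : v));
}

/* Narrow 4-tap filter on 0x80-biased pixels; hev/mask are 0 or -1 flags. */
static void sketch_filter4(int8_t *ps1, int8_t *ps0, int8_t *qs0, int8_t *qs1,
                           int hev, int mask) {
  int8_t filter = (int8_t)(sketch_clamp8(*ps1 - *qs1) & hev);
  filter = (int8_t)(sketch_clamp8(filter + 3 * (*qs0 - *ps0)) & mask);
  const int8_t filter1 = (int8_t)(sketch_clamp8(filter + 4) >> 3);
  const int8_t filter2 = (int8_t)(sketch_clamp8(filter + 3) >> 3);
  *qs0 = sketch_clamp8(*qs0 - filter1);
  *ps0 = sketch_clamp8(*ps0 + filter2);
  const int8_t delta = (int8_t)(((filter1 + 1) >> 1) & ~hev);
  *qs1 = sketch_clamp8(*qs1 - delta);
  *ps1 = sketch_clamp8(*ps1 + delta);
}
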
+
+static AOM_FORCE_INLINE void lpf_internal_4_sse2(
+ __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit,
+ __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) {
+ __m128i q1p1, q0p0, p1p0, q1q0;
+ __m128i abs_p0q0, abs_p1q1;
+ __m128i mask, flat, hev;
+ const __m128i zero = _mm_setzero_si128();
+
+ q1p1 = _mm_unpacklo_epi32(*p1, *q1);
+ q0p0 = _mm_unpacklo_epi32(*p0, *q0);
+
+ p1p0 = _mm_unpacklo_epi32(q0p0, q1p1);
+ q1q0 = _mm_srli_si128(p1p0, 8);
+
+ /* (abs(q1 - q0), abs(p1 - p0)) */
+ flat = abs_diff(q1p1, q0p0);
+ /* abs(p1 - q1), abs(p0 - q0) */
+ __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);
+
+ /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
+ hev = _mm_unpacklo_epi8(flat, zero);
+
+ hev = _mm_cmpgt_epi16(hev, *thresh);
+ hev = _mm_packs_epi16(hev, hev);
+ hev = _mm_unpacklo_epi32(hev, hev);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */
+ abs_p1q1 = _mm_srli_si128(abs_p1q1p0q0, 4); /* abs(p1 - q1) */
+ abs_p1q1 = _mm_unpacklo_epi8(abs_p1q1, abs_p1q1);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);
+ abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */
+ /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */
+
+ mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);
+ mask = _mm_unpacklo_epi32(mask, flat);
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ mask = _mm_and_si128(mask, _mm_srli_si128(mask, 4));
+
+ filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
+}
+
+static AOM_FORCE_INLINE void lpf_internal_4_dual_sse2(
+ __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit,
+ __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) {
+ __m128i q1p1, q0p0, p1p0, q1q0;
+ __m128i abs_p0q0, abs_p1q1;
+ __m128i mask, hev;
+ const __m128i zero = _mm_setzero_si128();
+
+ q1p1 = _mm_unpacklo_epi64(*p1, *q1);
+ q0p0 = _mm_unpacklo_epi64(*p0, *q0);
+
+ p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+ q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
+
+ /* (abs(q1 - q0), abs(p1 - p0)) */
+ __m128i flat = abs_diff(q1p1, q0p0);
+ /* abs(p1 - q1), abs(p0 - q0) */
+ const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);
+
+ /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ hev = _mm_unpacklo_epi8(flat, zero);
+
+ hev = _mm_cmpgt_epi16(hev, *thresh);
+ hev = _mm_packs_epi16(hev, hev);
+
+ /* const int8_t mask = filter_mask2(*limit, *blimit, */
+ /* p1, p0, q0, q1); */
+ abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */
+ abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);
+ abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */
+ /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */
+ mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);
+ mask = _mm_unpacklo_epi64(mask, flat);
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));
+
+ filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
+}
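
The hev ("high edge variance") flag computed in the two helpers above reduces, per column, to one comparison of the largest first difference against thresh; when it is set, the p1/q1 term participates in filter4 and the outer pixels are left untouched. A scalar sketch (hypothetical name, not libaom API):

#include <stdint.h>
#include <stdlib.h>

/* Returns -1 (all ones) when the edge is sharp enough to trigger hev, else 0. */
static int sketch_hev_mask(uint8_t thresh, uint8_t p1, uint8_t p0, uint8_t q0,
                           uint8_t q1) {
  const int dp = abs(p1 - p0);
  const int dq = abs(q1 - q0);
  return ((dp > dq ? dp : dq) > thresh) ? -1 : 0;
}
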
+
+void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
+ const uint8_t *_blimit, const uint8_t *_limit,
+ const uint8_t *_thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit),
+ _mm_loadl_epi64((const __m128i *)_limit));
+ __m128i thresh =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
+
+ __m128i qs1qs0, ps1ps0;
+ __m128i p1, p0, q0, q1;
+
+ p1 = xx_loadl_32(s - 2 * p);
+ p0 = xx_loadl_32(s - 1 * p);
+ q0 = xx_loadl_32(s - 0 * p);
+ q1 = xx_loadl_32(s + 1 * p);
+
+ lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &qs1qs0, &ps1ps0);
+
+ xx_storel_32(s - 1 * p, ps1ps0);
+ xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 4));
+ xx_storel_32(s + 0 * p, qs1qs0);
+ xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 4));
+}
+
+void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
+ const uint8_t *_blimit, const uint8_t *_limit,
+ const uint8_t *_thresh) {
+ __m128i p1p0, q1q0;
+ __m128i p1, p0, q0, q1;
+
+ const __m128i zero = _mm_setzero_si128();
+ __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit),
+ _mm_loadl_epi64((const __m128i *)_limit));
+ __m128i thresh =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
+
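+  // For a vertical edge: load 4 rows of pixels starting 2 columns to the left
+  // of the edge, transpose so that p1, p0, q0 and q1 each hold one column,
+  // filter, then transpose back before storing 4 bytes per row.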
+ __m128i x0, x1, x2, x3;
+ __m128i d0, d1, d2, d3;
+ x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
+ x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
+ x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
+ x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
+
+ transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &p1, &p0, &q0, &q1);
+
+ lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &q1q0, &p1p0);
+
+ // Transpose 8x4 to 4x8
+ p1 = _mm_srli_si128(p1p0, 4);
+ q1 = _mm_srli_si128(q1q0, 4);
+
+ transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3);
+
+ xx_storel_32(s + 0 * p - 2, d0);
+ xx_storel_32(s + 1 * p - 2, d1);
+ xx_storel_32(s + 2 * p - 2, d2);
+ xx_storel_32(s + 3 * p - 2, d3);
+}
+
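+// Stores the two 32-bit halves of x across a horizontal edge: the low 32 bits
+// to row -(num + 1) (the p side) and the next 32 bits to row num (the q side).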
+static INLINE void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) {
+ xx_storel_32(s - (num + 1) * p, x);
+ xx_storel_32(s + num * p, _mm_srli_si128(x, 4));
+}
+
+static AOM_FORCE_INLINE void lpf_internal_14_dual_sse2(
+ __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2,
+ __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit,
+ __m128i *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i mask, hev, flat, flat2;
+ __m128i qs0ps0, qs1ps1;
+ __m128i p1p0, q1q0, qs1qs0, ps1ps0;
+ __m128i abs_p1p0;
+
+ p1p0 = _mm_unpacklo_epi64(*q0p0, *q1p1);
+ q1q0 = _mm_unpackhi_epi64(*q0p0, *q1p1);
+
+ {
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0;
+ __m128i fe, ff, work;
+ abs_p1p0 = abs_diff(*q1p1, *q0p0);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+ fe = _mm_set1_epi8((char)0xfe);
+ ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ abs_p0q0 = abs_diff(p1p0, q1q0);
+ abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
+ abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, *thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+    // replicate so it lines up with the "merged" p/q variables used below
+ hev = _mm_unpacklo_epi64(hev, hev);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
+ qs0ps0 = _mm_unpacklo_epi64(ps1ps0, qs1qs0);
+ qs1ps1 = _mm_unpackhi_epi64(ps1ps0, qs1qs0);
+ // loopfilter done
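+  // Re-pack the filtered output into the same layout as q0p0/q1p1 (p pixels
+  // in the low 64 bits, q pixels in the high 64 bits) so it can either be
+  // written back directly or blended with the flat/flat2 results below.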
+
+ __m128i flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
+ __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
+
+ __m128i work;
+ flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+  // If flat == 0 then flat2 is zero as well and none of the calculations
+  // below are needed.
+  // With SSE4.1 this could be: if (0 == _mm_test_all_zeros(flat, ff))
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
+ __m128i q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+ __m128i pixelFilter_p, pixelFilter_q;
+ __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+ __m128i sum_p6, sum_q6;
+ __m128i sum_p3, sum_q3, res_p, res_q;
+
+ p6_16 = _mm_unpacklo_epi8(*q6p6, zero);
+ p5_16 = _mm_unpacklo_epi8(*q5p5, zero);
+ p4_16 = _mm_unpacklo_epi8(*q4p4, zero);
+ p3_16 = _mm_unpacklo_epi8(*q3p3, zero);
+ p2_16 = _mm_unpacklo_epi8(*q2p2, zero);
+ p1_16 = _mm_unpacklo_epi8(*q1p1, zero);
+ p0_16 = _mm_unpacklo_epi8(*q0p0, zero);
+ q0_16 = _mm_unpackhi_epi8(*q0p0, zero);
+ q1_16 = _mm_unpackhi_epi8(*q1p1, zero);
+ q2_16 = _mm_unpackhi_epi8(*q2p2, zero);
+ q3_16 = _mm_unpackhi_epi8(*q3p3, zero);
+ q4_16 = _mm_unpackhi_epi8(*q4p4, zero);
+ q5_16 = _mm_unpackhi_epi8(*q5p5, zero);
+ q6_16 = _mm_unpackhi_epi8(*q6p6, zero);
+ pixelFilter_p = _mm_add_epi16(p5_16, _mm_add_epi16(p4_16, p3_16));
+ pixelFilter_q = _mm_add_epi16(q5_16, _mm_add_epi16(q4_16, q3_16));
+
+ pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
+ pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+ pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
+ pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+ pixelFilter_p =
+ _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
+ pixetFilter_p2p1p0 = _mm_add_epi16(
+ four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p,
+ _mm_add_epi16(_mm_add_epi16(p6_16, p0_16),
+ _mm_add_epi16(p1_16, q0_16))),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p,
+ _mm_add_epi16(_mm_add_epi16(q6_16, q0_16),
+ _mm_add_epi16(p0_16, q1_16))),
+ 4);
+ flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
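+    // flat2_q0p0 holds the 13-tap wide-filter outputs for p0 and q0, i.e.
+    //   op0 = (p6 + p5 + p4 + p3 + p2 + 2 * p1 + 2 * p0 + 2 * q0 +
+    //          q1 + q2 + q3 + q4 + q5 + 8) >> 4
+    // and the mirrored expression for oq0.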
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);
+
+ flat_q0p0 = _mm_packus_epi16(res_p, res_q);
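+    // flat_q0p0 holds the 7-tap (filter8) outputs for p0 and q0, i.e.
+    //   op0 = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3
+    // and the mirrored expression for oq0.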
+
+ sum_p6 = _mm_add_epi16(p6_16, p6_16);
+ sum_q6 = _mm_add_epi16(q6_16, q6_16);
+ sum_p3 = _mm_add_epi16(p3_16, p3_16);
+ sum_q3 = _mm_add_epi16(q3_16, q3_16);
+
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p5_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p1_16, _mm_add_epi16(p2_16, p0_16)))),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q1_16, _mm_add_epi16(q0_16, q2_16)))),
+ 4);
+ flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
+ flat_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
+
+ sum_p3 = _mm_add_epi16(sum_p3, p3_16);
+ sum_q3 = _mm_add_epi16(sum_q3, q3_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
+ flat_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ // work with flat2
+ flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
+ work = abs_diff(*q6p6, *q0p0);
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+
+ // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ flat = _mm_unpacklo_epi64(flat, flat);
+ *q2p2 = _mm_andnot_si128(flat, *q2p2);
+ flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
+ *q2p2 = _mm_or_si128(*q2p2, flat_q2p2);
+
+ qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+ flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
+ *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
+
+ qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+ flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
+ *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
+
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
+
+ sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+ sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))),
+ 4);
+ flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+ sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))),
+ 4);
+ flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+ sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))),
+ 4);
+ flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+ sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))),
+ 4);
+ flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
+
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ flat2 = _mm_unpacklo_epi64(flat2, flat2);
+
+ *q5p5 = _mm_andnot_si128(flat2, *q5p5);
+ flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
+ *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5);
+
+ *q4p4 = _mm_andnot_si128(flat2, *q4p4);
+ flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
+ *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4);
+
+ *q3p3 = _mm_andnot_si128(flat2, *q3p3);
+ flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
+ *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3);
+
+ *q2p2 = _mm_andnot_si128(flat2, *q2p2);
+ flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
+ *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2);
+
+ *q1p1 = _mm_andnot_si128(flat2, *q1p1);
+ flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
+ *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1);
+
+ *q0p0 = _mm_andnot_si128(flat2, *q0p0);
+ flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
+ *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0);
+ }
+ } else {
+ *q0p0 = qs0ps0;
+ *q1p1 = qs1ps1;
+ }
+}
+
+static AOM_FORCE_INLINE void lpf_internal_14_sse2(
+ __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2,
+ __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit,
+ __m128i *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i mask, hev, flat, flat2;
+ __m128i flat2_pq[6], flat_pq[3];
+ __m128i qs0ps0, qs1ps1;
+ __m128i p1p0, q1q0, qs1qs0, ps1ps0;
+ __m128i abs_p1p0;
+
+ p1p0 = _mm_unpacklo_epi32(*q0p0, *q1p1);
+ q1q0 = _mm_srli_si128(p1p0, 8);
+
+ __m128i fe, ff, work;
+ {
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0;
+ abs_p1p0 = abs_diff(*q1p1, *q0p0);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
+ fe = _mm_set1_epi8((char)0xfe);
+ ff = _mm_cmpeq_epi8(fe, fe);
+ abs_p0q0 = abs_diff(p1p0, q1q0);
+ abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+ hev = _mm_subs_epu8(flat, *thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+    // replicate so it lines up with the "merged" p/q variables used below
+ hev = _mm_unpacklo_epi32(hev, hev);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+ mask = _mm_unpacklo_epi32(mask, zero);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
+ qs0ps0 = _mm_unpacklo_epi32(ps1ps0, qs1qs0);
+ qs1ps1 = _mm_srli_si128(qs0ps0, 8);
+ // loopfilter done
+
+ flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ flat = _mm_unpacklo_epi32(flat, flat);
+ flat = _mm_unpacklo_epi64(flat, flat);
+
+  // If flat == 0 then flat2 is zero as well and none of the calculations
+  // below are needed.
+  // With SSE4.1 this could be: if (0 == _mm_test_all_zeros(flat, ff))
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+ __m128i q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+ __m128i pq_16[7];
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i sum_p6;
+ __m128i sum_p3;
+
+ pq_16[0] = _mm_unpacklo_epi8(*q0p0, zero);
+ pq_16[1] = _mm_unpacklo_epi8(*q1p1, zero);
+ pq_16[2] = _mm_unpacklo_epi8(*q2p2, zero);
+ pq_16[3] = _mm_unpacklo_epi8(*q3p3, zero);
+ pq_16[4] = _mm_unpacklo_epi8(*q4p4, zero);
+ pq_16[5] = _mm_unpacklo_epi8(*q5p5, zero);
+ pq_16[6] = _mm_unpacklo_epi8(*q6p6, zero);
+ q0_16 = _mm_srli_si128(pq_16[0], 8);
+ q1_16 = _mm_srli_si128(pq_16[1], 8);
+ q2_16 = _mm_srli_si128(pq_16[2], 8);
+ q3_16 = _mm_srli_si128(pq_16[3], 8);
+ q4_16 = _mm_srli_si128(pq_16[4], 8);
+ q5_16 = _mm_srli_si128(pq_16[5], 8);
+
+ __m128i flat_p[3], flat_q[3];
+ __m128i flat2_p[6], flat2_q[6];
+
+ __m128i work0, work0_0, work0_1, sum_p_0;
+ __m128i sum_p = _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[4], pq_16[3]));
+ __m128i sum_lp = _mm_add_epi16(pq_16[0], _mm_add_epi16(pq_16[2], pq_16[1]));
+ sum_p = _mm_add_epi16(sum_p, sum_lp);
+
+ __m128i sum_lq = _mm_srli_si128(sum_lp, 8);
+ __m128i sum_q = _mm_srli_si128(sum_p, 8);
+
+ sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
+ sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
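+    // Running totals used for the two filters (the useful values sit in the
+    // low 16-bit lanes):
+    //   sum_p_0 = 8 + (p0 + ... + p5) + (q0 + ... + q5)  -- 13-tap filter
+    //   sum_lp  = 4 + (p0 + p1 + p2) + (q0 + q1 + q2)    -- 7-tap filter
+    // Individual taps are swapped in and out of these totals below.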
+
+ flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq_16[3], pq_16[0]));
+ flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q3_16, q0_16));
+
+ sum_p6 = _mm_add_epi16(pq_16[6], pq_16[6]);
+ sum_p3 = _mm_add_epi16(pq_16[3], pq_16[3]);
+
+ sum_q = _mm_sub_epi16(sum_p_0, pq_16[5]);
+ sum_p = _mm_sub_epi16(sum_p_0, q5_16);
+
+ work0_0 = _mm_add_epi16(_mm_add_epi16(pq_16[6], pq_16[0]), pq_16[1]);
+ work0_1 = _mm_add_epi16(
+ sum_p6, _mm_add_epi16(pq_16[1], _mm_add_epi16(pq_16[2], pq_16[0])));
+
+ sum_lq = _mm_sub_epi16(sum_lp, pq_16[2]);
+ sum_lp = _mm_sub_epi16(sum_lp, q2_16);
+
+ work0 = _mm_add_epi16(sum_p3, pq_16[1]);
+ flat_p[1] = _mm_add_epi16(sum_lp, work0);
+ flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
+
+ flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
+ flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
+ flat_pq[0] = _mm_packus_epi16(flat_pq[0], flat_pq[0]);
+ flat_pq[1] = _mm_packus_epi16(flat_pq[1], flat_pq[1]);
+
+ sum_lp = _mm_sub_epi16(sum_lp, q1_16);
+ sum_lq = _mm_sub_epi16(sum_lq, pq_16[1]);
+
+ sum_p3 = _mm_add_epi16(sum_p3, pq_16[3]);
+ work0 = _mm_add_epi16(sum_p3, pq_16[2]);
+
+ flat_p[2] = _mm_add_epi16(sum_lp, work0);
+ flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
+ flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
+ flat_pq[2] = _mm_packus_epi16(flat_pq[2], flat_pq[2]);
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ flat 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
+
+ work = abs_diff(*q6p6, *q0p0);
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 4));
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ flat2 = _mm_unpacklo_epi32(flat2, flat2);
+
+ // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+ flat_pq[0] = _mm_and_si128(flat, flat_pq[0]);
+ *q0p0 = _mm_or_si128(qs0ps0, flat_pq[0]);
+
+ qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+ flat_pq[1] = _mm_and_si128(flat, flat_pq[1]);
+ *q1p1 = _mm_or_si128(qs1ps1, flat_pq[1]);
+
+ *q2p2 = _mm_andnot_si128(flat, *q2p2);
+ flat_pq[2] = _mm_and_si128(flat, flat_pq[2]);
+ *q2p2 = _mm_or_si128(*q2p2, flat_pq[2]);
+
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
+ flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q0_16));
+ flat2_q[0] = _mm_add_epi16(
+ sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq_16[0]));
+
+ flat2_p[1] = _mm_add_epi16(sum_p, work0_1);
+ flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8));
+
+ flat2_pq[0] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
+ flat2_pq[1] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
+ flat2_pq[0] = _mm_packus_epi16(flat2_pq[0], flat2_pq[0]);
+ flat2_pq[1] = _mm_packus_epi16(flat2_pq[1], flat2_pq[1]);
+
+ sum_p = _mm_sub_epi16(sum_p, q4_16);
+ sum_q = _mm_sub_epi16(sum_q, pq_16[4]);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
+ work0 = _mm_add_epi16(
+ sum_p6, _mm_add_epi16(pq_16[2], _mm_add_epi16(pq_16[3], pq_16[1])));
+ flat2_p[2] = _mm_add_epi16(sum_p, work0);
+ flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[2] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
+ flat2_pq[2] = _mm_packus_epi16(flat2_pq[2], flat2_pq[2]);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
+ sum_p = _mm_sub_epi16(sum_p, q3_16);
+ sum_q = _mm_sub_epi16(sum_q, pq_16[3]);
+
+ work0 = _mm_add_epi16(
+ sum_p6, _mm_add_epi16(pq_16[3], _mm_add_epi16(pq_16[4], pq_16[2])));
+ flat2_p[3] = _mm_add_epi16(sum_p, work0);
+ flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[3] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
+ flat2_pq[3] = _mm_packus_epi16(flat2_pq[3], flat2_pq[3]);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
+ sum_p = _mm_sub_epi16(sum_p, q2_16);
+ sum_q = _mm_sub_epi16(sum_q, pq_16[2]);
+
+ work0 = _mm_add_epi16(
+ sum_p6, _mm_add_epi16(pq_16[4], _mm_add_epi16(pq_16[5], pq_16[3])));
+ flat2_p[4] = _mm_add_epi16(sum_p, work0);
+ flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[4] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
+ flat2_pq[4] = _mm_packus_epi16(flat2_pq[4], flat2_pq[4]);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
+ sum_p = _mm_sub_epi16(sum_p, q1_16);
+ sum_q = _mm_sub_epi16(sum_q, pq_16[1]);
+
+ work0 = _mm_add_epi16(
+ sum_p6, _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[6], pq_16[4])));
+ flat2_p[5] = _mm_add_epi16(sum_p, work0);
+ flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[5] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
+ flat2_pq[5] = _mm_packus_epi16(flat2_pq[5], flat2_pq[5]);
+
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ *q0p0 = _mm_andnot_si128(flat2, *q0p0);
+ flat2_pq[0] = _mm_and_si128(flat2, flat2_pq[0]);
+ *q0p0 = _mm_or_si128(*q0p0, flat2_pq[0]);
+
+ *q1p1 = _mm_andnot_si128(flat2, *q1p1);
+ flat2_pq[1] = _mm_and_si128(flat2, flat2_pq[1]);
+ *q1p1 = _mm_or_si128(*q1p1, flat2_pq[1]);
+
+ *q2p2 = _mm_andnot_si128(flat2, *q2p2);
+ flat2_pq[2] = _mm_and_si128(flat2, flat2_pq[2]);
+ *q2p2 = _mm_or_si128(*q2p2, flat2_pq[2]);
+
+ *q3p3 = _mm_andnot_si128(flat2, *q3p3);
+ flat2_pq[3] = _mm_and_si128(flat2, flat2_pq[3]);
+ *q3p3 = _mm_or_si128(*q3p3, flat2_pq[3]);
+
+ *q4p4 = _mm_andnot_si128(flat2, *q4p4);
+ flat2_pq[4] = _mm_and_si128(flat2, flat2_pq[4]);
+ *q4p4 = _mm_or_si128(*q4p4, flat2_pq[4]);
+
+ *q5p5 = _mm_andnot_si128(flat2, *q5p5);
+ flat2_pq[5] = _mm_and_si128(flat2, flat2_pq[5]);
+ *q5p5 = _mm_or_si128(*q5p5, flat2_pq[5]);
+ }
+ } else {
+ *q0p0 = qs0ps0;
+ *q1p1 = qs1ps1;
+ }
+}
+
+void aom_lpf_horizontal_14_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit,
+ const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
+ __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+ __m128i limit = _mm_load_si128((const __m128i *)_limit);
+ __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+
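+  // Each qNpN register packs the four pN pixels (row s - (N + 1) * p) in its
+  // low 32 bits and the four qN pixels (row s + N * p) in the next 32 bits.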
+ q4p4 = _mm_unpacklo_epi32(xx_loadl_32(s - 5 * p), xx_loadl_32(s + 4 * p));
+ q3p3 = _mm_unpacklo_epi32(xx_loadl_32(s - 4 * p), xx_loadl_32(s + 3 * p));
+ q2p2 = _mm_unpacklo_epi32(xx_loadl_32(s - 3 * p), xx_loadl_32(s + 2 * p));
+ q1p1 = _mm_unpacklo_epi32(xx_loadl_32(s - 2 * p), xx_loadl_32(s + 1 * p));
+
+ q0p0 = _mm_unpacklo_epi32(xx_loadl_32(s - 1 * p), xx_loadl_32(s - 0 * p));
+
+ q5p5 = _mm_unpacklo_epi32(xx_loadl_32(s - 6 * p), xx_loadl_32(s + 5 * p));
+
+ q6p6 = _mm_unpacklo_epi32(xx_loadl_32(s - 7 * p), xx_loadl_32(s + 6 * p));
+
+ lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
+ &limit, &thresh);
+
+ store_buffer_horz_8(q0p0, p, 0, s);
+ store_buffer_horz_8(q1p1, p, 1, s);
+ store_buffer_horz_8(q2p2, p, 2, s);
+ store_buffer_horz_8(q3p3, p, 3, s);
+ store_buffer_horz_8(q4p4, p, 4, s);
+ store_buffer_horz_8(q5p5, p, 5, s);
+}
+
+static AOM_FORCE_INLINE void lpf_internal_6_dual_sse2(
+ __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0,
+ __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit,
+ __m128i *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i mask, hev, flat;
+ __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1;
+ __m128i p2_16, q2_16, p1_16, q1_16, p0_16, q0_16;
+ __m128i ps1ps0, qs1qs0;
+
+ q2p2 = _mm_unpacklo_epi64(*p2, *q2);
+ q1p1 = _mm_unpacklo_epi64(*p1, *q1);
+ q0p0 = _mm_unpacklo_epi64(*p0, *q0);
+
+ *p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+ *q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
+
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i fe = _mm_set1_epi8((char)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(fe, fe);
+
+ {
+ // filter_mask and hev_mask
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+ abs_p1p0 = abs_diff(q1p1, q0p0);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+
+ abs_p0q0 = abs_diff(*p1p0, *q1q0);
+ abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
+ abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
+
+    // SSE2 has no unsigned byte comparison, so the idea is to detect whether
+    // any value exceeds its limit, in which case the corresponding mask bits
+    // must be set. To do that we take the maximum over all of the abs(x - y)
+    // inputs and over abs(p0 - q0) * 2 + abs(p1 - q1) / 2; if that maximum is
+    // greater than the limit the mask is set, otherwise it is not.
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, *thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+    // replicate so it lines up with the "merged" p/q variables used below
+ hev = _mm_unpacklo_epi64(hev, hev);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = abs_diff(q2p2, q1p1);
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_dual_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0);
+
+ // flat_mask
+ flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+    // replicate so it lines up with the "merged" p/q variables used below
+ flat = _mm_unpacklo_epi64(flat, flat);
+ }
+
+  // 5-tap filter: only needed if flat != 0
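+  // A scalar sketch of the narrow (filter6) taps computed below:
+  //   op1 = (p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4) >> 3
+  //   op0 = (p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1 + 4) >> 3
+  //   oq0 = (p1 + p0 * 2 + q0 * 2 + q1 * 2 + q2 + 4) >> 3
+  //   oq1 = (p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4) >> 3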
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+ p2_16 = _mm_unpacklo_epi8(*p2, zero);
+ p1_16 = _mm_unpacklo_epi8(*p1, zero);
+ p0_16 = _mm_unpacklo_epi8(*p0, zero);
+ q0_16 = _mm_unpacklo_epi8(*q0, zero);
+ q1_16 = _mm_unpacklo_epi8(*q1, zero);
+ q2_16 = _mm_unpacklo_epi8(*q2, zero);
+
+ // op1
+ workp_a = _mm_add_epi16(_mm_add_epi16(p0_16, p0_16),
+ _mm_add_epi16(p1_16, p1_16)); // p0 *2 + p1 * 2
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
+ p2_16); // p2 + p0 * 2 + p1 * 2 + 4
+
+ workp_b = _mm_add_epi16(_mm_add_epi16(p2_16, p2_16), q0_16);
+ workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b),
+ 3); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
+
+ // op0
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q0_16), q1_16); // q0 * 2 + q1
+ workp_a = _mm_add_epi16(workp_a,
+ workp_b); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
+ workp_shft1 = _mm_srli_epi16(workp_a, 3);
+
+ flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0);
+
+ // oq0
+ workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p2_16),
+ p1_16); // p0 * 2 + p1 + q0 * 2 + q1 + 4
+ workp_b = _mm_add_epi16(q1_16, q2_16);
+ workp_a = _mm_add_epi16(
+ workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4
+ workp_shft0 = _mm_srli_epi16(workp_a, 3);
+
+ // oq1
+ workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p1_16),
+ p0_16); // p0 + q0 * 2 + q1 * 2 + q2 + 4
+ workp_b = _mm_add_epi16(q2_16, q2_16);
+ workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b),
+ 3); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4
+
+ flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1);
+
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0);
+ *q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
+
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0);
+ *p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
+ }
+}
+
+static AOM_FORCE_INLINE void lpf_internal_6_sse2(
+ __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0,
+ __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit,
+ __m128i *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i mask, hev, flat;
+ __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1;
+ __m128i pq2_16, q2_16, pq1_16, pq0_16, q0_16;
+ __m128i ps1ps0, qs1qs0;
+
+ q2p2 = _mm_unpacklo_epi32(*p2, *q2);
+ q1p1 = _mm_unpacklo_epi32(*p1, *q1);
+ q0p0 = _mm_unpacklo_epi32(*p0, *q0);
+
+ *p1p0 = _mm_unpacklo_epi32(*p0, *p1);
+ *q1q0 = _mm_unpacklo_epi32(*q0, *q1);
+
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i fe = _mm_set1_epi8((char)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(fe, fe);
+ {
+ // filter_mask and hev_mask
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+ abs_p1p0 = abs_diff(q1p1, q0p0);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
+
+ abs_p0q0 = abs_diff(*p1p0, *q1q0);
+ abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
+
+    // SSE2 has no unsigned byte comparison, so the idea is to detect whether
+    // any value exceeds its limit, in which case the corresponding mask bits
+    // must be set. To do that we take the maximum over all of the abs(x - y)
+    // inputs and over abs(p0 - q0) * 2 + abs(p1 - q1) / 2; if that maximum is
+    // greater than the limit the mask is set, otherwise it is not.
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, *thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+    // replicate so it lines up with the "merged" p/q variables used below
+ hev = _mm_unpacklo_epi32(hev, hev);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+ mask = _mm_unpacklo_epi32(mask, zero);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = abs_diff(q2p2, q1p1);
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0);
+
+ // flat_mask
+ flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+    // replicate so it lines up with the "merged" p/q variables used below
+ flat = _mm_unpacklo_epi32(flat, flat);
+ flat = _mm_unpacklo_epi64(flat, flat);
+ }
+
+  // 5-tap filter: only needed if flat != 0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i workp_a, workp_b, workp_c;
+ __m128i pq0x2_pq1, pq1_pq2;
+ pq2_16 = _mm_unpacklo_epi8(q2p2, zero);
+ pq1_16 = _mm_unpacklo_epi8(q1p1, zero);
+ pq0_16 = _mm_unpacklo_epi8(q0p0, zero);
+ q0_16 = _mm_srli_si128(pq0_16, 8);
+ q2_16 = _mm_srli_si128(pq2_16, 8);
+
+ // op1
+ pq0x2_pq1 =
+ _mm_add_epi16(_mm_add_epi16(pq0_16, pq0_16), pq1_16); // p0 *2 + p1
+ pq1_pq2 = _mm_add_epi16(pq1_16, pq2_16); // p1 + p2
+ workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four),
+ pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4
+
+ workp_b = _mm_add_epi16(_mm_add_epi16(pq2_16, pq2_16), q0_16);
+ workp_b =
+ _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
+
+ // op0
+ workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1
+ workp_a = _mm_add_epi16(workp_a,
+ workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
+ workp_b = _mm_unpacklo_epi64(workp_a, workp_b);
+ workp_b = _mm_srli_epi16(workp_b, 3);
+
+ flat_p1p0 = _mm_packus_epi16(workp_b, workp_b);
+
+ // oq0
+ workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq2_16),
+ pq1_16); // p0 * 2 + p1 + q0 * 2 + q1 + 4
+ workp_b = _mm_srli_si128(pq1_pq2, 8);
+ workp_a = _mm_add_epi16(
+ workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4
+ // workp_shft0 = _mm_srli_epi16(workp_a, 3);
+
+ // oq1
+ workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq1_16),
+ pq0_16); // p0 + q0 * 2 + q1 * 2 + q2 + 4
+ workp_b = _mm_add_epi16(q2_16, q2_16);
+ workp_b =
+ _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4
+
+ workp_a = _mm_unpacklo_epi64(workp_a, workp_b);
+ workp_a = _mm_srli_epi16(workp_a, 3);
+
+ flat_q0q1 = _mm_packus_epi16(workp_a, workp_a);
+
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0);
+ *q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
+
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0);
+ *p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
+ }
+}
+
+void aom_lpf_horizontal_6_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit,
+ const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i p2, p1, p0, q0, q1, q2;
+ __m128i p1p0, q1q0;
+ __m128i blimit = _mm_load_si128((__m128i *)_blimit);
+ __m128i limit = _mm_load_si128((__m128i *)_limit);
+ __m128i thresh = _mm_load_si128((__m128i *)_thresh);
+
+ p2 = xx_loadl_32(s - 3 * p);
+ p1 = xx_loadl_32(s - 2 * p);
+ p0 = xx_loadl_32(s - 1 * p);
+ q0 = xx_loadl_32(s - 0 * p);
+ q1 = xx_loadl_32(s + 1 * p);
+ q2 = xx_loadl_32(s + 2 * p);
+
+ lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
+ &limit, &thresh);
+
+ xx_storel_32(s - 1 * p, p1p0);
+ xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4));
+ xx_storel_32(s + 0 * p, q1q0);
+ xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4));
+}
+
+void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0,
+ const unsigned char *_blimit1,
+ const unsigned char *_limit1,
+ const unsigned char *_thresh1) {
+ __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
+ _mm_load_si128((__m128i *)_blimit1));
+ __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
+ _mm_load_si128((__m128i *)_limit1));
+ __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
+ _mm_load_si128((__m128i *)_thresh1));
+
+ __m128i p2, p1, p0, q0, q1, q2;
+ __m128i p1p0, q1q0;
+
+ p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+ p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+ p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+ q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
+ q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+ q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+
+ lpf_internal_6_dual_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
+ &limit, &thresh);
+
+ _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
+ _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
+ _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
+ _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
+}
+
+static AOM_FORCE_INLINE void lpf_internal_8_sse2(
+ __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
+ __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
+ __m128i *blimit, __m128i *limit, __m128i *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i mask, hev, flat;
+ __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3,
+ flat_p1p0, flat_q0q1;
+ __m128i q2p2, q1p1, q0p0;
+ __m128i q1q0, p1p0, ps1ps0, qs1qs0;
+ __m128i work_pq, opq2, pq2;
+
+ q3p3 = _mm_unpacklo_epi32(*p3, *q3);
+ q2p2 = _mm_unpacklo_epi32(*p2, *q2);
+ q1p1 = _mm_unpacklo_epi32(*p1, *q1);
+ q0p0 = _mm_unpacklo_epi32(*p0, *q0);
+
+ p1p0 = _mm_unpacklo_epi32(q0p0, q1p1); // p1p0 q1q0
+ q1q0 = _mm_srli_si128(p1p0, 8);
+
+ // filter_mask and hev_mask
+
+  // SSE2 has no unsigned byte comparison, so the idea is to detect whether
+  // any value exceeds its limit, in which case the corresponding mask bits
+  // must be set. To do that we take the maximum over all of the abs(x - y)
+  // inputs and over abs(p0 - q0) * 2 + abs(p1 - q1) / 2; if that maximum is
+  // greater than the limit the mask is set, otherwise it is not.
+
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i fe = _mm_set1_epi8((char)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(fe, fe);
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+
+ abs_p1p0 = abs_diff(q1p1, q0p0);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
+
+ abs_p0q0 = abs_diff(p1p0, q1q0);
+ abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, *thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+  // replicate so it lines up with the "merged" p/q variables used below
+ hev = _mm_unpacklo_epi32(hev, hev);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+ mask = _mm_unpacklo_epi32(mask, zero);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
+
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
+
+ // flat_mask4
+ flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+  // replicate so it lines up with the "merged" p/q variables used below
+ flat = _mm_unpacklo_epi32(flat, flat);
+ flat = _mm_unpacklo_epi64(flat, flat);
+
+  // filter8: only needed if flat != 0
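+  // A scalar sketch of the filter8 taps computed below:
+  //   op2 = (3 * p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3
+  //   op1 = (2 * p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3
+  //   op0 = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3
+  //   oq0 = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3
+  //   oq1 = (p1 + p0 + q0 + 2 * q1 + q2 + 2 * q3 + 4) >> 3
+  //   oq2 = (p0 + q0 + q1 + 2 * q2 + 3 * q3 + 4) >> 3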
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i workp_a, workp_b, workp_c, workp_d, workp_shft1, workp_shft2;
+ p2_16 = _mm_unpacklo_epi8(*p2, zero);
+ p1_16 = _mm_unpacklo_epi8(*p1, zero);
+ p0_16 = _mm_unpacklo_epi8(*p0, zero);
+ q0_16 = _mm_unpacklo_epi8(*q0, zero);
+ q1_16 = _mm_unpacklo_epi8(*q1, zero);
+ q2_16 = _mm_unpacklo_epi8(*q2, zero);
+ p3_16 = _mm_unpacklo_epi8(*p3, zero);
+ q3_16 = _mm_unpacklo_epi8(*q3, zero);
+
+ // op2
+ workp_a =
+ _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16));
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16);
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16);
+ workp_shft2 = _mm_add_epi16(workp_a, workp_b);
+
+ // op1
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16);
+ workp_c = _mm_add_epi16(workp_a, workp_b);
+ // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // op0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16);
+ workp_d = _mm_add_epi16(workp_a, workp_b);
+ // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ workp_c = _mm_unpacklo_epi64(workp_d, workp_c);
+ workp_c = _mm_srli_epi16(workp_c, 3);
+ flat_p1p0 = _mm_packus_epi16(workp_c, workp_c);
+
+ // oq0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16);
+ // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ workp_c = _mm_add_epi16(workp_a, workp_b);
+
+ // oq1
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16);
+ workp_d = _mm_add_epi16(workp_a, workp_b);
+ // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ workp_c = _mm_unpacklo_epi64(workp_c, workp_d);
+ workp_c = _mm_srli_epi16(workp_c, 3);
+ flat_q0q1 = _mm_packus_epi16(workp_c, workp_c);
+
+ // oq2
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16);
+ workp_shft1 = _mm_add_epi16(workp_a, workp_b);
+
+ workp_c = _mm_unpacklo_epi64(workp_shft2, workp_shft1);
+ workp_c = _mm_srli_epi16(workp_c, 3);
+
+ opq2 = _mm_packus_epi16(workp_c, workp_c);
+
+ work_pq = _mm_andnot_si128(flat, q2p2);
+ pq2 = _mm_and_si128(flat, opq2);
+ *p2 = _mm_or_si128(work_pq, pq2);
+ *q2 = _mm_srli_si128(*p2, 4);
+
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
+ q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
+
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
+ p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+ }
+}
+
+static AOM_FORCE_INLINE void lpf_internal_8_dual_sse2(
+ __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
+ __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
+ __m128i *blimit, __m128i *limit, __m128i *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i mask, hev, flat;
+ __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3,
+ flat_p1p0, flat_q0q1;
+ __m128i q2p2, q1p1, q0p0;
+ __m128i q1q0, p1p0, ps1ps0, qs1qs0;
+ __m128i work_pq, opq2, pq2;
+
+ q3p3 = _mm_unpacklo_epi64(*p3, *q3);
+ q2p2 = _mm_unpacklo_epi64(*p2, *q2);
+ q1p1 = _mm_unpacklo_epi64(*p1, *q1);
+ q0p0 = _mm_unpacklo_epi64(*p0, *q0);
+
+ p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+ q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
+
+ {
+ // filter_mask and hev_mask
+
+    // SSE2 has no unsigned byte comparison, so the idea is to detect whether
+    // any value exceeds its limit, in which case the corresponding mask bits
+    // must be set. To do that we take the maximum over all of the abs(x - y)
+    // inputs and over abs(p0 - q0) * 2 + abs(p1 - q1) / 2; if that maximum is
+    // greater than the limit the mask is set, otherwise it is not.
+
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i fe = _mm_set1_epi8((char)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(fe, fe);
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+
+ abs_p1p0 = abs_diff(q1p1, q0p0);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+
+ abs_p0q0 = abs_diff(p1p0, q1q0);
+ abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
+ abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, abs_p0q0);
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, *thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+    // replicate so it lines up with the "merged" p/q variables used below
+ hev = _mm_unpacklo_epi64(hev, hev);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
+
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
+
+ // flat_mask4
+ flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+    // replicate so it lines up with the "merged" p/q variables used below
+ flat = _mm_unpacklo_epi64(flat, flat);
+ }
+
+  // filter8: only needed if flat != 0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ const __m128i four = _mm_set1_epi16(4);
+
+ __m128i workp_a, workp_b, workp_shft0, workp_shft1, workp_shft2;
+ p2_16 = _mm_unpacklo_epi8(*p2, zero);
+ p1_16 = _mm_unpacklo_epi8(*p1, zero);
+ p0_16 = _mm_unpacklo_epi8(*p0, zero);
+ q0_16 = _mm_unpacklo_epi8(*q0, zero);
+ q1_16 = _mm_unpacklo_epi8(*q1, zero);
+ q2_16 = _mm_unpacklo_epi8(*q2, zero);
+ p3_16 = _mm_unpacklo_epi8(*p3, zero);
+ q3_16 = _mm_unpacklo_epi8(*q3, zero);
+
+ // op2
+ workp_a =
+ _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16));
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16);
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16);
+ workp_shft2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // op1
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16);
+ workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // op0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16);
+ workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0);
+
+ // oq0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16);
+ workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // oq1
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16);
+ workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1);
+
+ // oq2
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16);
+ workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ opq2 = _mm_packus_epi16(workp_shft2, workp_shft1);
+
+ work_pq = _mm_andnot_si128(flat, q2p2);
+ pq2 = _mm_and_si128(flat, opq2);
+ *p2 = _mm_or_si128(work_pq, pq2);
+ *q2 = _mm_srli_si128(*p2, 8);
+
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
+ q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
+
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
+ p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+ }
+}
+
+void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit,
+ const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+ __m128i q1q0, p1p0;
+ __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+ __m128i limit = _mm_load_si128((const __m128i *)_limit);
+ __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+
+ p3 = xx_loadl_32(s - 4 * p);
+ p2 = xx_loadl_32(s - 3 * p);
+ p1 = xx_loadl_32(s - 2 * p);
+ p0 = xx_loadl_32(s - 1 * p);
+ q0 = xx_loadl_32(s - 0 * p);
+ q1 = xx_loadl_32(s + 1 * p);
+ q2 = xx_loadl_32(s + 2 * p);
+ q3 = xx_loadl_32(s + 3 * p);
+
+ lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
+ &blimit, &limit, &thresh);
+
+ xx_storel_32(s - 1 * p, p1p0);
+ xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4));
+ xx_storel_32(s + 0 * p, q1q0);
+ xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4));
+ xx_storel_32(s - 3 * p, p2);
+ xx_storel_32(s + 2 * p, q2);
+}
+
+void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0,
+ const unsigned char *_blimit1,
+ const unsigned char *_limit1,
+ const unsigned char *_thresh1) {
+ __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
+ __m128i blimit =
+ _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
+ _mm_load_si128((const __m128i *)_blimit1));
+ __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
+ _mm_load_si128((const __m128i *)_limit1));
+ __m128i thresh =
+ _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0),
+ _mm_load_si128((const __m128i *)_thresh1));
+
+ q4p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 5 * p)),
+ _mm_loadl_epi64((__m128i *)(s + 4 * p)));
+ q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
+ _mm_loadl_epi64((__m128i *)(s + 3 * p)));
+ q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
+ _mm_loadl_epi64((__m128i *)(s + 2 * p)));
+ q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
+ _mm_loadl_epi64((__m128i *)(s + 1 * p)));
+
+ q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
+ _mm_loadl_epi64((__m128i *)(s - 0 * p)));
+
+ q5p5 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 6 * p)),
+ _mm_loadl_epi64((__m128i *)(s + 5 * p)));
+
+ q6p6 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 7 * p)),
+ _mm_loadl_epi64((__m128i *)(s + 6 * p)));
+
+ lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0,
+ &blimit, &limit, &thresh);
+
+ _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
+ _mm_storel_epi64((__m128i *)(s + 0 * p), _mm_srli_si128(q0p0, 8));
+ _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
+ _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1p1, 8));
+ _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
+ _mm_storel_epi64((__m128i *)(s + 2 * p), _mm_srli_si128(q2p2, 8));
+ _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
+ _mm_storel_epi64((__m128i *)(s + 3 * p), _mm_srli_si128(q3p3, 8));
+ _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
+ _mm_storel_epi64((__m128i *)(s + 4 * p), _mm_srli_si128(q4p4, 8));
+ _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
+ _mm_storel_epi64((__m128i *)(s + 5 * p), _mm_srli_si128(q5p5, 8));
+}
+
+void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
+ const uint8_t *_limit0,
+ const uint8_t *_thresh0,
+ const uint8_t *_blimit1,
+ const uint8_t *_limit1,
+ const uint8_t *_thresh1) {
+ __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
+ _mm_load_si128((__m128i *)_blimit1));
+ __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
+ _mm_load_si128((__m128i *)_limit1));
+ __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
+ _mm_load_si128((__m128i *)_thresh1));
+
+ __m128i p2, p1, p0, q0, q1, q2, p3, q3;
+ __m128i q1q0, p1p0;
+
+ p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
+ p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+ p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+ p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+ q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
+ q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+ q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+ q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
+
+ lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
+ &blimit, &limit, &thresh);
+
+ _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
+ _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
+ _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
+ _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
+ _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
+ _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
+}
+
+void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0,
+ const unsigned char *_blimit1,
+ const unsigned char *_limit1,
+ const unsigned char *_thresh1) {
+ __m128i p1, p0, q0, q1;
+ __m128i qs1qs0, ps1ps0;
+
+ p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+ p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+ q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
+ q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i blimit =
+ _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
+ _mm_load_si128((const __m128i *)_blimit1));
+ const __m128i limit =
+ _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
+ _mm_load_si128((const __m128i *)_limit1));
+
+ __m128i l = _mm_unpacklo_epi64(blimit, limit);
+
+ __m128i thresh0 =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero);
+
+ __m128i thresh1 =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero);
+
+ __m128i t = _mm_unpacklo_epi64(thresh0, thresh1);
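+  // l packs { blimit0 | blimit1 } in its low 64 bits and { limit0 | limit1 }
+  // in its high 64 bits; t packs the two widened thresh values. This matches
+  // the column layout of the dual filter, where the first four columns use
+  // parameter set 0 and the next four use set 1.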
+
+ lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
+
+ _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);
+ _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(ps1ps0, 8));
+ _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);
+ _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(qs1qs0, 8));
+}
+
+void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
+ const uint8_t *_limit0,
+ const uint8_t *_thresh0,
+ const uint8_t *_blimit1,
+ const uint8_t *_limit1,
+ const uint8_t *_thresh1) {
+ __m128i p0, q0, q1, p1;
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+ __m128i qs1qs0, ps1ps0;
+
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i blimit =
+ _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
+ _mm_load_si128((const __m128i *)_blimit1));
+ const __m128i limit =
+ _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
+ _mm_load_si128((const __m128i *)_limit1));
+
+ __m128i l = _mm_unpacklo_epi64(blimit, limit);
+
+ __m128i thresh0 =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero);
+
+ __m128i thresh1 =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero);
+
+ __m128i t = _mm_unpacklo_epi64(thresh0, thresh1);
+
+ x0 = _mm_loadl_epi64((__m128i *)((s - 2)));
+ x1 = _mm_loadl_epi64((__m128i *)((s - 2) + p));
+ x2 = _mm_loadl_epi64((__m128i *)((s - 2) + 2 * p));
+ x3 = _mm_loadl_epi64((__m128i *)((s - 2) + 3 * p));
+ x4 = _mm_loadl_epi64((__m128i *)((s - 2) + 4 * p));
+ x5 = _mm_loadl_epi64((__m128i *)((s - 2) + 5 * p));
+ x6 = _mm_loadl_epi64((__m128i *)((s - 2) + 6 * p));
+ x7 = _mm_loadl_epi64((__m128i *)((s - 2) + 7 * p));
+
+ transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p1, &p0, &q0,
+ &q1);
+
+ lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
+
+ p1 = _mm_srli_si128(ps1ps0, 8);
+ q1 = _mm_srli_si128(qs1qs0, 8);
+
+ transpose4x8_8x4_sse2(&p1, &ps1ps0, &qs1qs0, &q1, &d0, &d1, &d2, &d3, &d4,
+ &d5, &d6, &d7);
+
+ xx_storel_32((s - 2 + 0 * p), d0);
+ xx_storel_32((s - 2 + 1 * p), d1);
+ xx_storel_32((s - 2 + 2 * p), d2);
+ xx_storel_32((s - 2 + 3 * p), d3);
+ xx_storel_32((s - 2 + 4 * p), d4);
+ xx_storel_32((s - 2 + 5 * p), d5);
+ xx_storel_32((s - 2 + 6 * p), d6);
+ xx_storel_32((s - 2 + 7 * p), d7);
+}
+
+void aom_lpf_vertical_6_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit,
+ const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+ __m128i x2, x1, x0, x3;
+ __m128i p0, q0;
+ __m128i p1p0, q1q0;
+ __m128i blimit = _mm_load_si128((__m128i *)_blimit);
+ __m128i limit = _mm_load_si128((__m128i *)_limit);
+ __m128i thresh = _mm_load_si128((__m128i *)_thresh);
+
+ x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p));
+ x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p));
+ x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p));
+ x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p));
+
+ transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
+ &d7);
+
+ lpf_internal_6_sse2(&d0, &d5, &d1, &d4, &d2, &d3, &q1q0, &p1p0, &blimit,
+ &limit, &thresh);
+
+ p0 = _mm_srli_si128(p1p0, 4);
+ q0 = _mm_srli_si128(q1q0, 4);
+
+ transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3);
+
+ xx_storel_32(s + 0 * p - 2, d0);
+ xx_storel_32(s + 1 * p - 2, d1);
+ xx_storel_32(s + 2 * p - 2, d2);
+ xx_storel_32(s + 3 * p - 2, d3);
+}
+
+void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
+ const uint8_t *_limit0,
+ const uint8_t *_thresh0,
+ const uint8_t *_blimit1,
+ const uint8_t *_limit1,
+ const uint8_t *_thresh1) {
+ __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
+ _mm_load_si128((__m128i *)_blimit1));
+ __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
+ _mm_load_si128((__m128i *)_limit1));
+ __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
+ _mm_load_si128((__m128i *)_thresh1));
+
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128i p0, q0;
+ __m128i p1p0, q1q0;
+ __m128i d0d1, d2d3, d4d5, d6d7;
+
+ x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p));
+ x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p));
+ x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p));
+ x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p));
+ x4 = _mm_loadl_epi64((__m128i *)((s - 3) + 4 * p));
+ x5 = _mm_loadl_epi64((__m128i *)((s - 3) + 5 * p));
+ x6 = _mm_loadl_epi64((__m128i *)((s - 3) + 6 * p));
+ x7 = _mm_loadl_epi64((__m128i *)((s - 3) + 7 * p));
+
+ transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5,
+ &d6d7);
+
+ d1 = _mm_srli_si128(d0d1, 8);
+ d3 = _mm_srli_si128(d2d3, 8);
+ d5 = _mm_srli_si128(d4d5, 8);
+ d7 = _mm_srli_si128(d6d7, 8);
+
+ lpf_internal_6_dual_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0,
+ &blimit, &limit, &thresh);
+
+ p0 = _mm_srli_si128(p1p0, 8);
+ q0 = _mm_srli_si128(q1q0, 8);
+
+ transpose4x8_8x4_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3, &d4, &d5,
+ &d6, &d7);
+
+ xx_storel_32((s - 2 + 0 * p), d0);
+ xx_storel_32((s - 2 + 1 * p), d1);
+ xx_storel_32((s - 2 + 2 * p), d2);
+ xx_storel_32((s - 2 + 3 * p), d3);
+ xx_storel_32((s - 2 + 4 * p), d4);
+ xx_storel_32((s - 2 + 5 * p), d5);
+ xx_storel_32((s - 2 + 6 * p), d6);
+ xx_storel_32((s - 2 + 7 * p), d7);
+}
+
+void aom_lpf_vertical_8_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit,
+ const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+
+ __m128i p0, q0;
+ __m128i x2, x1, x0, x3;
+ __m128i q1q0, p1p0;
+ __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+ __m128i limit = _mm_load_si128((const __m128i *)_limit);
+ __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+
+ x3 = _mm_loadl_epi64((__m128i *)((s - 4) + 0 * p));
+ x2 = _mm_loadl_epi64((__m128i *)((s - 4) + 1 * p));
+ x1 = _mm_loadl_epi64((__m128i *)((s - 4) + 2 * p));
+ x0 = _mm_loadl_epi64((__m128i *)((s - 4) + 3 * p));
+
+ transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
+ &d7);
+ // Loop filtering
+ lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0,
+ &blimit, &limit, &thresh);
+
+ p0 = _mm_srli_si128(p1p0, 4);
+ q0 = _mm_srli_si128(q1q0, 4);
+
+ transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0, &d1,
+ &d2, &d3);
+
+ _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0);
+ _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), d1);
+ _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2);
+ _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), d3);
+}
+
+void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
+ const uint8_t *_limit0,
+ const uint8_t *_thresh0,
+ const uint8_t *_blimit1,
+ const uint8_t *_limit1,
+ const uint8_t *_thresh1) {
+ __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
+ _mm_load_si128((__m128i *)_blimit1));
+ __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
+ _mm_load_si128((__m128i *)_limit1));
+ __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
+ _mm_load_si128((__m128i *)_thresh1));
+
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128i d1, d3, d5, d7;
+ __m128i q1q0, p1p0;
+ __m128i p1, q1;
+ __m128i d0d1, d2d3, d4d5, d6d7;
+
+ x0 = _mm_loadl_epi64((__m128i *)(s - 4 + 0 * p));
+ x1 = _mm_loadl_epi64((__m128i *)(s - 4 + 1 * p));
+ x2 = _mm_loadl_epi64((__m128i *)(s - 4 + 2 * p));
+ x3 = _mm_loadl_epi64((__m128i *)(s - 4 + 3 * p));
+ x4 = _mm_loadl_epi64((__m128i *)(s - 4 + 4 * p));
+ x5 = _mm_loadl_epi64((__m128i *)(s - 4 + 5 * p));
+ x6 = _mm_loadl_epi64((__m128i *)(s - 4 + 6 * p));
+ x7 = _mm_loadl_epi64((__m128i *)(s - 4 + 7 * p));
+
+ transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5,
+ &d6d7);
+
+ d1 = _mm_srli_si128(d0d1, 8);
+ d3 = _mm_srli_si128(d2d3, 8);
+ d5 = _mm_srli_si128(d4d5, 8);
+ d7 = _mm_srli_si128(d6d7, 8);
+
+ lpf_internal_8_dual_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5,
+ &q1q0, &p1p0, &blimit, &limit, &thresh);
+
+ p1 = _mm_srli_si128(p1p0, 8);
+ q1 = _mm_srli_si128(q1q0, 8);
+
+ transpose8x8_sse2(&d0d1, &d1, &p1, &p1p0, &q1q0, &q1, &d6d7, &d7, &d0d1,
+ &d2d3, &d4d5, &d6d7);
+
+ _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0d1);
+ _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), _mm_srli_si128(d0d1, 8));
+ _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2d3);
+ _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), _mm_srli_si128(d2d3, 8));
+ _mm_storel_epi64((__m128i *)(s - 4 + 4 * p), d4d5);
+ _mm_storel_epi64((__m128i *)(s - 4 + 5 * p), _mm_srli_si128(d4d5, 8));
+ _mm_storel_epi64((__m128i *)(s - 4 + 6 * p), d6d7);
+ _mm_storel_epi64((__m128i *)(s - 4 + 7 * p), _mm_srli_si128(d6d7, 8));
+}
+
+void aom_lpf_vertical_14_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit,
+ const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
+ __m128i x6, x5, x4, x3;
+ __m128i pq0, pq1, pq2, pq3;
+ __m128i blimit = _mm_load_si128((__m128i *)_blimit);
+ __m128i limit = _mm_load_si128((__m128i *)_limit);
+ __m128i thresh = _mm_load_si128((__m128i *)_thresh);
+
+ x6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p));
+ x5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p));
+ x4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p));
+ x3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p));
+
+ transpose_pq_14_sse2(&x6, &x5, &x4, &x3, &q0p0, &q1p1, &q2p2, &q3p3, &q4p4,
+ &q5p5, &q6p6, &q7p7);
+
+ lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
+ &limit, &thresh);
+
+ transpose_pq_14_inv_sse2(&q7p7, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1,
+ &q0p0, &pq0, &pq1, &pq2, &pq3);
+ _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), pq0);
+ _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), pq1);
+ _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), pq2);
+ _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), pq3);
+}
+
+void aom_lpf_vertical_14_dual_sse2(
+ unsigned char *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+ const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+ const uint8_t *_thresh1) {
+ __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
+ __m128i x7, x6, x5, x4, x3, x2, x1, x0;
+ __m128i d0d1, d2d3, d4d5, d6d7, d8d9, d10d11, d12d13, d14d15;
+ __m128i q0, q1, q2, q3, q7;
+ __m128i p0p1, p2p3, p4p5, p6p7;
+
+ __m128i blimit =
+ _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
+ _mm_load_si128((const __m128i *)_blimit1));
+ __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
+ _mm_load_si128((const __m128i *)_limit1));
+ __m128i thresh =
+ _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0),
+ _mm_load_si128((const __m128i *)_thresh1));
+
+ x7 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p));
+ x6 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p));
+ x5 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p));
+ x4 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p));
+ x3 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * p));
+ x2 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * p));
+ x1 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * p));
+ x0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * p));
+
+ transpose8x16_16x8_sse2(&x7, &x6, &x5, &x4, &x3, &x2, &x1, &x0, &d0d1, &d2d3,
+ &d4d5, &d6d7, &d8d9, &d10d11, &d12d13, &d14d15);
+
+ q6p6 = _mm_unpacklo_epi64(d2d3, _mm_srli_si128(d12d13, 8));
+ q5p5 = _mm_unpacklo_epi64(d4d5, _mm_srli_si128(d10d11, 8));
+ q4p4 = _mm_unpacklo_epi64(d6d7, _mm_srli_si128(d8d9, 8));
+ q3p3 = _mm_unpacklo_epi64(d8d9, _mm_srli_si128(d6d7, 8));
+ q2p2 = _mm_unpacklo_epi64(d10d11, _mm_srli_si128(d4d5, 8));
+ q1p1 = _mm_unpacklo_epi64(d12d13, _mm_srli_si128(d2d3, 8));
+ q0p0 = _mm_unpacklo_epi64(d14d15, _mm_srli_si128(d0d1, 8));
+ q7 = _mm_srli_si128(d14d15, 8);
+
+ lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0,
+ &blimit, &limit, &thresh);
+
+ x0 = _mm_srli_si128(q0p0, 8);
+ x1 = _mm_srli_si128(q1p1, 8);
+ x2 = _mm_srli_si128(q2p2, 8);
+ x3 = _mm_srli_si128(q3p3, 8);
+ x4 = _mm_srli_si128(q4p4, 8);
+ x5 = _mm_srli_si128(q5p5, 8);
+ x6 = _mm_srli_si128(q6p6, 8);
+
+ transpose16x8_8x16_sse2(&d0d1, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1,
+ &q0p0, &x0, &x1, &x2, &x3, &x4, &x5, &x6, &q7, &p0p1,
+ &p2p3, &p4p5, &p6p7, &q0, &q1, &q2, &q3);
+
+ _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), p0p1);
+ _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), p2p3);
+ _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), p4p5);
+ _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), p6p7);
+ _mm_storeu_si128((__m128i *)(s - 8 + 4 * p), q0);
+ _mm_storeu_si128((__m128i *)(s - 8 + 5 * p), q1);
+ _mm_storeu_si128((__m128i *)(s - 8 + 6 * p), q2);
+ _mm_storeu_si128((__m128i *)(s - 8 + 7 * p), q3);
+}
+
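+// Returns total + a1 + a2 - s1 - s2 on 16-bit lanes; used to slide the
+// running filter sum one pixel along the row.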
+static INLINE __m128i filter_add2_sub2(const __m128i *const total,
+ const __m128i *const a1,
+ const __m128i *const a2,
+ const __m128i *const s1,
+ const __m128i *const s2) {
+ __m128i x = _mm_add_epi16(*a1, *total);
+ x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2);
+ return x;
+}
+
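+// Scales the 16-bit filter sums to pixel values (>> 3), packs to 8 bits and,
+// per pixel, selects that value where *flat is set, otherwise keeps
+// *other_filt.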
+static INLINE __m128i filter8_mask(const __m128i *const flat,
+ const __m128i *const other_filt,
+ const __m128i *const f8_lo,
+ const __m128i *const f8_hi) {
+ const __m128i f8 =
+ _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), _mm_srli_epi16(*f8_hi, 3));
+ const __m128i result = _mm_and_si128(*flat, f8);
+ return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
+}
+
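+// Same selection as filter8_mask, but the wide-filter sums are scaled by
+// >> 4 before packing.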
+static INLINE __m128i filter16_mask(const __m128i *const flat,
+ const __m128i *const other_filt,
+ const __m128i *const f_lo,
+ const __m128i *const f_hi) {
+ const __m128i f =
+ _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), _mm_srli_epi16(*f_hi, 4));
+ const __m128i result = _mm_and_si128(*flat, f);
+ return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
+}
+
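+// The _quad variants filter 16 pixels along the edge (four adjacent 4-pixel
+// filtering units) using a single set of blimit/limit/thresh values.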
+void aom_lpf_horizontal_14_quad_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0);
+ const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0);
+ const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0);
+ __m128i mask, hev, flat, flat2;
+ __m128i p6, p5;
+ __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
+ __m128i q6, q5;
+
+ __m128i op2, op1, op0, oq0, oq1, oq2;
+
+ __m128i max_abs_p1p0q1q0;
+
+ p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
+ p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
+ p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
+ p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+ p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+ q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+ q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+ q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+ q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
+ q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
+ q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
+
+ {
+ const __m128i abs_p1p0 = abs_diff(p1, p0);
+ const __m128i abs_q1q0 = abs_diff(q1, q0);
+ const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+ __m128i abs_p0q0 = abs_diff(p0, q0);
+ __m128i abs_p1q1 = abs_diff(p1, q1);
+ __m128i work;
+ max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit_v);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
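+  // Early exit if no pixel in this group of 16 needs filtering.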
+ if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
+
+ {
+ __m128i work;
+ work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
+ flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
+ work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
+ flat2 = _mm_max_epu8(work, flat2);
+ work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // filter4
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+ const __m128i ff = _mm_cmpeq_epi8(t4, t4);
+
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ op1 = _mm_xor_si128(p1, t80);
+ op0 = _mm_xor_si128(p0, t80);
+ oq0 = _mm_xor_si128(q0, t80);
+ oq1 = _mm_xor_si128(q1, t80);
+
+ hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+ filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
+
+ work_a = _mm_subs_epi8(oq0, op0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_and_si128(filt, mask);
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
+
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
+
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
+ oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // filter8
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
+ const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
+ const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
+ const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
+ const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
+ const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
+ const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
+ const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
+
+ const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
+ const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
+ const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
+ const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
+ const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
+ const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
+ const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
+ const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
+ __m128i f8_lo, f8_hi;
+
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
+ _mm_add_epi16(p3_lo, p2_lo));
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
+ _mm_add_epi16(p2_lo, p1_lo));
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
+
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
+ _mm_add_epi16(p3_hi, p2_hi));
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
+ _mm_add_epi16(p2_hi, p1_hi));
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
+
+ op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
+ op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
+ op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
+ oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
+ oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
+ oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // wide flat calculations
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
+ const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
+ const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
+ const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
+ const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
+ const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
+
+ const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
+ const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
+ const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
+ const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
+ const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
+ const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
+
+ __m128i f_lo;
+ __m128i f_hi;
+
+ f_lo = _mm_sub_epi16(_mm_slli_epi16(p6_lo, 3), p6_lo);
+ f_lo = _mm_add_epi16(_mm_slli_epi16(p5_lo, 1), f_lo);
+ f_lo = _mm_add_epi16(_mm_slli_epi16(p4_lo, 1), f_lo);
+ f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
+ _mm_add_epi16(p2_lo, p1_lo));
+ f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
+ f_lo = _mm_add_epi16(f_lo, eight);
+
+ f_hi = _mm_sub_epi16(_mm_slli_epi16(p6_hi, 3), p6_hi);
+ f_hi = _mm_add_epi16(_mm_slli_epi16(p5_hi, 1), f_hi);
+ f_hi = _mm_add_epi16(_mm_slli_epi16(p4_hi, 1), f_hi);
+ f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
+ _mm_add_epi16(p2_hi, p1_hi));
+ f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
+ f_hi = _mm_add_epi16(f_hi, eight);
+
+ p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
+
+ f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p3_lo, &p6_lo, &p6_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p3_hi, &p6_hi, &p6_hi);
+ p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
+
+ f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p2_lo, &p6_lo, &p5_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p2_hi, &p6_hi, &p5_hi);
+ p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
+
+ f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p1_lo, &p6_lo, &p4_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p1_hi, &p6_hi, &p4_hi);
+ op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
+
+ f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p0_lo, &p6_lo, &p3_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p0_hi, &p6_hi, &p3_hi);
+ op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+
+ f_lo = filter_add2_sub2(&f_lo, &q5_lo, &q0_lo, &p6_lo, &p2_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q5_hi, &q0_hi, &p6_hi, &p2_hi);
+ op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+
+ f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q1_lo, &p6_lo, &p1_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q1_hi, &p6_hi, &p1_hi);
+ oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+
+ f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q2_lo, &p5_lo, &p0_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q2_hi, &p5_hi, &p0_hi);
+ oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+
+ f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q3_lo, &p4_lo, &q0_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q3_hi, &p4_hi, &q0_hi);
+ oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
+
+ f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q4_lo, &p3_lo, &q1_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q4_hi, &p3_hi, &q1_hi);
+ q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
+
+ f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q5_lo, &p2_lo, &q2_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q5_hi, &p2_hi, &q2_hi);
+ q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
+
+ f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q6_lo, &p1_lo, &q3_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q6_hi, &p1_hi, &q3_hi);
+ q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
+ } else {
+ _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+ _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
+ }
+ } else {
+ _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+ }
+ }
+}
+
+void aom_lpf_horizontal_8_quad_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0);
+ const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0);
+ const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0);
+ __m128i mask, hev, flat;
+ __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+
+ __m128i op2, op1, op0, oq0, oq1, oq2;
+
+ __m128i max_abs_p1p0q1q0;
+
+ p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+ p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+ q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+ q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+ q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+
+ {
+ const __m128i abs_p1p0 = abs_diff(p1, p0);
+ const __m128i abs_q1q0 = abs_diff(q1, q0);
+ const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+ __m128i abs_p0q0 = abs_diff(p0, q0);
+ __m128i abs_p1q1 = abs_diff(p1, q1);
+ __m128i work;
+ max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit_v);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
+
+ {
+ __m128i work;
+ work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
+ flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
+ work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
+ flat = _mm_max_epu8(work, flat);
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // filter4
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+ const __m128i ff = _mm_cmpeq_epi8(t4, t4);
+
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ op1 = _mm_xor_si128(p1, t80);
+ op0 = _mm_xor_si128(p0, t80);
+ oq0 = _mm_xor_si128(q0, t80);
+ oq1 = _mm_xor_si128(q1, t80);
+
+ hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+ filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
+
+ work_a = _mm_subs_epi8(oq0, op0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_and_si128(filt, mask);
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
+
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
+
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
+ oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // filter8
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
+ const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
+ const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
+ const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
+ const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
+ const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
+ const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
+ const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
+
+ const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
+ const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
+ const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
+ const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
+ const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
+ const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
+ const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
+ const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
+ __m128i f8_lo, f8_hi;
+
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
+ _mm_add_epi16(p3_lo, p2_lo));
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
+ _mm_add_epi16(p2_lo, p1_lo));
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
+
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
+ _mm_add_epi16(p3_hi, p2_hi));
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
+ _mm_add_epi16(p2_hi, p1_hi));
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
+
+ op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
+ _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
+ op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
+ op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
+ oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
+ oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
+ oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
+ _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
+ } else {
+ _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+ }
+ }
+}
+
+void aom_lpf_horizontal_6_quad_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0);
+ const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0);
+ const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0);
+ __m128i mask, hev, flat;
+ __m128i p2, p1, p0, q0, q1, q2;
+
+ __m128i op1, op0, oq0, oq1;
+
+ __m128i max_abs_p1p0q1q0;
+
+ p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+ q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+ q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+
+ {
+ const __m128i abs_p1p0 = abs_diff(p1, p0);
+ const __m128i abs_q1q0 = abs_diff(q1, q0);
+ const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+ __m128i abs_p0q0 = abs_diff(p0, q0);
+ __m128i abs_p1q1 = abs_diff(p1, q1);
+ __m128i work;
+ max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(q2, q1));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit_v);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
+
+ {
+ __m128i work;
+ work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
+ flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // filter4
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+ const __m128i ff = _mm_cmpeq_epi8(t4, t4);
+
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ op1 = _mm_xor_si128(p1, t80);
+ op0 = _mm_xor_si128(p0, t80);
+ oq0 = _mm_xor_si128(q0, t80);
+ oq1 = _mm_xor_si128(q1, t80);
+
+ hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+ filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
+
+ work_a = _mm_subs_epi8(oq0, op0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_and_si128(filt, mask);
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
+
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
+
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
+ oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // filter6
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
+ const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
+ const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
+ const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
+ const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
+ const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
+
+ const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
+ const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
+ const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
+ const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
+ const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
+ const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
+ __m128i f8_lo, f8_hi;
+
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p2_lo, four),
+ _mm_add_epi16(p2_lo, p2_lo));
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p1_lo, f8_lo),
+ _mm_add_epi16(p1_lo, p0_lo));
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
+
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p2_hi, four),
+ _mm_add_epi16(p2_hi, p2_hi));
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p1_hi, f8_hi),
+ _mm_add_epi16(p1_hi, p0_hi));
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
+
+ op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q0_lo, &q1_lo, &p2_lo, &p2_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q0_hi, &q1_hi, &p2_hi, &p2_hi);
+ op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &q2_lo, &p1_lo, &p2_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &q2_hi, &p1_hi, &p2_hi);
+ oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &q2_lo, &p0_lo, &p1_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &q2_hi, &p0_hi, &p1_hi);
+ oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+ } else {
+ _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+ }
+ }
+}
+
+void aom_lpf_horizontal_4_quad_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0);
+ const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0);
+ const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0);
+ __m128i mask, hev;
+ __m128i p1, p0, q0, q1;
+
+ __m128i op1, op0, oq0, oq1;
+
+ __m128i max_abs_p1p0q1q0;
+
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+ q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+
+ {
+ const __m128i abs_p1p0 = abs_diff(p1, p0);
+ const __m128i abs_q1q0 = abs_diff(q1, q0);
+ const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+ __m128i abs_p0q0 = abs_diff(p0, q0);
+ __m128i abs_p1q1 = abs_diff(p1, q1);
+ max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ mask = _mm_subs_epu8(mask, limit_v);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // filter4
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+ const __m128i ff = _mm_cmpeq_epi8(t4, t4);
+
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ op1 = _mm_xor_si128(p1, t80);
+ op0 = _mm_xor_si128(p0, t80);
+ oq0 = _mm_xor_si128(q0, t80);
+ oq1 = _mm_xor_si128(q1, t80);
+
+ hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+ filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
+
+ work_a = _mm_subs_epi8(oq0, op0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_and_si128(filt, mask);
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
+
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
+
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
+ oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
+
+ _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+ }
+}
+
+void aom_lpf_vertical_14_quad_sse2(unsigned char *s, int pitch,
+ const uint8_t *_blimit0,
+ const uint8_t *_limit0,
+ const uint8_t *_thresh0) {
+ DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
+
+ // Transpose 16x16
+ transpose_16x8(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16);
+ transpose_16x8(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16);
+
+ // Loop filtering
+ aom_lpf_horizontal_14_quad(t_dst + 8 * 16, 16, _blimit0, _limit0, _thresh0);
+
+ // Transpose back
+ transpose_16x8(t_dst, t_dst + 8 * 16, 16, s - 8, pitch);
+ transpose_16x8(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch, pitch);
+}
+
+void aom_lpf_vertical_8_quad_sse2(uint8_t *s, int pitch,
+ const uint8_t *_blimit0,
+ const uint8_t *_limit0,
+ const uint8_t *_thresh0) {
+ DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
+
+ // Transpose 16x8
+ transpose_16x8(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
+
+ // Loop filtering
+ aom_lpf_horizontal_8_quad(t_dst + 4 * 16, 16, _blimit0, _limit0, _thresh0);
+
+ // Transpose back
+ transpose_16x8_to_8x16(t_dst, 16, s - 4, pitch);
+}
+
+void aom_lpf_vertical_6_quad_sse2(uint8_t *s, int pitch,
+ const uint8_t *_blimit0,
+ const uint8_t *_limit0,
+ const uint8_t *_thresh0) {
+ DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
+
+  // Transpose 16x8: (w x h) 8x16 to 16x8
+ transpose_16x8(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
+
+ // Loop filtering
+ aom_lpf_horizontal_6_quad(t_dst + 4 * 16, 16, _blimit0, _limit0, _thresh0);
+
+  // Transpose back: (w x h) 16x8 to 8x16
+ transpose_16x8_to_8x16(t_dst, 16, s - 4, pitch);
+}
+
+void aom_lpf_vertical_4_quad_sse2(uint8_t *s, int pitch,
+ const uint8_t *_blimit0,
+ const uint8_t *_limit0,
+ const uint8_t *_thresh0) {
+ DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
+
+ // Transpose 16x8
+ transpose_16x8(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
+
+ // Loop filtering
+ aom_lpf_horizontal_4_quad_sse2(t_dst + 4 * 16, 16, _blimit0, _limit0,
+ _thresh0);
+
+ // Transpose back
+ transpose_16x8_to_8x16(t_dst, 16, s - 4, pitch);
+}
diff --git a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
new file mode 100644
index 0000000000..45464e80b1
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
@@ -0,0 +1,721 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
+#define AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
+
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_config.h"
+
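+// Store the low / high 64 bits of an __m128i to an unaligned address.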
+#define mm_storelu(dst, v) memcpy((dst), (const char *)&(v), 8)
+#define mm_storehu(dst, v) memcpy((dst), (const char *)&(v) + 8, 8)
+
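+// High bit depth (16-bit pixel) functions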
+static INLINE void highbd_transpose6x6_sse2(__m128i *x0, __m128i *x1,
+ __m128i *x2, __m128i *x3,
+ __m128i *x4, __m128i *x5,
+ __m128i *d0, __m128i *d1,
+ __m128i *d2, __m128i *d3,
+ __m128i *d4, __m128i *d5) {
+ __m128i w0, w1, w2, w3, w4, w5, ww0;
+
+ // 00 01 02 03 04 05 xx xx
+ // 10 11 12 13 14 15 xx xx
+ // 20 21 22 23 24 25 xx xx
+ // 30 31 32 33 34 35 xx xx
+ // 40 41 42 43 44 45 xx xx
+ // 50 51 52 53 54 55 xx xx
+
+ w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13
+ w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33
+ w2 = _mm_unpacklo_epi16(*x4, *x5); // 40 50 41 51 42 52 43 53
+
+ ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31
+ *d0 = _mm_unpacklo_epi64(ww0, w2); // 00 10 20 30 40 50 41 51
+ *d1 = _mm_unpackhi_epi64(ww0,
+ _mm_srli_si128(w2, 4)); // 01 11 21 31 41 51 xx xx
+
+ ww0 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33
+ *d2 = _mm_unpacklo_epi64(ww0,
+ _mm_srli_si128(w2, 8)); // 02 12 22 32 42 52 xx xx
+
+ w3 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 xx xx xx xx
+ w4 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 xx xx xx xx
+ w5 = _mm_unpackhi_epi16(*x4, *x5); // 44 54 45 55 xx xx xx xx
+
+ *d3 = _mm_unpackhi_epi64(ww0, _mm_srli_si128(w2, 4)); // 03 13 23 33 43 53
+
+ ww0 = _mm_unpacklo_epi32(w3, w4); // 04 14 24 34 05 15 25 35
+ *d4 = _mm_unpacklo_epi64(ww0, w5); // 04 14 24 34 44 54 45 55
+ *d5 = _mm_unpackhi_epi64(ww0,
+ _mm_slli_si128(w5, 4)); // 05 15 25 35 45 55 xx xx
+}
+
+static INLINE void highbd_transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
+ __m128i *x2, __m128i *x3,
+ __m128i *d0, __m128i *d1,
+ __m128i *d2, __m128i *d3) {
+ __m128i zero = _mm_setzero_si128();
+ __m128i w0, w1, ww0, ww1;
+
+ w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13
+ w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33
+
+ ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31
+ ww1 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33
+
+ *d0 = _mm_unpacklo_epi64(ww0, zero); // 00 10 20 30 xx xx xx xx
+ *d1 = _mm_unpackhi_epi64(ww0, zero); // 01 11 21 31 xx xx xx xx
+ *d2 = _mm_unpacklo_epi64(ww1, zero); // 02 12 22 32 xx xx xx xx
+ *d3 = _mm_unpackhi_epi64(ww1, zero); // 03 13 23 33 xx xx xx xx
+}
+
+static INLINE void highbd_transpose4x8_8x4_high_sse2(__m128i *x0, __m128i *x1,
+ __m128i *x2, __m128i *x3,
+ __m128i *d4, __m128i *d5,
+ __m128i *d6, __m128i *d7) {
+ __m128i w0, w1, ww2, ww3;
+ __m128i zero = _mm_setzero_si128();
+
+ w0 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 06 16 07 17
+ w1 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 26 36 27 37
+
+ ww2 = _mm_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35
+ ww3 = _mm_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37
+
+ *d4 = _mm_unpacklo_epi64(ww2, zero); // 04 14 24 34 xx xx xx xx
+ *d5 = _mm_unpackhi_epi64(ww2, zero); // 05 15 25 35 xx xx xx xx
+ *d6 = _mm_unpacklo_epi64(ww3, zero); // 06 16 26 36 xx xx xx xx
+ *d7 = _mm_unpackhi_epi64(ww3, zero); // 07 17 27 37 xx xx xx xx
+}
+
+// Note: the input (x) and output (d) pointers must not alias; the inputs are
+// not copied internally, so in-place use would corrupt the result.
+static INLINE void highbd_transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1,
+ __m128i *x2, __m128i *x3,
+ __m128i *d0, __m128i *d1,
+ __m128i *d2, __m128i *d3,
+ __m128i *d4, __m128i *d5,
+ __m128i *d6, __m128i *d7) {
+ // input
+ // x0 00 01 02 03 04 05 06 07
+ // x1 10 11 12 13 14 15 16 17
+ // x2 20 21 22 23 24 25 26 27
+ // x3 30 31 32 33 34 35 36 37
+ // output
+ // 00 10 20 30 xx xx xx xx
+ // 01 11 21 31 xx xx xx xx
+ // 02 12 22 32 xx xx xx xx
+ // 03 13 23 33 xx xx xx xx
+ // 04 14 24 34 xx xx xx xx
+ // 05 15 25 35 xx xx xx xx
+ // 06 16 26 36 xx xx xx xx
+ // 07 17 27 37 xx xx xx xx
+ highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3);
+ highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7);
+}
+
+static INLINE void highbd_transpose8x8_low_sse2(__m128i *x0, __m128i *x1,
+ __m128i *x2, __m128i *x3,
+ __m128i *x4, __m128i *x5,
+ __m128i *x6, __m128i *x7,
+ __m128i *d0, __m128i *d1,
+ __m128i *d2, __m128i *d3) {
+ __m128i w0, w1, w2, w3, ww0, ww1;
+ // x0 00 01 02 03 04 05 06 07
+ // x1 10 11 12 13 14 15 16 17
+ // x2 20 21 22 23 24 25 26 27
+ // x3 30 31 32 33 34 35 36 37
+ // x4 40 41 42 43 44 45 46 47
+ // x5 50 51 52 53 54 55 56 57
+ // x6 60 61 62 63 64 65 66 67
+ // x7 70 71 72 73 74 75 76 77
+
+ w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13
+ w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33
+ w2 = _mm_unpacklo_epi16(*x4, *x5); // 40 50 41 51 42 52 43 53
+ w3 = _mm_unpacklo_epi16(*x6, *x7); // 60 70 61 71 62 72 63 73
+
+ ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31
+ ww1 = _mm_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71
+
+ *d0 = _mm_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70
+ *d1 = _mm_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71
+
+ ww0 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33
+ ww1 = _mm_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73
+
+ *d2 = _mm_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72
+ *d3 = _mm_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73
+}
+
+static INLINE void highbd_transpose8x8_high_sse2(__m128i *x0, __m128i *x1,
+ __m128i *x2, __m128i *x3,
+ __m128i *x4, __m128i *x5,
+ __m128i *x6, __m128i *x7,
+ __m128i *d4, __m128i *d5,
+ __m128i *d6, __m128i *d7) {
+ __m128i w0, w1, w2, w3, ww0, ww1;
+ // x0 00 01 02 03 04 05 06 07
+ // x1 10 11 12 13 14 15 16 17
+ // x2 20 21 22 23 24 25 26 27
+ // x3 30 31 32 33 34 35 36 37
+ // x4 40 41 42 43 44 45 46 47
+ // x5 50 51 52 53 54 55 56 57
+ // x6 60 61 62 63 64 65 66 67
+ // x7 70 71 72 73 74 75 76 77
+ w0 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 06 16 07 17
+ w1 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 26 36 27 37
+ w2 = _mm_unpackhi_epi16(*x4, *x5); // 44 54 45 55 46 56 47 57
+ w3 = _mm_unpackhi_epi16(*x6, *x7); // 64 74 65 75 66 76 67 77
+
+ ww0 = _mm_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35
+ ww1 = _mm_unpacklo_epi32(w2, w3); // 44 54 64 74 45 55 65 75
+
+ *d4 = _mm_unpacklo_epi64(ww0, ww1); // 04 14 24 34 44 54 64 74
+ *d5 = _mm_unpackhi_epi64(ww0, ww1); // 05 15 25 35 45 55 65 75
+
+ ww0 = _mm_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37
+ ww1 = _mm_unpackhi_epi32(w2, w3); // 46 56 66 76 47 57 67 77
+
+ *d6 = _mm_unpacklo_epi64(ww0, ww1); // 06 16 26 36 46 56 66 76
+ *d7 = _mm_unpackhi_epi64(ww0, ww1); // 07 17 27 37 47 57 67 77
+}
+
+// Note: the input (x) and output (d) pointers must not alias; the inputs are
+// not copied internally, so in-place use would corrupt the result.
+static INLINE void highbd_transpose8x8_sse2(
+ __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+ __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
+ __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
+ __m128i *d7) {
+ highbd_transpose8x8_low_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3);
+ highbd_transpose8x8_high_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d4, d5, d6, d7);
+}
+
+// Note: the input (x) and output (d) arrays must not alias; the inputs are
+// not copied internally, so in-place use would corrupt the result.
+static INLINE void highbd_transpose8x16_sse2(
+ __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+ __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
+ __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
+ __m128i *d7) {
+ highbd_transpose8x8_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3, d4,
+ d5, d6, d7);
+ highbd_transpose8x8_sse2(x0 + 1, x1 + 1, x2 + 1, x3 + 1, x4 + 1, x5 + 1,
+ x6 + 1, x7 + 1, d0 + 1, d1 + 1, d2 + 1, d3 + 1,
+ d4 + 1, d5 + 1, d6 + 1, d7 + 1);
+}
+
+// Low bit depth functions
+static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
+ __m128i *x2, __m128i *x3,
+ __m128i *d0, __m128i *d1,
+ __m128i *d2, __m128i *d3) {
+ // input
+ // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
+ // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
+ // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
+ // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
+ // output
+ // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+
+ __m128i w0, w1;
+
+ w0 = _mm_unpacklo_epi8(
+ *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+ *d0 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+
+ *d1 = _mm_srli_si128(*d0,
+ 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d2 = _mm_srli_si128(*d0,
+ 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d3 = _mm_srli_si128(*d0,
+ 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+}
+
+static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+ __m128i *x3, __m128i *d0, __m128i *d1,
+ __m128i *d2, __m128i *d3, __m128i *d4,
+ __m128i *d5, __m128i *d6,
+ __m128i *d7) {
+ // input
+ // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
+ // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
+ // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
+ // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
+ // output
+ // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
+
+ __m128i w0, w1, ww0, ww1;
+
+ w0 = _mm_unpacklo_epi8(
+ *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+ ww0 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ww1 = _mm_unpackhi_epi16(
+ w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+
+ *d0 = ww0; // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d1 = _mm_srli_si128(ww0,
+ 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d2 = _mm_srli_si128(ww0,
+ 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d3 = _mm_srli_si128(ww0,
+ 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+
+ *d4 = ww1; // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d5 = _mm_srli_si128(ww1,
+ 4); // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d6 = _mm_srli_si128(ww1,
+ 8); // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d7 = _mm_srli_si128(ww1,
+ 12); // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
+}
+
+static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+ __m128i *x3, __m128i *x4, __m128i *x5,
+ __m128i *x6, __m128i *x7, __m128i *d0,
+ __m128i *d1, __m128i *d2,
+ __m128i *d3) {
+ // input
+ // x0 00 01 02 03 04 05 06 07
+ // x1 10 11 12 13 14 15 16 17
+ // x2 20 21 22 23 24 25 26 27
+ // x3 30 31 32 33 34 35 36 37
+ // x4 40 41 42 43 44 45 46 47
+ // x5 50 51 52 53 54 55 56 57
+ // x6 60 61 62 63 64 65 66 67
+ // x7 70 71 72 73 74 75 76 77
+ // output
+  // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx
+ // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
+ // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
+ // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
+
+ __m128i w0, w1, w2, w3, w4, w5;
+
+ w0 = _mm_unpacklo_epi8(
+ *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+ w2 = _mm_unpacklo_epi8(
+ *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+
+ w3 = _mm_unpacklo_epi8(
+ *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+
+ w4 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ w5 = _mm_unpacklo_epi16(
+ w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+
+ *d0 = _mm_unpacklo_epi32(
+ w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ *d1 = _mm_srli_si128(*d0, 8);
+ *d2 = _mm_unpackhi_epi32(
+ w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ *d3 = _mm_srli_si128(*d2, 8);
+}
+
+static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+ __m128i *x3, __m128i *x4, __m128i *x5,
+ __m128i *x6, __m128i *x7, __m128i *d0d1,
+ __m128i *d2d3, __m128i *d4d5,
+ __m128i *d6d7) {
+ __m128i w0, w1, w2, w3, w4, w5, w6, w7;
+ // x0 00 01 02 03 04 05 06 07
+ // x1 10 11 12 13 14 15 16 17
+ w0 = _mm_unpacklo_epi8(
+ *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+
+ // x2 20 21 22 23 24 25 26 27
+ // x3 30 31 32 33 34 35 36 37
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+ // x4 40 41 42 43 44 45 46 47
+ // x5 50 51 52 53 54 55 56 57
+ w2 = _mm_unpacklo_epi8(
+ *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+
+ // x6 60 61 62 63 64 65 66 67
+ // x7 70 71 72 73 74 75 76 77
+ w3 = _mm_unpacklo_epi8(
+ *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+
+ w4 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ w5 = _mm_unpacklo_epi16(
+ w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+
+ *d0d1 = _mm_unpacklo_epi32(
+ w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ *d2d3 = _mm_unpackhi_epi32(
+ w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+
+ w6 = _mm_unpackhi_epi16(
+ w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ w7 = _mm_unpackhi_epi16(
+ w2, w3); // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+
+ *d4d5 = _mm_unpacklo_epi32(
+ w6, w7); // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ *d6d7 = _mm_unpackhi_epi32(
+ w6, w7); // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+}
+
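+// Transposes a 16x8 byte block (16 rows of 8 bytes in x0..x15) into 8 rows of
+// 16 bytes (d0..d7).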
+static INLINE void transpose16x8_8x16_sse2(
+ __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+ __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9,
+ __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14,
+ __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3,
+ __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) {
+ __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
+ __m128i w10, w11, w12, w13, w14, w15;
+
+ w0 = _mm_unpacklo_epi8(*x0, *x1);
+ w1 = _mm_unpacklo_epi8(*x2, *x3);
+ w2 = _mm_unpacklo_epi8(*x4, *x5);
+ w3 = _mm_unpacklo_epi8(*x6, *x7);
+
+ w8 = _mm_unpacklo_epi8(*x8, *x9);
+ w9 = _mm_unpacklo_epi8(*x10, *x11);
+ w10 = _mm_unpacklo_epi8(*x12, *x13);
+ w11 = _mm_unpacklo_epi8(*x14, *x15);
+
+ w4 = _mm_unpacklo_epi16(w0, w1);
+ w5 = _mm_unpacklo_epi16(w2, w3);
+ w12 = _mm_unpacklo_epi16(w8, w9);
+ w13 = _mm_unpacklo_epi16(w10, w11);
+
+ w6 = _mm_unpacklo_epi32(w4, w5);
+ w7 = _mm_unpackhi_epi32(w4, w5);
+ w14 = _mm_unpacklo_epi32(w12, w13);
+ w15 = _mm_unpackhi_epi32(w12, w13);
+
+ // Store first 4-line result
+ *d0 = _mm_unpacklo_epi64(w6, w14);
+ *d1 = _mm_unpackhi_epi64(w6, w14);
+ *d2 = _mm_unpacklo_epi64(w7, w15);
+ *d3 = _mm_unpackhi_epi64(w7, w15);
+
+ w4 = _mm_unpackhi_epi16(w0, w1);
+ w5 = _mm_unpackhi_epi16(w2, w3);
+ w12 = _mm_unpackhi_epi16(w8, w9);
+ w13 = _mm_unpackhi_epi16(w10, w11);
+
+ w6 = _mm_unpacklo_epi32(w4, w5);
+ w7 = _mm_unpackhi_epi32(w4, w5);
+ w14 = _mm_unpacklo_epi32(w12, w13);
+ w15 = _mm_unpackhi_epi32(w12, w13);
+
+ // Store second 4-line result
+ *d4 = _mm_unpacklo_epi64(w6, w14);
+ *d5 = _mm_unpackhi_epi64(w6, w14);
+ *d6 = _mm_unpacklo_epi64(w7, w15);
+ *d7 = _mm_unpackhi_epi64(w7, w15);
+}
+
+static INLINE void transpose8x16_16x8_sse2(
+ __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+ __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3,
+ __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11,
+ __m128i *d12d13, __m128i *d14d15) {
+ __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
+ __m128i w10, w11, w12, w13, w14, w15;
+
+ w0 = _mm_unpacklo_epi8(*x0, *x1);
+ w1 = _mm_unpacklo_epi8(*x2, *x3);
+ w2 = _mm_unpacklo_epi8(*x4, *x5);
+ w3 = _mm_unpacklo_epi8(*x6, *x7);
+
+ w8 = _mm_unpackhi_epi8(*x0, *x1);
+ w9 = _mm_unpackhi_epi8(*x2, *x3);
+ w10 = _mm_unpackhi_epi8(*x4, *x5);
+ w11 = _mm_unpackhi_epi8(*x6, *x7);
+
+ w4 = _mm_unpacklo_epi16(w0, w1);
+ w5 = _mm_unpacklo_epi16(w2, w3);
+ w12 = _mm_unpacklo_epi16(w8, w9);
+ w13 = _mm_unpacklo_epi16(w10, w11);
+
+ w6 = _mm_unpacklo_epi32(w4, w5);
+ w7 = _mm_unpackhi_epi32(w4, w5);
+ w14 = _mm_unpacklo_epi32(w12, w13);
+ w15 = _mm_unpackhi_epi32(w12, w13);
+
+ // Store first 4-line result
+ *d0d1 = _mm_unpacklo_epi64(w6, w14);
+ *d2d3 = _mm_unpackhi_epi64(w6, w14);
+ *d4d5 = _mm_unpacklo_epi64(w7, w15);
+ *d6d7 = _mm_unpackhi_epi64(w7, w15);
+
+ w4 = _mm_unpackhi_epi16(w0, w1);
+ w5 = _mm_unpackhi_epi16(w2, w3);
+ w12 = _mm_unpackhi_epi16(w8, w9);
+ w13 = _mm_unpackhi_epi16(w10, w11);
+
+ w6 = _mm_unpacklo_epi32(w4, w5);
+ w7 = _mm_unpackhi_epi32(w4, w5);
+ w14 = _mm_unpacklo_epi32(w12, w13);
+ w15 = _mm_unpackhi_epi32(w12, w13);
+
+ // Store second 4-line result
+ *d8d9 = _mm_unpacklo_epi64(w6, w14);
+ *d10d11 = _mm_unpackhi_epi64(w6, w14);
+ *d12d13 = _mm_unpacklo_epi64(w7, w15);
+ *d14d15 = _mm_unpackhi_epi64(w7, w15);
+}
+
+static INLINE void transpose_16x8(unsigned char *in0, unsigned char *in1,
+ int in_p, unsigned char *out, int out_p) {
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128i x8, x9, x10, x11, x12, x13, x14, x15;
+
+ x0 = _mm_loadl_epi64((__m128i *)in0);
+ x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));
+ x0 = _mm_unpacklo_epi8(x0, x1);
+
+ x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));
+ x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p));
+ x1 = _mm_unpacklo_epi8(x2, x3);
+
+ x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p));
+ x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p));
+ x2 = _mm_unpacklo_epi8(x4, x5);
+
+ x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p));
+ x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p));
+ x3 = _mm_unpacklo_epi8(x6, x7);
+ x4 = _mm_unpacklo_epi16(x0, x1);
+
+ x8 = _mm_loadl_epi64((__m128i *)in1);
+ x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));
+ x8 = _mm_unpacklo_epi8(x8, x9);
+ x5 = _mm_unpacklo_epi16(x2, x3);
+
+ x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));
+ x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p));
+ x9 = _mm_unpacklo_epi8(x10, x11);
+
+ x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p));
+ x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p));
+ x10 = _mm_unpacklo_epi8(x12, x13);
+ x12 = _mm_unpacklo_epi16(x8, x9);
+
+ x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p));
+ x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p));
+ x11 = _mm_unpacklo_epi8(x14, x15);
+ x13 = _mm_unpacklo_epi16(x10, x11);
+
+ x6 = _mm_unpacklo_epi32(x4, x5);
+ x7 = _mm_unpackhi_epi32(x4, x5);
+ x14 = _mm_unpacklo_epi32(x12, x13);
+ x15 = _mm_unpackhi_epi32(x12, x13);
+
+ // Store first 4-line result
+ _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
+ _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
+ _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
+ _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));
+
+ x4 = _mm_unpackhi_epi16(x0, x1);
+ x5 = _mm_unpackhi_epi16(x2, x3);
+ x12 = _mm_unpackhi_epi16(x8, x9);
+ x13 = _mm_unpackhi_epi16(x10, x11);
+
+ x6 = _mm_unpacklo_epi32(x4, x5);
+ x7 = _mm_unpackhi_epi32(x4, x5);
+ x14 = _mm_unpacklo_epi32(x12, x13);
+ x15 = _mm_unpackhi_epi32(x12, x13);
+
+ // Store second 4-line result
+ _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
+ _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
+ _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
+ _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
+}
+
+static INLINE void transpose_16x8_to_8x16(unsigned char *src, int in_p,
+ unsigned char *dst, int out_p) {
+ // a0 b0 c0 d0 e0 f0 g0 h0 A0 B0 C0 D0 E0 F0 G0 H0
+ // a1 b1 c1 d1 e1 f1 g1 h1 A1 B1 C1 D1 E1 F1 G1 H1
+ // a2 b2 c2 d2 e2 f2 g2 h2 A2 B2 C2 D2 E2 F2 G2 H2
+ // a3 b3 c3 d3 e3 f3 g3 h3 A3 B3 C3 D3 E3 F3 G3 H3
+ // a4 b4 c4 d4 e4 f4 g4 h4 A4 B4 C4 D4 E4 F4 G4 H4
+ // a5 b5 c5 d5 e5 f5 g5 h5 A5 B5 C5 D5 E5 F5 G5 H5
+ // a6 b6 c6 d6 e6 f6 g6 h6 A6 B6 C6 D6 E6 F6 G6 H6
+ // a7 b7 c7 d7 e7 f7 g7 h7 A7 B7 C7 D7 E7 F7 G7 H7
+ const __m128i x0 = _mm_loadu_si128((__m128i *)(src));
+ const __m128i x1 = _mm_loadu_si128((__m128i *)(src + (1 * in_p)));
+ const __m128i x2 = _mm_loadu_si128((__m128i *)(src + (2 * in_p)));
+ const __m128i x3 = _mm_loadu_si128((__m128i *)(src + (3 * in_p)));
+ const __m128i x4 = _mm_loadu_si128((__m128i *)(src + (4 * in_p)));
+ const __m128i x5 = _mm_loadu_si128((__m128i *)(src + (5 * in_p)));
+ const __m128i x6 = _mm_loadu_si128((__m128i *)(src + (6 * in_p)));
+ const __m128i x7 = _mm_loadu_si128((__m128i *)(src + (7 * in_p)));
+
+ // a0 a1 b0 b1 c0 c1 d0 d1 A0 A1 B0 B1 C0 C1 D0 D1
+ // e0 e1 f0 f1 g0 g1 h0 h1 E0 E1 F0 F1 G0 G1 H0 H1
+ // a2 a3 b2 b3 c2 c3 d2 d3 A2 A3 B2 B3 C2 C3 D2 D3
+ // e2 e3 f2 f3 g2 g3 h2 h3 E2 E3 F2 F3 G2 G3 H2 H3
+ // a4 a5 b4 b5 c4 c5 d4 d5 A4 A5 B4 B5 C4 C5 D4 D5
+ // e4 e5 f4 f5 g4 g5 h4 h5 E4 E5 F4 F5 G4 G5 H4 H5
+ // a6 a7 b6 b7 c6 c7 d6 d7 A6 A7 B6 B7 C6 C7 D6 D7
+ // e6 e7 f6 f7 g6 g7 h6 h7 E6 E7 F6 F7 G6 G7 H6 H7
+ const __m128i x_s10 = _mm_unpacklo_epi8(x0, x1);
+ const __m128i x_s11 = _mm_unpackhi_epi8(x0, x1);
+ const __m128i x_s12 = _mm_unpacklo_epi8(x2, x3);
+ const __m128i x_s13 = _mm_unpackhi_epi8(x2, x3);
+ const __m128i x_s14 = _mm_unpacklo_epi8(x4, x5);
+ const __m128i x_s15 = _mm_unpackhi_epi8(x4, x5);
+ const __m128i x_s16 = _mm_unpacklo_epi8(x6, x7);
+ const __m128i x_s17 = _mm_unpackhi_epi8(x6, x7);
+
+ // a0 a1 a2 a3 b0 b1 b2 b3 | A0 A1 A2 A3 B0 B1 B2 B3
+ // c0 c1 c2 c3 d0 d1 d2 d3 | C0 C1 C2 C3 D0 D1 D2 D3
+ // e0 e1 e2 e3 f0 f1 f2 f3 | E0 E1 E2 E3 F0 F1 F2 F3
+ // g0 g1 g2 g3 h0 h1 h2 h3 | G0 G1 G2 G3 H0 H1 H2 H3
+ // a4 a5 a6 a7 b4 b5 b6 b7 | A4 A5 A6 A7 B4 B5 B6 B7
+ // c4 c5 c6 c7 d4 d5 d6 d7 | C4 C5 C6 C7 D4 D5 D6 D7
+ // e4 e5 e6 e7 f4 f5 f6 f7 | E4 E5 E6 E7 F4 F5 F6 F7
+ // g4 g5 g6 g7 h4 h5 h6 h7 | G4 G5 G6 G7 H4 H5 H6 H7
+ const __m128i x_s20 = _mm_unpacklo_epi16(x_s10, x_s12);
+ const __m128i x_s21 = _mm_unpackhi_epi16(x_s10, x_s12);
+ const __m128i x_s22 = _mm_unpacklo_epi16(x_s11, x_s13);
+ const __m128i x_s23 = _mm_unpackhi_epi16(x_s11, x_s13);
+ const __m128i x_s24 = _mm_unpacklo_epi16(x_s14, x_s16);
+ const __m128i x_s25 = _mm_unpackhi_epi16(x_s14, x_s16);
+ const __m128i x_s26 = _mm_unpacklo_epi16(x_s15, x_s17);
+ const __m128i x_s27 = _mm_unpackhi_epi16(x_s15, x_s17);
+
+ // a0 a1 a2 a3 a4 a5 a6 a7 | A0 A1 A2 A3 A4 A5 A6 A7
+ // b0 b1 b2 b3 b4 b5 b6 b7 | B0 B1 B2 B3 B4 B5 B6 B7
+ // c0 c1 c2 c3 c4 c5 c6 c7 | C0 C1 C2 C3 C4 C5 C6 C7
+ // d0 d1 d2 d3 d4 d5 d6 d7 | D0 D1 D2 D3 D4 D5 D6 D7
+ // e0 e1 e2 e3 e4 e5 e6 e7 | E0 E1 E2 E3 E4 E5 E6 E7
+ // f0 f1 f2 f3 f4 f5 f6 f7 | F0 F1 F2 F3 F4 F5 F6 F7
+ // g0 g1 g2 g3 g4 g5 g6 g7 | G0 G1 G2 G3 G4 G5 G6 G7
+ // h0 h1 h2 h3 h4 h5 h6 h7 | H0 H1 H2 H3 H4 H5 H6 H7
+ const __m128i x_s30 = _mm_unpacklo_epi32(x_s20, x_s24);
+ const __m128i x_s31 = _mm_unpackhi_epi32(x_s20, x_s24);
+ const __m128i x_s32 = _mm_unpacklo_epi32(x_s21, x_s25);
+ const __m128i x_s33 = _mm_unpackhi_epi32(x_s21, x_s25);
+ const __m128i x_s34 = _mm_unpacklo_epi32(x_s22, x_s26);
+ const __m128i x_s35 = _mm_unpackhi_epi32(x_s22, x_s26);
+ const __m128i x_s36 = _mm_unpacklo_epi32(x_s23, x_s27);
+ const __m128i x_s37 = _mm_unpackhi_epi32(x_s23, x_s27);
+
+ mm_storelu(dst, x_s30);
+ mm_storehu(dst + (1 * out_p), x_s30);
+ mm_storelu(dst + (2 * out_p), x_s31);
+ mm_storehu(dst + (3 * out_p), x_s31);
+ mm_storelu(dst + (4 * out_p), x_s32);
+ mm_storehu(dst + (5 * out_p), x_s32);
+ mm_storelu(dst + (6 * out_p), x_s33);
+ mm_storehu(dst + (7 * out_p), x_s33);
+ mm_storelu(dst + (8 * out_p), x_s34);
+ mm_storehu(dst + (9 * out_p), x_s34);
+ mm_storelu(dst + (10 * out_p), x_s35);
+ mm_storehu(dst + (11 * out_p), x_s35);
+ mm_storelu(dst + (12 * out_p), x_s36);
+ mm_storehu(dst + (13 * out_p), x_s36);
+ mm_storelu(dst + (14 * out_p), x_s37);
+ mm_storehu(dst + (15 * out_p), x_s37);
+}
+
+static INLINE void transpose_8xn(unsigned char *src[], int in_p,
+ unsigned char *dst[], int out_p,
+ int num_8x8_to_transpose) {
+ int idx8x8 = 0;
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ do {
+ unsigned char *in = src[idx8x8];
+ unsigned char *out = dst[idx8x8];
+
+ x0 =
+ _mm_loadl_epi64((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07
+ x1 =
+ _mm_loadl_epi64((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ x0 = _mm_unpacklo_epi8(x0, x1);
+
+ x2 =
+ _mm_loadl_epi64((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27
+ x3 =
+ _mm_loadl_epi64((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ x1 = _mm_unpacklo_epi8(x2, x3);
+
+ x4 =
+ _mm_loadl_epi64((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47
+ x5 =
+ _mm_loadl_epi64((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57
+ // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ x2 = _mm_unpacklo_epi8(x4, x5);
+
+ x6 =
+ _mm_loadl_epi64((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67
+ x7 =
+ _mm_loadl_epi64((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77
+ // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ x3 = _mm_unpacklo_epi8(x6, x7);
+
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ x4 = _mm_unpacklo_epi16(x0, x1);
+ // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+ x5 = _mm_unpacklo_epi16(x2, x3);
+ // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ x6 = _mm_unpacklo_epi32(x4, x5);
+ mm_storelu(out + 0 * out_p, x6); // 00 10 20 30 40 50 60 70
+ mm_storehu(out + 1 * out_p, x6); // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ x7 = _mm_unpackhi_epi32(x4, x5);
+ mm_storelu(out + 2 * out_p, x7); // 02 12 22 32 42 52 62 72
+ mm_storehu(out + 3 * out_p, x7); // 03 13 23 33 43 53 63 73
+
+ // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ x4 = _mm_unpackhi_epi16(x0, x1);
+ // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+ x5 = _mm_unpackhi_epi16(x2, x3);
+ // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ x6 = _mm_unpacklo_epi32(x4, x5);
+ mm_storelu(out + 4 * out_p, x6); // 04 14 24 34 44 54 64 74
+ mm_storehu(out + 5 * out_p, x6); // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ x7 = _mm_unpackhi_epi32(x4, x5);
+
+ mm_storelu(out + 6 * out_p, x7); // 06 16 26 36 46 56 66 76
+ mm_storehu(out + 7 * out_p, x7); // 07 17 27 37 47 57 67 77
+ } while (++idx8x8 < num_8x8_to_transpose);
+}
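+
+// A minimal usage sketch for transpose_8xn() (hypothetical buffers and
+// strides; the real call sites are in the loop-filter code). Two 8x8 blocks
+// sitting side by side in 'buf' are transposed independently, which together
+// yields the transpose of the whole 8x16 region:
+//   unsigned char *src[2] = { buf, buf + 8 };
+//   unsigned char *dst[2] = { tmp, tmp + 8 * tmp_stride };
+//   transpose_8xn(src, buf_stride, dst, tmp_stride, 2);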
+
+#endif // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c
new file mode 100644
index 0000000000..799ce9ef44
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#include "aom_dsp/x86/masked_sad_intrin_ssse3.h"
+
+#define MASK_SAD16XH_ONE_REF(idx) \
+ a = _mm_loadu_si128((const __m128i *)&ref##idx[x]); \
+ data_l = _mm_unpacklo_epi8(a, b); \
+ mask_l = _mm_unpacklo_epi8(m, m_inv); \
+ pred_l = _mm_maddubs_epi16(data_l, mask_l); \
+ pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); \
+ \
+ data_r = _mm_unpackhi_epi8(a, b); \
+ mask_r = _mm_unpackhi_epi8(m, m_inv); \
+ pred_r = _mm_maddubs_epi16(data_r, mask_r); \
+ pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); \
+ \
+ pred = _mm_packus_epi16(pred_l, pred_r); \
+ res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src));
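+
+// For reference, a scalar sketch of the blend-and-SAD step that
+// MASK_SAD16XH_ONE_REF performs for each pixel x of one row (illustrative
+// only; a = ref##idx[x], and mask_max == 1 << AOM_BLEND_A64_ROUND_BITS):
+//   pred = ROUND_POWER_OF_TWO(m * a + (mask_max - m) * b,
+//                             AOM_BLEND_A64_ROUND_BITS);
+//   sad += abs(pred - src[x]);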
+
+static INLINE void masked_sadx4d_ssse3(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr[4], int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int width, int height, int inv_mask,
+ unsigned sad_array[4]) {
+ int x, y;
+ __m128i a;
+ __m128i data_l, data_r, mask_l, mask_r, pred_l, pred_r, pred;
+ const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+ __m128i res0 = _mm_setzero_si128();
+ __m128i res1 = _mm_setzero_si128();
+ __m128i res2 = _mm_setzero_si128();
+ __m128i res3 = _mm_setzero_si128();
+ const uint8_t *ref0 = a_ptr[0];
+ const uint8_t *ref1 = a_ptr[1];
+ const uint8_t *ref2 = a_ptr[2];
+ const uint8_t *ref3 = a_ptr[3];
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x += 16) {
+ const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
+ const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
+ const __m128i m_copy = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
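+      // Computing the inverse mask and conditionally swapping m and m_inv is
+      // equivalent to swapping the two blend sources, so a single code path
+      // handles both values of inv_mask.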
+ __m128i m_inv = _mm_sub_epi8(mask_max, m_copy);
+ __m128i m = inv_mask ? m_inv : m_copy;
+ m_inv = inv_mask ? m_copy : m_inv;
+
+ MASK_SAD16XH_ONE_REF(0)
+ MASK_SAD16XH_ONE_REF(1)
+ MASK_SAD16XH_ONE_REF(2)
+ MASK_SAD16XH_ONE_REF(3)
+ }
+
+ src_ptr += src_stride;
+ ref0 += a_stride;
+ ref1 += a_stride;
+ ref2 += a_stride;
+ ref3 += a_stride;
+ b_ptr += b_stride;
+ m_ptr += m_stride;
+ }
+ res0 = _mm_add_epi32(_mm_unpacklo_epi32(res0, res1),
+ _mm_unpackhi_epi32(res0, res1));
+ res2 = _mm_add_epi32(_mm_unpacklo_epi32(res2, res3),
+ _mm_unpackhi_epi32(res2, res3));
+
+ res0 = _mm_unpacklo_epi64(res0, res2);
+ _mm_storeu_si128((__m128i *)sad_array, res0);
+}
+
+#define MASK_SAD8XH_ONE_REF(idx) \
+ const __m128i a##idx##0 = _mm_loadl_epi64((__m128i *)ref##idx); \
+ const __m128i a##idx##1 = _mm_loadl_epi64((__m128i *)(ref##idx + a_stride)); \
+ data_l = _mm_unpacklo_epi8(a##idx##0, b0); \
+ mask_l = _mm_unpacklo_epi8(m, m_inv); \
+ pred_l = _mm_maddubs_epi16(data_l, mask_l); \
+ pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); \
+ \
+ data_r = _mm_unpacklo_epi8(a##idx##1, b1); \
+ mask_r = _mm_unpackhi_epi8(m, m_inv); \
+ pred_r = _mm_maddubs_epi16(data_r, mask_r); \
+ pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); \
+ \
+ pred = _mm_packus_epi16(pred_l, pred_r); \
+ res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src));
+
+void aom_masked_sad8xhx4d_ssse3(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_array[4], int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride, int height,
+ int inv_mask, unsigned sad_array[4]) {
+ const uint8_t *ref0 = ref_array[0];
+ const uint8_t *ref1 = ref_array[1];
+ const uint8_t *ref2 = ref_array[2];
+ const uint8_t *ref3 = ref_array[3];
+ __m128i data_l, data_r, pred_l, pred_r, mask_l, mask_r, pred;
+ __m128i res0 = _mm_setzero_si128();
+ __m128i res1 = _mm_setzero_si128();
+ __m128i res2 = _mm_setzero_si128();
+ __m128i res3 = _mm_setzero_si128();
+ const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+
+ for (int y = 0; y < height; y += 2) {
+ const __m128i src = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)src_ptr),
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride)));
+ const __m128i b0 = _mm_loadl_epi64((__m128i *)b_ptr);
+ const __m128i b1 = _mm_loadl_epi64((__m128i *)(b_ptr + b_stride));
+ const __m128i m0 = _mm_loadl_epi64((__m128i *)m_ptr);
+ const __m128i m1 = _mm_loadl_epi64((__m128i *)(m_ptr + m_stride));
+ __m128i m_copy = _mm_unpacklo_epi64(m0, m1);
+ __m128i m_inv = _mm_sub_epi8(mask_max, m_copy);
+ __m128i m = inv_mask ? m_inv : m_copy;
+ m_inv = inv_mask ? m_copy : m_inv;
+
+ MASK_SAD8XH_ONE_REF(0)
+ MASK_SAD8XH_ONE_REF(1)
+ MASK_SAD8XH_ONE_REF(2)
+ MASK_SAD8XH_ONE_REF(3)
+
+ ref0 += 2 * a_stride;
+ ref1 += 2 * a_stride;
+ ref2 += 2 * a_stride;
+ ref3 += 2 * a_stride;
+ src_ptr += 2 * src_stride;
+ b_ptr += 2 * b_stride;
+ m_ptr += 2 * m_stride;
+ }
+ res0 = _mm_add_epi32(_mm_unpacklo_epi32(res0, res1),
+ _mm_unpackhi_epi32(res0, res1));
+ res2 = _mm_add_epi32(_mm_unpacklo_epi32(res2, res3),
+ _mm_unpackhi_epi32(res2, res3));
+ res0 = _mm_unpacklo_epi64(res0, res2);
+ _mm_storeu_si128((__m128i *)sad_array, res0);
+}
+
+#define MASK_SAD4XH_ONE_REF(idx) \
+ a = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)ref##idx), \
+ _mm_cvtsi32_si128(*(int *)&ref##idx[a_stride])); \
+ data = _mm_unpacklo_epi8(a, b); \
+ mask = _mm_unpacklo_epi8(m, m_inv); \
+ pred = _mm_maddubs_epi16(data, mask); \
+ pred = xx_roundn_epu16(pred, AOM_BLEND_A64_ROUND_BITS); \
+ \
+ pred = _mm_packus_epi16(pred, _mm_setzero_si128()); \
+ res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src));
+
+void aom_masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_array[4], int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride, int height,
+ int inv_mask, unsigned sad_array[4]) {
+ const uint8_t *ref0 = ref_array[0];
+ const uint8_t *ref1 = ref_array[1];
+ const uint8_t *ref2 = ref_array[2];
+ const uint8_t *ref3 = ref_array[3];
+ __m128i data, pred, mask;
+ __m128i res0 = _mm_setzero_si128();
+ __m128i res1 = _mm_setzero_si128();
+ __m128i res2 = _mm_setzero_si128();
+ __m128i res3 = _mm_setzero_si128();
+ __m128i a;
+ const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+
+ for (int y = 0; y < height; y += 2) {
+ const __m128i src =
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)src_ptr),
+ _mm_cvtsi32_si128(*(int *)&src_ptr[src_stride]));
+ const __m128i b =
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)b_ptr),
+ _mm_cvtsi32_si128(*(int *)&b_ptr[b_stride]));
+ const __m128i m_copy =
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)m_ptr),
+ _mm_cvtsi32_si128(*(int *)&m_ptr[m_stride]));
+
+ __m128i m_inv = _mm_sub_epi8(mask_max, m_copy);
+ __m128i m = inv_mask ? m_inv : m_copy;
+ m_inv = inv_mask ? m_copy : m_inv;
+
+ MASK_SAD4XH_ONE_REF(0)
+ MASK_SAD4XH_ONE_REF(1)
+ MASK_SAD4XH_ONE_REF(2)
+ MASK_SAD4XH_ONE_REF(3)
+
+ ref0 += 2 * a_stride;
+ ref1 += 2 * a_stride;
+ ref2 += 2 * a_stride;
+ ref3 += 2 * a_stride;
+ src_ptr += 2 * src_stride;
+ b_ptr += 2 * b_stride;
+ m_ptr += 2 * m_stride;
+ }
+ res0 = _mm_unpacklo_epi32(res0, res1);
+ res2 = _mm_unpacklo_epi32(res2, res3);
+ res0 = _mm_unpacklo_epi64(res0, res2);
+ _mm_storeu_si128((__m128i *)sad_array, res0);
+}
+
+#define MASKSADMXN_SSSE3(m, n) \
+ void aom_masked_sad##m##x##n##x4d_ssse3( \
+ const uint8_t *src, int src_stride, const uint8_t *ref[4], \
+ int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \
+ int msk_stride, int inv_mask, unsigned sad_array[4]) { \
+ masked_sadx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, m, msk, \
+ msk_stride, m, n, inv_mask, sad_array); \
+ }
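+
+// Example call (hypothetical buffers) for one of the functions the
+// MASKSADMXN_SSSE3 macro expands to: four masked SADs, one per candidate
+// reference in 'refs', are written to 'sads':
+//   unsigned int sads[4];
+//   aom_masked_sad16x16x4d_ssse3(src, src_stride, refs, ref_stride,
+//                                second_pred, msk, msk_stride,
+//                                /*inv_mask=*/0, sads);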
+
+#define MASKSAD8XN_SSSE3(n) \
+ void aom_masked_sad8x##n##x4d_ssse3( \
+ const uint8_t *src, int src_stride, const uint8_t *ref[4], \
+ int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \
+ int msk_stride, int inv_mask, unsigned sad_array[4]) { \
+ aom_masked_sad8xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, \
+ 8, msk, msk_stride, n, inv_mask, sad_array); \
+ }
+
+#define MASKSAD4XN_SSSE3(n) \
+ void aom_masked_sad4x##n##x4d_ssse3( \
+ const uint8_t *src, int src_stride, const uint8_t *ref[4], \
+ int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \
+ int msk_stride, int inv_mask, unsigned sad_array[4]) { \
+ aom_masked_sad4xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, \
+ 4, msk, msk_stride, n, inv_mask, sad_array); \
+ }
+
+MASKSADMXN_SSSE3(128, 128)
+MASKSADMXN_SSSE3(128, 64)
+MASKSADMXN_SSSE3(64, 128)
+MASKSADMXN_SSSE3(64, 64)
+MASKSADMXN_SSSE3(64, 32)
+MASKSADMXN_SSSE3(32, 64)
+MASKSADMXN_SSSE3(32, 32)
+MASKSADMXN_SSSE3(32, 16)
+MASKSADMXN_SSSE3(16, 32)
+MASKSADMXN_SSSE3(16, 16)
+MASKSADMXN_SSSE3(16, 8)
+MASKSAD8XN_SSSE3(16)
+MASKSAD8XN_SSSE3(8)
+MASKSAD8XN_SSSE3(4)
+MASKSAD4XN_SSSE3(8)
+MASKSAD4XN_SSSE3(4)
+MASKSAD4XN_SSSE3(16)
+MASKSADMXN_SSSE3(16, 4)
+MASKSAD8XN_SSSE3(32)
+MASKSADMXN_SSSE3(32, 8)
+MASKSADMXN_SSSE3(16, 64)
+MASKSADMXN_SSSE3(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c
new file mode 100644
index 0000000000..2c022555b5
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c
@@ -0,0 +1,389 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/masked_sad_intrin_ssse3.h"
+
+static INLINE unsigned int masked_sad32xh_avx2(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
+ int width, int height) {
+ int x, y;
+ __m256i res = _mm256_setzero_si256();
+ const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m256i round_scale =
+ _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
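+  // Note: multiplying by round_scale with _mm256_mulhrs_epi16() is, for the
+  // non-negative 16-bit values produced here, equivalent to the rounding
+  // shift (x + (1 << (AOM_BLEND_A64_ROUND_BITS - 1))) >>
+  // AOM_BLEND_A64_ROUND_BITS, i.e. the same rounding that xx_roundn_epu16()
+  // performs in the SSSE3 version.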
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x += 32) {
+ const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]);
+ const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]);
+ const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]);
+ const __m256i m = _mm256_lddqu_si256((const __m256i *)&m_ptr[x]);
+ const __m256i m_inv = _mm256_sub_epi8(mask_max, m);
+
+      // Calculate 32 predicted pixels.
+ // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
+ // is 64 * 255, so we have plenty of space to add rounding constants.
+ const __m256i data_l = _mm256_unpacklo_epi8(a, b);
+ const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv);
+ __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l);
+ pred_l = _mm256_mulhrs_epi16(pred_l, round_scale);
+
+ const __m256i data_r = _mm256_unpackhi_epi8(a, b);
+ const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv);
+ __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r);
+ pred_r = _mm256_mulhrs_epi16(pred_r, round_scale);
+
+ const __m256i pred = _mm256_packus_epi16(pred_l, pred_r);
+ res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src));
+ }
+
+ src_ptr += src_stride;
+ a_ptr += a_stride;
+ b_ptr += b_stride;
+ m_ptr += m_stride;
+ }
+  // At this point, we have four 32-bit partial SADs in lanes 0, 2, 4 and 6 of
+  // 'res'.
+ res = _mm256_shuffle_epi32(res, 0xd8);
+ res = _mm256_permute4x64_epi64(res, 0xd8);
+ res = _mm256_hadd_epi32(res, res);
+ res = _mm256_hadd_epi32(res, res);
+ int32_t sad = _mm256_extract_epi32(res, 0);
+ return sad;
+}
+
+static INLINE __m256i xx_loadu2_m128i(const void *hi, const void *lo) {
+ __m128i a0 = _mm_lddqu_si128((const __m128i *)(lo));
+ __m128i a1 = _mm_lddqu_si128((const __m128i *)(hi));
+ __m256i a = _mm256_castsi128_si256(a0);
+ return _mm256_inserti128_si256(a, a1, 1);
+}
+
+static INLINE unsigned int masked_sad16xh_avx2(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
+ int height) {
+ int y;
+ __m256i res = _mm256_setzero_si256();
+ const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m256i round_scale =
+ _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ for (y = 0; y < height; y += 2) {
+ const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr);
+ const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr);
+ const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr);
+ const __m256i m = xx_loadu2_m128i(m_ptr + m_stride, m_ptr);
+ const __m256i m_inv = _mm256_sub_epi8(mask_max, m);
+
+    // Calculate 32 predicted pixels (16 from each of the two rows).
+ // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
+ // is 64 * 255, so we have plenty of space to add rounding constants.
+ const __m256i data_l = _mm256_unpacklo_epi8(a, b);
+ const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv);
+ __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l);
+ pred_l = _mm256_mulhrs_epi16(pred_l, round_scale);
+
+ const __m256i data_r = _mm256_unpackhi_epi8(a, b);
+ const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv);
+ __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r);
+ pred_r = _mm256_mulhrs_epi16(pred_r, round_scale);
+
+ const __m256i pred = _mm256_packus_epi16(pred_l, pred_r);
+ res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src));
+
+ src_ptr += src_stride << 1;
+ a_ptr += a_stride << 1;
+ b_ptr += b_stride << 1;
+ m_ptr += m_stride << 1;
+ }
+  // At this point, we have four 32-bit partial SADs in lanes 0, 2, 4 and 6 of
+  // 'res'.
+ res = _mm256_shuffle_epi32(res, 0xd8);
+ res = _mm256_permute4x64_epi64(res, 0xd8);
+ res = _mm256_hadd_epi32(res, res);
+ res = _mm256_hadd_epi32(res, res);
+ int32_t sad = _mm256_extract_epi32(res, 0);
+ return sad;
+}
+
+static INLINE unsigned int aom_masked_sad_avx2(
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride,
+ int invert_mask, int m, int n) {
+ unsigned int sad;
+ if (!invert_mask) {
+ switch (m) {
+ case 4:
+ sad = aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride,
+ second_pred, m, msk, msk_stride, n);
+ break;
+ case 8:
+ sad = aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride,
+ second_pred, m, msk, msk_stride, n);
+ break;
+ case 16:
+ sad = masked_sad16xh_avx2(src, src_stride, ref, ref_stride, second_pred,
+ m, msk, msk_stride, n);
+ break;
+ default:
+ sad = masked_sad32xh_avx2(src, src_stride, ref, ref_stride, second_pred,
+ m, msk, msk_stride, m, n);
+ break;
+ }
+ } else {
+ switch (m) {
+ case 4:
+ sad = aom_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, n);
+ break;
+ case 8:
+ sad = aom_masked_sad8xh_ssse3(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, n);
+ break;
+ case 16:
+ sad = masked_sad16xh_avx2(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, n);
+ break;
+ default:
+ sad = masked_sad32xh_avx2(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, m, n);
+ break;
+ }
+ }
+ return sad;
+}
+
+#define MASKSADMXN_AVX2(m, n) \
+ unsigned int aom_masked_sad##m##x##n##_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
+ int invert_mask) { \
+ return aom_masked_sad_avx2(src, src_stride, ref, ref_stride, second_pred, \
+ msk, msk_stride, invert_mask, m, n); \
+ }
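+
+// Example call (hypothetical buffers) for one of the generated entry points;
+// with invert_mask == 0 the mask weights 'ref' and its complement weights
+// 'second_pred':
+//   unsigned int sad = aom_masked_sad32x32_avx2(src, src_stride, ref,
+//                                               ref_stride, second_pred, msk,
+//                                               msk_stride,
+//                                               /*invert_mask=*/0);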
+
+MASKSADMXN_AVX2(4, 4)
+MASKSADMXN_AVX2(4, 8)
+MASKSADMXN_AVX2(8, 4)
+MASKSADMXN_AVX2(8, 8)
+MASKSADMXN_AVX2(8, 16)
+MASKSADMXN_AVX2(16, 8)
+MASKSADMXN_AVX2(16, 16)
+MASKSADMXN_AVX2(16, 32)
+MASKSADMXN_AVX2(32, 16)
+MASKSADMXN_AVX2(32, 32)
+MASKSADMXN_AVX2(32, 64)
+MASKSADMXN_AVX2(64, 32)
+MASKSADMXN_AVX2(64, 64)
+MASKSADMXN_AVX2(64, 128)
+MASKSADMXN_AVX2(128, 64)
+MASKSADMXN_AVX2(128, 128)
+MASKSADMXN_AVX2(4, 16)
+MASKSADMXN_AVX2(16, 4)
+MASKSADMXN_AVX2(8, 32)
+MASKSADMXN_AVX2(32, 8)
+MASKSADMXN_AVX2(16, 64)
+MASKSADMXN_AVX2(64, 16)
+
+static INLINE unsigned int highbd_masked_sad8xh_avx2(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
+ int height) {
+ const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
+ int y;
+ __m256i res = _mm256_setzero_si256();
+ const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m256i round_const =
+ _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m256i one = _mm256_set1_epi16(1);
+
+ for (y = 0; y < height; y += 2) {
+ const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr);
+ const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr);
+ const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr);
+ // Zero-extend mask to 16 bits
+ const __m256i m = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)(m_ptr)),
+ _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride))));
+ const __m256i m_inv = _mm256_sub_epi16(mask_max, m);
+
+ const __m256i data_l = _mm256_unpacklo_epi16(a, b);
+ const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv);
+ __m256i pred_l = _mm256_madd_epi16(data_l, mask_l);
+ pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m256i data_r = _mm256_unpackhi_epi16(a, b);
+ const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv);
+ __m256i pred_r = _mm256_madd_epi16(data_r, mask_r);
+ pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
+ // so it is safe to do signed saturation here.
+ const __m256i pred = _mm256_packs_epi32(pred_l, pred_r);
+    // There is no 16-bit SAD instruction, so we have to synthesize
+    // a 16-element SAD. We do this by storing 8 32-bit partial SADs,
+    // and accumulating them at the end.
+ const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src));
+ res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one));
+
+ src_ptr += src_stride << 1;
+ a_ptr += a_stride << 1;
+ b_ptr += b_stride << 1;
+ m_ptr += m_stride << 1;
+ }
+  // At this point, we have eight 32-bit partial SADs stored in 'res'.
+ res = _mm256_hadd_epi32(res, res);
+ res = _mm256_hadd_epi32(res, res);
+ int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4);
+ return sad;
+}
+
+static INLINE unsigned int highbd_masked_sad16xh_avx2(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
+ int width, int height) {
+ const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
+ int x, y;
+ __m256i res = _mm256_setzero_si256();
+ const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m256i round_const =
+ _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m256i one = _mm256_set1_epi16(1);
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x += 16) {
+ const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]);
+ const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]);
+ const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]);
+ // Zero-extend mask to 16 bits
+ const __m256i m =
+ _mm256_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)&m_ptr[x]));
+ const __m256i m_inv = _mm256_sub_epi16(mask_max, m);
+
+ const __m256i data_l = _mm256_unpacklo_epi16(a, b);
+ const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv);
+ __m256i pred_l = _mm256_madd_epi16(data_l, mask_l);
+ pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m256i data_r = _mm256_unpackhi_epi16(a, b);
+ const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv);
+ __m256i pred_r = _mm256_madd_epi16(data_r, mask_r);
+ pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
+ // so it is safe to do signed saturation here.
+ const __m256i pred = _mm256_packs_epi32(pred_l, pred_r);
+      // There is no 16-bit SAD instruction, so we have to synthesize
+      // a 16-element SAD. We do this by storing 8 32-bit partial SADs,
+      // and accumulating them at the end.
+ const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src));
+ res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one));
+ }
+
+ src_ptr += src_stride;
+ a_ptr += a_stride;
+ b_ptr += b_stride;
+ m_ptr += m_stride;
+ }
+  // At this point, we have eight 32-bit partial SADs stored in 'res'.
+ res = _mm256_hadd_epi32(res, res);
+ res = _mm256_hadd_epi32(res, res);
+ int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4);
+ return sad;
+}
+
+static INLINE unsigned int aom_highbd_masked_sad_avx2(
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride,
+ int invert_mask, int m, int n) {
+ unsigned int sad;
+ if (!invert_mask) {
+ switch (m) {
+ case 4:
+ sad =
+ aom_highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride,
+ second_pred, m, msk, msk_stride, n);
+ break;
+ case 8:
+ sad = highbd_masked_sad8xh_avx2(src, src_stride, ref, ref_stride,
+ second_pred, m, msk, msk_stride, n);
+ break;
+ default:
+ sad = highbd_masked_sad16xh_avx2(src, src_stride, ref, ref_stride,
+ second_pred, m, msk, msk_stride, m, n);
+ break;
+ }
+ } else {
+ switch (m) {
+ case 4:
+ sad =
+ aom_highbd_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, n);
+ break;
+ case 8:
+ sad = highbd_masked_sad8xh_avx2(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, n);
+ break;
+ default:
+ sad = highbd_masked_sad16xh_avx2(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, m, n);
+ break;
+ }
+ }
+ return sad;
+}
+
+#define HIGHBD_MASKSADMXN_AVX2(m, n) \
+ unsigned int aom_highbd_masked_sad##m##x##n##_avx2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \
+ int msk_stride, int invert_mask) { \
+ return aom_highbd_masked_sad_avx2(src8, src_stride, ref8, ref_stride, \
+ second_pred8, msk, msk_stride, \
+ invert_mask, m, n); \
+ }
+
+HIGHBD_MASKSADMXN_AVX2(4, 4)
+HIGHBD_MASKSADMXN_AVX2(4, 8)
+HIGHBD_MASKSADMXN_AVX2(8, 4)
+HIGHBD_MASKSADMXN_AVX2(8, 8)
+HIGHBD_MASKSADMXN_AVX2(8, 16)
+HIGHBD_MASKSADMXN_AVX2(16, 8)
+HIGHBD_MASKSADMXN_AVX2(16, 16)
+HIGHBD_MASKSADMXN_AVX2(16, 32)
+HIGHBD_MASKSADMXN_AVX2(32, 16)
+HIGHBD_MASKSADMXN_AVX2(32, 32)
+HIGHBD_MASKSADMXN_AVX2(32, 64)
+HIGHBD_MASKSADMXN_AVX2(64, 32)
+HIGHBD_MASKSADMXN_AVX2(64, 64)
+HIGHBD_MASKSADMXN_AVX2(64, 128)
+HIGHBD_MASKSADMXN_AVX2(128, 64)
+HIGHBD_MASKSADMXN_AVX2(128, 128)
+HIGHBD_MASKSADMXN_AVX2(4, 16)
+HIGHBD_MASKSADMXN_AVX2(16, 4)
+HIGHBD_MASKSADMXN_AVX2(8, 32)
+HIGHBD_MASKSADMXN_AVX2(32, 8)
+HIGHBD_MASKSADMXN_AVX2(16, 64)
+HIGHBD_MASKSADMXN_AVX2(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
new file mode 100644
index 0000000000..df3a8764e3
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
@@ -0,0 +1,400 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#include "aom_dsp/x86/masked_sad_intrin_ssse3.h"
+
+// For width a multiple of 16
+static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int width, int height);
+
+#define MASKSADMXN_SSSE3(m, n) \
+ unsigned int aom_masked_sad##m##x##n##_ssse3( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
+ int invert_mask) { \
+ if (!invert_mask) \
+ return masked_sad_ssse3(src, src_stride, ref, ref_stride, second_pred, \
+ m, msk, msk_stride, m, n); \
+ else \
+ return masked_sad_ssse3(src, src_stride, second_pred, m, ref, \
+ ref_stride, msk, msk_stride, m, n); \
+ }
+
+#define MASKSAD8XN_SSSE3(n) \
+ unsigned int aom_masked_sad8x##n##_ssse3( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
+ int invert_mask) { \
+ if (!invert_mask) \
+ return aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, \
+ second_pred, 8, msk, msk_stride, n); \
+ else \
+ return aom_masked_sad8xh_ssse3(src, src_stride, second_pred, 8, ref, \
+ ref_stride, msk, msk_stride, n); \
+ }
+
+#define MASKSAD4XN_SSSE3(n) \
+ unsigned int aom_masked_sad4x##n##_ssse3( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
+ int invert_mask) { \
+ if (!invert_mask) \
+ return aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, \
+ second_pred, 4, msk, msk_stride, n); \
+ else \
+ return aom_masked_sad4xh_ssse3(src, src_stride, second_pred, 4, ref, \
+ ref_stride, msk, msk_stride, n); \
+ }
+
+MASKSADMXN_SSSE3(128, 128)
+MASKSADMXN_SSSE3(128, 64)
+MASKSADMXN_SSSE3(64, 128)
+MASKSADMXN_SSSE3(64, 64)
+MASKSADMXN_SSSE3(64, 32)
+MASKSADMXN_SSSE3(32, 64)
+MASKSADMXN_SSSE3(32, 32)
+MASKSADMXN_SSSE3(32, 16)
+MASKSADMXN_SSSE3(16, 32)
+MASKSADMXN_SSSE3(16, 16)
+MASKSADMXN_SSSE3(16, 8)
+MASKSAD8XN_SSSE3(16)
+MASKSAD8XN_SSSE3(8)
+MASKSAD8XN_SSSE3(4)
+MASKSAD4XN_SSSE3(8)
+MASKSAD4XN_SSSE3(4)
+MASKSAD4XN_SSSE3(16)
+MASKSADMXN_SSSE3(16, 4)
+MASKSAD8XN_SSSE3(32)
+MASKSADMXN_SSSE3(32, 8)
+MASKSADMXN_SSSE3(16, 64)
+MASKSADMXN_SSSE3(64, 16)
+
+static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int width, int height) {
+ int x, y;
+ __m128i res = _mm_setzero_si128();
+ const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x += 16) {
+ const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
+ const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
+ const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
+ const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
+ const __m128i m_inv = _mm_sub_epi8(mask_max, m);
+
+ // Calculate 16 predicted pixels.
+ // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
+ // is 64 * 255, so we have plenty of space to add rounding constants.
+ const __m128i data_l = _mm_unpacklo_epi8(a, b);
+ const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
+ __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
+ pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i data_r = _mm_unpackhi_epi8(a, b);
+ const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
+ __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
+ pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i pred = _mm_packus_epi16(pred_l, pred_r);
+ res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));
+ }
+
+ src_ptr += src_stride;
+ a_ptr += a_stride;
+ b_ptr += b_stride;
+ m_ptr += m_stride;
+ }
+ // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'.
+ unsigned int sad = (unsigned int)(_mm_cvtsi128_si32(res) +
+ _mm_cvtsi128_si32(_mm_srli_si128(res, 8)));
+ return sad;
+}
+
+unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int height) {
+ int y;
+ __m128i res = _mm_setzero_si128();
+ const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+
+ for (y = 0; y < height; y += 2) {
+ const __m128i src = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)src_ptr),
+ _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
+ const __m128i a0 = _mm_loadl_epi64((const __m128i *)a_ptr);
+ const __m128i a1 = _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]);
+ const __m128i b0 = _mm_loadl_epi64((const __m128i *)b_ptr);
+ const __m128i b1 = _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]);
+ const __m128i m =
+ _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr),
+ _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride]));
+ const __m128i m_inv = _mm_sub_epi8(mask_max, m);
+
+ const __m128i data_l = _mm_unpacklo_epi8(a0, b0);
+ const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
+ __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
+ pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i data_r = _mm_unpacklo_epi8(a1, b1);
+ const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
+ __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
+ pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i pred = _mm_packus_epi16(pred_l, pred_r);
+ res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));
+
+ src_ptr += src_stride * 2;
+ a_ptr += a_stride * 2;
+ b_ptr += b_stride * 2;
+ m_ptr += m_stride * 2;
+ }
+ unsigned int sad = (unsigned int)(_mm_cvtsi128_si32(res) +
+ _mm_cvtsi128_si32(_mm_srli_si128(res, 8)));
+ return sad;
+}
+
+unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int height) {
+ int y;
+ __m128i res = _mm_setzero_si128();
+ const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+
+ for (y = 0; y < height; y += 2) {
+    // Load two rows at a time; this seems to be a bit faster
+    // than four rows at a time in this case.
+ const __m128i src =
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)src_ptr),
+ _mm_cvtsi32_si128(*(int *)&src_ptr[src_stride]));
+ const __m128i a =
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)a_ptr),
+ _mm_cvtsi32_si128(*(int *)&a_ptr[a_stride]));
+ const __m128i b =
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)b_ptr),
+ _mm_cvtsi32_si128(*(int *)&b_ptr[b_stride]));
+ const __m128i m =
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)m_ptr),
+ _mm_cvtsi32_si128(*(int *)&m_ptr[m_stride]));
+ const __m128i m_inv = _mm_sub_epi8(mask_max, m);
+
+ const __m128i data = _mm_unpacklo_epi8(a, b);
+ const __m128i mask = _mm_unpacklo_epi8(m, m_inv);
+ __m128i pred_16bit = _mm_maddubs_epi16(data, mask);
+ pred_16bit = xx_roundn_epu16(pred_16bit, AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i pred = _mm_packus_epi16(pred_16bit, _mm_setzero_si128());
+ res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));
+
+ src_ptr += src_stride * 2;
+ a_ptr += a_stride * 2;
+ b_ptr += b_stride * 2;
+ m_ptr += m_stride * 2;
+ }
+ // At this point, the SAD is stored in lane 0 of 'res'
+ return (unsigned int)_mm_cvtsi128_si32(res);
+}
+
+// For width a multiple of 8
+static INLINE unsigned int highbd_masked_sad_ssse3(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
+ int width, int height);
+
+#define HIGHBD_MASKSADMXN_SSSE3(m, n) \
+ unsigned int aom_highbd_masked_sad##m##x##n##_ssse3( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \
+ int msk_stride, int invert_mask) { \
+ if (!invert_mask) \
+ return highbd_masked_sad_ssse3(src8, src_stride, ref8, ref_stride, \
+ second_pred8, m, msk, msk_stride, m, n); \
+ else \
+ return highbd_masked_sad_ssse3(src8, src_stride, second_pred8, m, ref8, \
+ ref_stride, msk, msk_stride, m, n); \
+ }
+
+#define HIGHBD_MASKSAD4XN_SSSE3(n) \
+ unsigned int aom_highbd_masked_sad4x##n##_ssse3( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \
+ int msk_stride, int invert_mask) { \
+ if (!invert_mask) \
+ return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, ref8, \
+ ref_stride, second_pred8, 4, msk, \
+ msk_stride, n); \
+ else \
+ return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, second_pred8, 4, \
+ ref8, ref_stride, msk, msk_stride, \
+ n); \
+ }
+
+HIGHBD_MASKSADMXN_SSSE3(128, 128)
+HIGHBD_MASKSADMXN_SSSE3(128, 64)
+HIGHBD_MASKSADMXN_SSSE3(64, 128)
+HIGHBD_MASKSADMXN_SSSE3(64, 64)
+HIGHBD_MASKSADMXN_SSSE3(64, 32)
+HIGHBD_MASKSADMXN_SSSE3(32, 64)
+HIGHBD_MASKSADMXN_SSSE3(32, 32)
+HIGHBD_MASKSADMXN_SSSE3(32, 16)
+HIGHBD_MASKSADMXN_SSSE3(16, 32)
+HIGHBD_MASKSADMXN_SSSE3(16, 16)
+HIGHBD_MASKSADMXN_SSSE3(16, 8)
+HIGHBD_MASKSADMXN_SSSE3(8, 16)
+HIGHBD_MASKSADMXN_SSSE3(8, 8)
+HIGHBD_MASKSADMXN_SSSE3(8, 4)
+HIGHBD_MASKSAD4XN_SSSE3(8)
+HIGHBD_MASKSAD4XN_SSSE3(4)
+HIGHBD_MASKSAD4XN_SSSE3(16)
+HIGHBD_MASKSADMXN_SSSE3(16, 4)
+HIGHBD_MASKSADMXN_SSSE3(8, 32)
+HIGHBD_MASKSADMXN_SSSE3(32, 8)
+HIGHBD_MASKSADMXN_SSSE3(16, 64)
+HIGHBD_MASKSADMXN_SSSE3(64, 16)
+
+static INLINE unsigned int highbd_masked_sad_ssse3(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
+ int width, int height) {
+ const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
+ int x, y;
+ __m128i res = _mm_setzero_si128();
+ const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m128i round_const =
+ _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x += 8) {
+ const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
+ const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
+ const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
+ // Zero-extend mask to 16 bits
+ const __m128i m = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((const __m128i *)&m_ptr[x]), _mm_setzero_si128());
+ const __m128i m_inv = _mm_sub_epi16(mask_max, m);
+
+ const __m128i data_l = _mm_unpacklo_epi16(a, b);
+ const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
+ __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
+ pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i data_r = _mm_unpackhi_epi16(a, b);
+ const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
+ __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
+ pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
+ // so it is safe to do signed saturation here.
+ const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
+ // There is no 16-bit SAD instruction, so we have to synthesize
+ // an 8-element SAD. We do this by storing 4 32-bit partial SADs,
+ // and accumulating them at the end
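+      // (_mm_madd_epi16 against a vector of ones sums adjacent 16-bit |diff|
+      // values into 32-bit lanes, which the horizontal adds below then reduce
+      // to the final SAD.)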
+ const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
+ res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));
+ }
+
+ src_ptr += src_stride;
+ a_ptr += a_stride;
+ b_ptr += b_stride;
+ m_ptr += m_stride;
+ }
+ // At this point, we have four 32-bit partial SADs stored in 'res'.
+ res = _mm_hadd_epi32(res, res);
+ res = _mm_hadd_epi32(res, res);
+ int sad = _mm_cvtsi128_si32(res);
+ return sad;
+}
+
+unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride,
+ const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int height) {
+ const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
+ int y;
+ __m128i res = _mm_setzero_si128();
+ const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m128i round_const =
+ _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (y = 0; y < height; y += 2) {
+ const __m128i src = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)src_ptr),
+ _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
+ const __m128i a =
+ _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)a_ptr),
+ _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]));
+ const __m128i b =
+ _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)b_ptr),
+ _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]));
+ // Zero-extend mask to 16 bits
+ const __m128i m = _mm_unpacklo_epi8(
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)m_ptr),
+ _mm_cvtsi32_si128(*(const int *)&m_ptr[m_stride])),
+ _mm_setzero_si128());
+ const __m128i m_inv = _mm_sub_epi16(mask_max, m);
+
+ const __m128i data_l = _mm_unpacklo_epi16(a, b);
+ const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
+ __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
+ pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i data_r = _mm_unpackhi_epi16(a, b);
+ const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
+ __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
+ pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
+ const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
+ res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));
+
+ src_ptr += src_stride * 2;
+ a_ptr += a_stride * 2;
+ b_ptr += b_stride * 2;
+ m_ptr += m_stride * 2;
+ }
+ res = _mm_hadd_epi32(res, res);
+ res = _mm_hadd_epi32(res, res);
+ int sad = _mm_cvtsi128_si32(res);
+ return sad;
+}
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h
new file mode 100644
index 0000000000..cffbd9672c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
+#define AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
+
+unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int height);
+
+unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int height);
+
+unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride,
+ const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int height);
+
+#endif // AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
new file mode 100644
index 0000000000..0bf383fffd
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
@@ -0,0 +1,1067 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/masked_variance_intrin_ssse3.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_ports/mem.h"
+
+// For width a multiple of 16
+static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset,
+ int yoffset, uint8_t *dst, int w, int h);
+
+static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset,
+ int yoffset, uint8_t *dst, int h);
+
+static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset,
+ int yoffset, uint8_t *dst, int h);
+
+// For width a multiple of 16
+static void masked_variance(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride, int width,
+ int height, unsigned int *sse, int *sum_);
+
+static void masked_variance8xh(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, const uint8_t *b_ptr,
+ const uint8_t *m_ptr, int m_stride, int height,
+ unsigned int *sse, int *sum_);
+
+static void masked_variance4xh(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, const uint8_t *b_ptr,
+ const uint8_t *m_ptr, int m_stride, int height,
+ unsigned int *sse, int *sum_);
+
+#define MASK_SUBPIX_VAR_SSSE3(W, H) \
+ unsigned int aom_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ int sum; \
+ uint8_t temp[(H + 1) * W]; \
+ \
+ bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \
+ \
+ if (!invert_mask) \
+ masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \
+ msk_stride, W, H, sse, &sum); \
+ else \
+ masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \
+ msk_stride, W, H, sse, &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \
+ }
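+
+// The return expression above is the usual variance identity,
+//   variance = SSE - (sum * sum) / (W * H),
+// where SSE and sum are, respectively, the sum of squared differences and the
+// sum of differences between 'ref' and the masked prediction.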
+
+#define MASK_SUBPIX_VAR8XH_SSSE3(H) \
+ unsigned int aom_masked_sub_pixel_variance8x##H##_ssse3( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ int sum; \
+ uint8_t temp[(H + 1) * 8]; \
+ \
+ bilinear_filter8xh(src, src_stride, xoffset, yoffset, temp, H); \
+ \
+ if (!invert_mask) \
+ masked_variance8xh(ref, ref_stride, temp, second_pred, msk, msk_stride, \
+ H, sse, &sum); \
+ else \
+ masked_variance8xh(ref, ref_stride, second_pred, temp, msk, msk_stride, \
+ H, sse, &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (8 * H)); \
+ }
+
+#define MASK_SUBPIX_VAR4XH_SSSE3(H) \
+ unsigned int aom_masked_sub_pixel_variance4x##H##_ssse3( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ int sum; \
+ uint8_t temp[(H + 1) * 4]; \
+ \
+ bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \
+ \
+ if (!invert_mask) \
+ masked_variance4xh(ref, ref_stride, temp, second_pred, msk, msk_stride, \
+ H, sse, &sum); \
+ else \
+ masked_variance4xh(ref, ref_stride, second_pred, temp, msk, msk_stride, \
+ H, sse, &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H)); \
+ }
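+
+// Note on the three macros above: each generated function applies the
+// standard variance identity, variance = sse - sum^2 / (block area). The
+// cross term is formed in 64-bit arithmetic because |sum| can reach
+// 255 * 128 * 128 (about 2^22) for the largest block, so sum * sum does not
+// fit in 32 bits.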
+
+MASK_SUBPIX_VAR_SSSE3(128, 128)
+MASK_SUBPIX_VAR_SSSE3(128, 64)
+MASK_SUBPIX_VAR_SSSE3(64, 128)
+MASK_SUBPIX_VAR_SSSE3(64, 64)
+MASK_SUBPIX_VAR_SSSE3(64, 32)
+MASK_SUBPIX_VAR_SSSE3(32, 64)
+MASK_SUBPIX_VAR_SSSE3(32, 32)
+MASK_SUBPIX_VAR_SSSE3(32, 16)
+MASK_SUBPIX_VAR_SSSE3(16, 32)
+MASK_SUBPIX_VAR_SSSE3(16, 16)
+MASK_SUBPIX_VAR_SSSE3(16, 8)
+MASK_SUBPIX_VAR8XH_SSSE3(16)
+MASK_SUBPIX_VAR8XH_SSSE3(8)
+MASK_SUBPIX_VAR8XH_SSSE3(4)
+MASK_SUBPIX_VAR4XH_SSSE3(8)
+MASK_SUBPIX_VAR4XH_SSSE3(4)
+MASK_SUBPIX_VAR4XH_SSSE3(16)
+MASK_SUBPIX_VAR_SSSE3(16, 4)
+MASK_SUBPIX_VAR8XH_SSSE3(32)
+MASK_SUBPIX_VAR_SSSE3(32, 8)
+MASK_SUBPIX_VAR_SSSE3(64, 16)
+MASK_SUBPIX_VAR_SSSE3(16, 64)
+
+static INLINE __m128i filter_block(const __m128i a, const __m128i b,
+ const __m128i filter) {
+ __m128i v0 = _mm_unpacklo_epi8(a, b);
+ v0 = _mm_maddubs_epi16(v0, filter);
+ v0 = xx_roundn_epu16(v0, FILTER_BITS);
+
+ __m128i v1 = _mm_unpackhi_epi8(a, b);
+ v1 = _mm_maddubs_epi16(v1, filter);
+ v1 = xx_roundn_epu16(v1, FILTER_BITS);
+
+ return _mm_packus_epi16(v0, v1);
+}
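+
+// Scalar sketch of filter_block() for one pixel, assuming the usual 2-tap
+// table bilinear_filters_2t whose coefficients sum to 1 << FILTER_BITS:
+//   out[i] = ROUND_POWER_OF_TWO(f0 * a[i] + f1 * b[i], FILTER_BITS)
+// The {128, 0} and {64, 64} entries are special-cased by the callers, so the
+// coefficients reaching _mm_maddubs_epi16 are at most 112, and the products
+// sum to at most 128 * 255 = 32640, which fits in a signed 16-bit lane
+// without saturation.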
+
+static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset,
+ int yoffset, uint8_t *dst, int w, int h) {
+ int i, j;
+ // Horizontal filter
+ if (xoffset == 0) {
+ uint8_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ for (j = 0; j < w; j += 16) {
+ __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
+ _mm_storeu_si128((__m128i *)&b[j], x);
+ }
+ src += src_stride;
+ b += w;
+ }
+ } else if (xoffset == 4) {
+ uint8_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ for (j = 0; j < w; j += 16) {
+ __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
+ __m128i y = _mm_loadu_si128((__m128i *)&src[j + 16]);
+ __m128i z = _mm_alignr_epi8(y, x, 1);
+ _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu8(x, z));
+ }
+ src += src_stride;
+ b += w;
+ }
+ } else {
+ uint8_t *b = dst;
+ const uint8_t *hfilter = bilinear_filters_2t[xoffset];
+ const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8));
+ for (i = 0; i < h + 1; ++i) {
+ for (j = 0; j < w; j += 16) {
+ const __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
+ const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 16]);
+ const __m128i z = _mm_alignr_epi8(y, x, 1);
+ const __m128i res = filter_block(x, z, hfilter_vec);
+ _mm_storeu_si128((__m128i *)&b[j], res);
+ }
+
+ src += src_stride;
+ b += w;
+ }
+ }
+
+ // Vertical filter
+ if (yoffset == 0) {
+ // The data is already in 'dst', so no need to filter
+ } else if (yoffset == 4) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 16) {
+ __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
+ __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
+ _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu8(x, y));
+ }
+ dst += w;
+ }
+ } else {
+ const uint8_t *vfilter = bilinear_filters_2t[yoffset];
+ const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8));
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 16) {
+ const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
+ const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
+ const __m128i res = filter_block(x, y, vfilter_vec);
+ _mm_storeu_si128((__m128i *)&dst[j], res);
+ }
+
+ dst += w;
+ }
+ }
+}
+
+static INLINE __m128i filter_block_2rows(const __m128i *a0, const __m128i *b0,
+ const __m128i *a1, const __m128i *b1,
+ const __m128i *filter) {
+ __m128i v0 = _mm_unpacklo_epi8(*a0, *b0);
+ v0 = _mm_maddubs_epi16(v0, *filter);
+ v0 = xx_roundn_epu16(v0, FILTER_BITS);
+
+ __m128i v1 = _mm_unpacklo_epi8(*a1, *b1);
+ v1 = _mm_maddubs_epi16(v1, *filter);
+ v1 = xx_roundn_epu16(v1, FILTER_BITS);
+
+ return _mm_packus_epi16(v0, v1);
+}
+
+static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset,
+ int yoffset, uint8_t *dst, int h) {
+ int i;
+ // Horizontal filter
+ if (xoffset == 0) {
+ uint8_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ __m128i x = _mm_loadl_epi64((__m128i *)src);
+ _mm_storel_epi64((__m128i *)b, x);
+ src += src_stride;
+ b += 8;
+ }
+ } else if (xoffset == 4) {
+ uint8_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ __m128i x = _mm_loadu_si128((__m128i *)src);
+ __m128i z = _mm_srli_si128(x, 1);
+ _mm_storel_epi64((__m128i *)b, _mm_avg_epu8(x, z));
+ src += src_stride;
+ b += 8;
+ }
+ } else {
+ uint8_t *b = dst;
+ const uint8_t *hfilter = bilinear_filters_2t[xoffset];
+ const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8));
+ for (i = 0; i < h; i += 2) {
+ const __m128i x0 = _mm_loadu_si128((__m128i *)src);
+ const __m128i z0 = _mm_srli_si128(x0, 1);
+ const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]);
+ const __m128i z1 = _mm_srli_si128(x1, 1);
+ const __m128i res = filter_block_2rows(&x0, &z0, &x1, &z1, &hfilter_vec);
+ _mm_storeu_si128((__m128i *)b, res);
+
+ src += src_stride * 2;
+ b += 16;
+ }
+ // Handle i = h separately
+ const __m128i x0 = _mm_loadu_si128((__m128i *)src);
+ const __m128i z0 = _mm_srli_si128(x0, 1);
+
+ __m128i v0 = _mm_unpacklo_epi8(x0, z0);
+ v0 = _mm_maddubs_epi16(v0, hfilter_vec);
+ v0 = xx_roundn_epu16(v0, FILTER_BITS);
+
+ _mm_storel_epi64((__m128i *)b, _mm_packus_epi16(v0, v0));
+ }
+
+ // Vertical filter
+ if (yoffset == 0) {
+ // The data is already in 'dst', so no need to filter
+ } else if (yoffset == 4) {
+ for (i = 0; i < h; ++i) {
+ __m128i x = _mm_loadl_epi64((__m128i *)dst);
+ __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]);
+ _mm_storel_epi64((__m128i *)dst, _mm_avg_epu8(x, y));
+ dst += 8;
+ }
+ } else {
+ const uint8_t *vfilter = bilinear_filters_2t[yoffset];
+ const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8));
+ for (i = 0; i < h; i += 2) {
+ const __m128i x = _mm_loadl_epi64((__m128i *)dst);
+ const __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]);
+ const __m128i z = _mm_loadl_epi64((__m128i *)&dst[16]);
+ const __m128i res = filter_block_2rows(&x, &y, &y, &z, &vfilter_vec);
+ _mm_storeu_si128((__m128i *)dst, res);
+
+ dst += 16;
+ }
+ }
+}
+
+static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset,
+ int yoffset, uint8_t *dst, int h) {
+ int i;
+ // Horizontal filter
+ if (xoffset == 0) {
+ uint8_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ __m128i x = xx_loadl_32((__m128i *)src);
+ xx_storel_32(b, x);
+ src += src_stride;
+ b += 4;
+ }
+ } else if (xoffset == 4) {
+ uint8_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ __m128i x = _mm_loadl_epi64((__m128i *)src);
+ __m128i z = _mm_srli_si128(x, 1);
+ xx_storel_32(b, _mm_avg_epu8(x, z));
+ src += src_stride;
+ b += 4;
+ }
+ } else {
+ uint8_t *b = dst;
+ const uint8_t *hfilter = bilinear_filters_2t[xoffset];
+ const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8));
+ for (i = 0; i < h; i += 4) {
+ const __m128i x0 = _mm_loadl_epi64((__m128i *)src);
+ const __m128i z0 = _mm_srli_si128(x0, 1);
+ const __m128i x1 = _mm_loadl_epi64((__m128i *)&src[src_stride]);
+ const __m128i z1 = _mm_srli_si128(x1, 1);
+ const __m128i x2 = _mm_loadl_epi64((__m128i *)&src[src_stride * 2]);
+ const __m128i z2 = _mm_srli_si128(x2, 1);
+ const __m128i x3 = _mm_loadl_epi64((__m128i *)&src[src_stride * 3]);
+ const __m128i z3 = _mm_srli_si128(x3, 1);
+
+ const __m128i a0 = _mm_unpacklo_epi32(x0, x1);
+ const __m128i b0 = _mm_unpacklo_epi32(z0, z1);
+ const __m128i a1 = _mm_unpacklo_epi32(x2, x3);
+ const __m128i b1 = _mm_unpacklo_epi32(z2, z3);
+ const __m128i res = filter_block_2rows(&a0, &b0, &a1, &b1, &hfilter_vec);
+ _mm_storeu_si128((__m128i *)b, res);
+
+ src += src_stride * 4;
+ b += 16;
+ }
+ // Handle i = h separately
+ const __m128i x = _mm_loadl_epi64((__m128i *)src);
+ const __m128i z = _mm_srli_si128(x, 1);
+
+ __m128i v0 = _mm_unpacklo_epi8(x, z);
+ v0 = _mm_maddubs_epi16(v0, hfilter_vec);
+ v0 = xx_roundn_epu16(v0, FILTER_BITS);
+
+ xx_storel_32(b, _mm_packus_epi16(v0, v0));
+ }
+
+ // Vertical filter
+ if (yoffset == 0) {
+ // The data is already in 'dst', so no need to filter
+ } else if (yoffset == 4) {
+ for (i = 0; i < h; ++i) {
+ __m128i x = xx_loadl_32((__m128i *)dst);
+ __m128i y = xx_loadl_32((__m128i *)&dst[4]);
+ xx_storel_32(dst, _mm_avg_epu8(x, y));
+ dst += 4;
+ }
+ } else {
+ const uint8_t *vfilter = bilinear_filters_2t[yoffset];
+ const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8));
+ for (i = 0; i < h; i += 4) {
+ const __m128i a = xx_loadl_32((__m128i *)dst);
+ const __m128i b = xx_loadl_32((__m128i *)&dst[4]);
+ const __m128i c = xx_loadl_32((__m128i *)&dst[8]);
+ const __m128i d = xx_loadl_32((__m128i *)&dst[12]);
+ const __m128i e = xx_loadl_32((__m128i *)&dst[16]);
+
+ const __m128i a0 = _mm_unpacklo_epi32(a, b);
+ const __m128i b0 = _mm_unpacklo_epi32(b, c);
+ const __m128i a1 = _mm_unpacklo_epi32(c, d);
+ const __m128i b1 = _mm_unpacklo_epi32(d, e);
+ const __m128i res = filter_block_2rows(&a0, &b0, &a1, &b1, &vfilter_vec);
+ _mm_storeu_si128((__m128i *)dst, res);
+
+ dst += 16;
+ }
+ }
+}
+
+static INLINE void accumulate_block(const __m128i *src, const __m128i *a,
+ const __m128i *b, const __m128i *m,
+ __m128i *sum, __m128i *sum_sq) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m128i m_inv = _mm_sub_epi8(mask_max, *m);
+
+ // Calculate 16 predicted pixels.
+ // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
+ // is 64 * 255, so we have plenty of space to add rounding constants.
+ const __m128i data_l = _mm_unpacklo_epi8(*a, *b);
+ const __m128i mask_l = _mm_unpacklo_epi8(*m, m_inv);
+ __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
+ pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i data_r = _mm_unpackhi_epi8(*a, *b);
+ const __m128i mask_r = _mm_unpackhi_epi8(*m, m_inv);
+ __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
+ pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i src_l = _mm_unpacklo_epi8(*src, zero);
+ const __m128i src_r = _mm_unpackhi_epi8(*src, zero);
+ const __m128i diff_l = _mm_sub_epi16(pred_l, src_l);
+ const __m128i diff_r = _mm_sub_epi16(pred_r, src_r);
+
+ // Update partial sums and partial sums of squares
+ *sum =
+ _mm_add_epi32(*sum, _mm_madd_epi16(_mm_add_epi16(diff_l, diff_r), one));
+ *sum_sq =
+ _mm_add_epi32(*sum_sq, _mm_add_epi32(_mm_madd_epi16(diff_l, diff_l),
+ _mm_madd_epi16(diff_r, diff_r)));
+}
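+
+// Per-pixel scalar sketch of accumulate_block(), assuming the usual
+// AOM_BLEND_A64 semantics with mask values in [0, 64]:
+//   pred = ROUND_POWER_OF_TWO(m * a + (64 - m) * b, AOM_BLEND_A64_ROUND_BITS);
+//   diff = pred - src;
+//   *sum += diff;
+//   *sum_sq += diff * diff;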
+
+static void masked_variance(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride, int width,
+ int height, unsigned int *sse, int *sum_) {
+ int x, y;
+ __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x += 16) {
+ const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
+ const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
+ const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
+ const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
+ accumulate_block(&src, &a, &b, &m, &sum, &sum_sq);
+ }
+
+ src_ptr += src_stride;
+ a_ptr += a_stride;
+ b_ptr += b_stride;
+ m_ptr += m_stride;
+ }
+ // Reduce down to a single sum and sum of squares
+ sum = _mm_hadd_epi32(sum, sum_sq);
+ sum = _mm_hadd_epi32(sum, sum);
+ *sum_ = _mm_cvtsi128_si32(sum);
+ *sse = (unsigned int)_mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
+}
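+
+// Note on the reduction above: the first _mm_hadd_epi32 interleaves the four
+// partial sums with the four partial sums of squares, so after the second
+// hadd lane 0 holds the total sum and lane 1 the total sum of squares. The
+// 8xH and 4xH variants below use the same layout.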
+
+static void masked_variance8xh(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, const uint8_t *b_ptr,
+ const uint8_t *m_ptr, int m_stride, int height,
+ unsigned int *sse, int *sum_) {
+ int y;
+ __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
+
+ for (y = 0; y < height; y += 2) {
+ __m128i src = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)src_ptr),
+ _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
+ const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
+ const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
+ const __m128i m =
+ _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr),
+ _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride]));
+ accumulate_block(&src, &a, &b, &m, &sum, &sum_sq);
+
+ src_ptr += src_stride * 2;
+ a_ptr += 16;
+ b_ptr += 16;
+ m_ptr += m_stride * 2;
+ }
+ // Reduce down to a single sum and sum of squares
+ sum = _mm_hadd_epi32(sum, sum_sq);
+ sum = _mm_hadd_epi32(sum, sum);
+ *sum_ = _mm_cvtsi128_si32(sum);
+ *sse = (unsigned int)_mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
+}
+
+static void masked_variance4xh(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, const uint8_t *b_ptr,
+ const uint8_t *m_ptr, int m_stride, int height,
+ unsigned int *sse, int *sum_) {
+ int y;
+ __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
+
+ for (y = 0; y < height; y += 4) {
+ // Load four rows at a time
+ __m128i src = _mm_setr_epi32(*(int *)src_ptr, *(int *)&src_ptr[src_stride],
+ *(int *)&src_ptr[src_stride * 2],
+ *(int *)&src_ptr[src_stride * 3]);
+ const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
+ const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
+ const __m128i m = _mm_setr_epi32(*(int *)m_ptr, *(int *)&m_ptr[m_stride],
+ *(int *)&m_ptr[m_stride * 2],
+ *(int *)&m_ptr[m_stride * 3]);
+ accumulate_block(&src, &a, &b, &m, &sum, &sum_sq);
+
+ src_ptr += src_stride * 4;
+ a_ptr += 16;
+ b_ptr += 16;
+ m_ptr += m_stride * 4;
+ }
+ // Reduce down to a single sum and sum of squares
+ sum = _mm_hadd_epi32(sum, sum_sq);
+ sum = _mm_hadd_epi32(sum, sum);
+ *sum_ = _mm_cvtsi128_si32(sum);
+ *sse = (unsigned int)_mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// For width a multiple of 8
+static void highbd_bilinear_filter(const uint16_t *src, int src_stride,
+ int xoffset, int yoffset, uint16_t *dst,
+ int w, int h);
+
+static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride,
+ int xoffset, int yoffset, uint16_t *dst,
+ int h);
+
+// For width a multiple of 8
+static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride,
+ const uint16_t *a_ptr, int a_stride,
+ const uint16_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int width, int height, uint64_t *sse,
+ int *sum_);
+
+static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride,
+ const uint16_t *a_ptr,
+ const uint16_t *b_ptr,
+ const uint8_t *m_ptr, int m_stride,
+ int height, int *sse, int *sum_);
+
+#define HIGHBD_MASK_SUBPIX_VAR_SSSE3(W, H) \
+ unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \
+ const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
+ uint64_t sse64; \
+ int sum; \
+ uint16_t temp[(H + 1) * W]; \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \
+ \
+ highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \
+ \
+ if (!invert_mask) \
+ highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \
+ msk_stride, W, H, &sse64, &sum); \
+ else \
+ highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \
+ msk_stride, W, H, &sse64, &sum); \
+ *sse = (uint32_t)sse64; \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \
+ } \
+ unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \
+ const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
+ uint64_t sse64; \
+ int sum; \
+ int64_t var; \
+ uint16_t temp[(H + 1) * W]; \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \
+ \
+ highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \
+ \
+ if (!invert_mask) \
+ highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \
+ msk_stride, W, H, &sse64, &sum); \
+ else \
+ highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \
+ msk_stride, W, H, &sse64, &sum); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 4); \
+ sum = ROUND_POWER_OF_TWO(sum, 2); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \
+ const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
+ uint64_t sse64; \
+ int sum; \
+ int64_t var; \
+ uint16_t temp[(H + 1) * W]; \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \
+ \
+ highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \
+ \
+ if (!invert_mask) \
+ highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \
+ msk_stride, W, H, &sse64, &sum); \
+ else \
+ highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \
+ msk_stride, W, H, &sse64, &sum); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 8); \
+ sum = ROUND_POWER_OF_TWO(sum, 4); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+#define HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(H) \
+ unsigned int aom_highbd_8_masked_sub_pixel_variance4x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \
+ const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
+ int sse_; \
+ int sum; \
+ uint16_t temp[(H + 1) * 4]; \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \
+ \
+ highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \
+ \
+ if (!invert_mask) \
+ highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \
+ msk_stride, H, &sse_, &sum); \
+ else \
+ highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \
+ msk_stride, H, &sse_, &sum); \
+ *sse = (uint32_t)sse_; \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H)); \
+ } \
+ unsigned int aom_highbd_10_masked_sub_pixel_variance4x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \
+ const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
+ int sse_; \
+ int sum; \
+ int64_t var; \
+ uint16_t temp[(H + 1) * 4]; \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \
+ \
+ highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \
+ \
+ if (!invert_mask) \
+ highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \
+ msk_stride, H, &sse_, &sum); \
+ else \
+ highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \
+ msk_stride, H, &sse_, &sum); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 4); \
+ sum = ROUND_POWER_OF_TWO(sum, 2); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (4 * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ unsigned int aom_highbd_12_masked_sub_pixel_variance4x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \
+ const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
+ int sse_; \
+ int sum; \
+ int64_t var; \
+ uint16_t temp[(H + 1) * 4]; \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \
+ \
+ highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \
+ \
+ if (!invert_mask) \
+ highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \
+ msk_stride, H, &sse_, &sum); \
+ else \
+ highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \
+ msk_stride, H, &sse_, &sum); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 8); \
+ sum = ROUND_POWER_OF_TWO(sum, 4); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (4 * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
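+
+// In the macros above, the 10-bit and 12-bit variants rescale sse and sum to
+// an 8-bit basis (shifting by 2 * (bd - 8) and (bd - 8) bits respectively)
+// before applying the variance identity, and clamp the result at zero since
+// the rounding can leave it marginally negative.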
+
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 128)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 64)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 128)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 64)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 32)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 64)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 32)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 16)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 32)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 16)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 8)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 16)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 8)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 4)
+HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(8)
+HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(4)
+HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(16)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 4)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 32)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 8)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 64)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 16)
+
+static INLINE __m128i highbd_filter_block(const __m128i a, const __m128i b,
+ const __m128i filter) {
+ __m128i v0 = _mm_unpacklo_epi16(a, b);
+ v0 = _mm_madd_epi16(v0, filter);
+ v0 = xx_roundn_epu32(v0, FILTER_BITS);
+
+ __m128i v1 = _mm_unpackhi_epi16(a, b);
+ v1 = _mm_madd_epi16(v1, filter);
+ v1 = xx_roundn_epu32(v1, FILTER_BITS);
+
+ return _mm_packs_epi32(v0, v1);
+}
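+
+// The high-bitdepth path mirrors filter_block() but operates on 16-bit
+// pixels: _mm_madd_epi16 forms f0 * a[i] + f1 * b[i] in 32-bit lanes (at
+// most 128 * 4095 for 12-bit input), and xx_roundn_epu32 rounds by
+// FILTER_BITS before packing back to 16 bits.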
+
+static void highbd_bilinear_filter(const uint16_t *src, int src_stride,
+ int xoffset, int yoffset, uint16_t *dst,
+ int w, int h) {
+ int i, j;
+ // Horizontal filter
+ if (xoffset == 0) {
+ uint16_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ for (j = 0; j < w; j += 8) {
+ __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
+ _mm_storeu_si128((__m128i *)&b[j], x);
+ }
+ src += src_stride;
+ b += w;
+ }
+ } else if (xoffset == 4) {
+ uint16_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ for (j = 0; j < w; j += 8) {
+ __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
+ __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]);
+ __m128i z = _mm_alignr_epi8(y, x, 2);
+ _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu16(x, z));
+ }
+ src += src_stride;
+ b += w;
+ }
+ } else {
+ uint16_t *b = dst;
+ const uint8_t *hfilter = bilinear_filters_2t[xoffset];
+ const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16));
+ for (i = 0; i < h + 1; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
+ const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]);
+ const __m128i z = _mm_alignr_epi8(y, x, 2);
+ const __m128i res = highbd_filter_block(x, z, hfilter_vec);
+ _mm_storeu_si128((__m128i *)&b[j], res);
+ }
+
+ src += src_stride;
+ b += w;
+ }
+ }
+
+ // Vertical filter
+ if (yoffset == 0) {
+ // The data is already in 'dst', so no need to filter
+ } else if (yoffset == 4) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
+ __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
+ _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu16(x, y));
+ }
+ dst += w;
+ }
+ } else {
+ const uint8_t *vfilter = bilinear_filters_2t[yoffset];
+ const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16));
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
+ const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
+ const __m128i res = highbd_filter_block(x, y, vfilter_vec);
+ _mm_storeu_si128((__m128i *)&dst[j], res);
+ }
+
+ dst += w;
+ }
+ }
+}
+
+static INLINE __m128i highbd_filter_block_2rows(const __m128i *a0,
+ const __m128i *b0,
+ const __m128i *a1,
+ const __m128i *b1,
+ const __m128i *filter) {
+ __m128i v0 = _mm_unpacklo_epi16(*a0, *b0);
+ v0 = _mm_madd_epi16(v0, *filter);
+ v0 = xx_roundn_epu32(v0, FILTER_BITS);
+
+ __m128i v1 = _mm_unpacklo_epi16(*a1, *b1);
+ v1 = _mm_madd_epi16(v1, *filter);
+ v1 = xx_roundn_epu32(v1, FILTER_BITS);
+
+ return _mm_packs_epi32(v0, v1);
+}
+
+static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride,
+ int xoffset, int yoffset, uint16_t *dst,
+ int h) {
+ int i;
+ // Horizontal filter
+ if (xoffset == 0) {
+ uint16_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ __m128i x = _mm_loadl_epi64((__m128i *)src);
+ _mm_storel_epi64((__m128i *)b, x);
+ src += src_stride;
+ b += 4;
+ }
+ } else if (xoffset == 4) {
+ uint16_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ __m128i x = _mm_loadu_si128((__m128i *)src);
+ __m128i z = _mm_srli_si128(x, 2);
+ _mm_storel_epi64((__m128i *)b, _mm_avg_epu16(x, z));
+ src += src_stride;
+ b += 4;
+ }
+ } else {
+ uint16_t *b = dst;
+ const uint8_t *hfilter = bilinear_filters_2t[xoffset];
+ const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16));
+ for (i = 0; i < h; i += 2) {
+ const __m128i x0 = _mm_loadu_si128((__m128i *)src);
+ const __m128i z0 = _mm_srli_si128(x0, 2);
+ const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]);
+ const __m128i z1 = _mm_srli_si128(x1, 2);
+ const __m128i res =
+ highbd_filter_block_2rows(&x0, &z0, &x1, &z1, &hfilter_vec);
+ _mm_storeu_si128((__m128i *)b, res);
+
+ src += src_stride * 2;
+ b += 8;
+ }
+    // Handle i = h separately
+ __m128i x = _mm_loadu_si128((__m128i *)src);
+ __m128i z = _mm_srli_si128(x, 2);
+
+ __m128i v0 = _mm_unpacklo_epi16(x, z);
+ v0 = _mm_madd_epi16(v0, hfilter_vec);
+ v0 = xx_roundn_epu32(v0, FILTER_BITS);
+
+ _mm_storel_epi64((__m128i *)b, _mm_packs_epi32(v0, v0));
+ }
+
+ // Vertical filter
+ if (yoffset == 0) {
+ // The data is already in 'dst', so no need to filter
+ } else if (yoffset == 4) {
+ for (i = 0; i < h; ++i) {
+ __m128i x = _mm_loadl_epi64((__m128i *)dst);
+ __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]);
+ _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(x, y));
+ dst += 4;
+ }
+ } else {
+ const uint8_t *vfilter = bilinear_filters_2t[yoffset];
+ const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16));
+ for (i = 0; i < h; i += 2) {
+ const __m128i x = _mm_loadl_epi64((__m128i *)dst);
+ const __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]);
+ const __m128i z = _mm_loadl_epi64((__m128i *)&dst[8]);
+ const __m128i res =
+ highbd_filter_block_2rows(&x, &y, &y, &z, &vfilter_vec);
+ _mm_storeu_si128((__m128i *)dst, res);
+
+ dst += 8;
+ }
+ }
+}
+
+static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride,
+ const uint16_t *a_ptr, int a_stride,
+ const uint16_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int width, int height, uint64_t *sse,
+ int *sum_) {
+ int x, y;
+ // Note on bit widths:
+ // The maximum value of 'sum' is (2^12 - 1) * 128 * 128 =~ 2^26,
+ // so this can be kept as four 32-bit values.
+ // But the maximum value of 'sum_sq' is (2^12 - 1)^2 * 128 * 128 =~ 2^38,
+ // so this must be stored as two 64-bit values.
+ __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
+ const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m128i round_const =
+ _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m128i zero = _mm_setzero_si128();
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x += 8) {
+ const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
+ const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
+ const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
+ const __m128i m =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&m_ptr[x]), zero);
+ const __m128i m_inv = _mm_sub_epi16(mask_max, m);
+
+ // Calculate 8 predicted pixels.
+ const __m128i data_l = _mm_unpacklo_epi16(a, b);
+ const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
+ __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
+ pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i data_r = _mm_unpackhi_epi16(a, b);
+ const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
+ __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
+ pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i src_l = _mm_unpacklo_epi16(src, zero);
+ const __m128i src_r = _mm_unpackhi_epi16(src, zero);
+ __m128i diff_l = _mm_sub_epi32(pred_l, src_l);
+ __m128i diff_r = _mm_sub_epi32(pred_r, src_r);
+
+ // Update partial sums and partial sums of squares
+ sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r));
+ // A trick: Now each entry of diff_l and diff_r is stored in a 32-bit
+ // field, but the range of values is only [-(2^12 - 1), 2^12 - 1].
+ // So we can re-pack into 16-bit fields and use _mm_madd_epi16
+ // to calculate the squares and partially sum them.
+ const __m128i tmp = _mm_packs_epi32(diff_l, diff_r);
+ const __m128i prod = _mm_madd_epi16(tmp, tmp);
+ // Then we want to sign-extend to 64 bits and accumulate
+ const __m128i sign = _mm_srai_epi32(prod, 31);
+ const __m128i tmp_0 = _mm_unpacklo_epi32(prod, sign);
+ const __m128i tmp_1 = _mm_unpackhi_epi32(prod, sign);
+ sum_sq = _mm_add_epi64(sum_sq, _mm_add_epi64(tmp_0, tmp_1));
+ }
+
+ src_ptr += src_stride;
+ a_ptr += a_stride;
+ b_ptr += b_stride;
+ m_ptr += m_stride;
+ }
+ // Reduce down to a single sum and sum of squares
+ sum = _mm_hadd_epi32(sum, zero);
+ sum = _mm_hadd_epi32(sum, zero);
+ *sum_ = _mm_cvtsi128_si32(sum);
+ sum_sq = _mm_add_epi64(sum_sq, _mm_srli_si128(sum_sq, 8));
+ _mm_storel_epi64((__m128i *)sse, sum_sq);
+}
+
+static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride,
+ const uint16_t *a_ptr,
+ const uint16_t *b_ptr,
+ const uint8_t *m_ptr, int m_stride,
+ int height, int *sse, int *sum_) {
+ int y;
+  // Note: For this function, h is at most 16 (4xH blocks have h in
+  // {4, 8, 16}, including the 4:1 partitions instantiated above).
+ // So the maximum value of sum is (2^12 - 1) * 4 * 16 =~ 2^18
+ // and the maximum value of sum_sq is (2^12 - 1)^2 * 4 * 16 =~ 2^30.
+ // So we can safely pack sum_sq into 32-bit fields, which is slightly more
+ // convenient.
+ __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
+ const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m128i round_const =
+ _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m128i zero = _mm_setzero_si128();
+
+ for (y = 0; y < height; y += 2) {
+ __m128i src = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)src_ptr),
+ _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
+ const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
+ const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
+ const __m128i m = _mm_unpacklo_epi8(
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)m_ptr),
+ _mm_cvtsi32_si128(*(const int *)&m_ptr[m_stride])),
+ zero);
+ const __m128i m_inv = _mm_sub_epi16(mask_max, m);
+
+ const __m128i data_l = _mm_unpacklo_epi16(a, b);
+ const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
+ __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
+ pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i data_r = _mm_unpackhi_epi16(a, b);
+ const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
+ __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
+ pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i src_l = _mm_unpacklo_epi16(src, zero);
+ const __m128i src_r = _mm_unpackhi_epi16(src, zero);
+ __m128i diff_l = _mm_sub_epi32(pred_l, src_l);
+ __m128i diff_r = _mm_sub_epi32(pred_r, src_r);
+
+ // Update partial sums and partial sums of squares
+ sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r));
+ const __m128i tmp = _mm_packs_epi32(diff_l, diff_r);
+ const __m128i prod = _mm_madd_epi16(tmp, tmp);
+ sum_sq = _mm_add_epi32(sum_sq, prod);
+
+ src_ptr += src_stride * 2;
+ a_ptr += 8;
+ b_ptr += 8;
+ m_ptr += m_stride * 2;
+ }
+ // Reduce down to a single sum and sum of squares
+ sum = _mm_hadd_epi32(sum, sum_sq);
+ sum = _mm_hadd_epi32(sum, zero);
+ *sum_ = _mm_cvtsi128_si32(sum);
+ *sse = (unsigned int)_mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, const uint8_t *ref,
+ int ref_stride, const uint8_t *mask,
+ int mask_stride, int invert_mask) {
+ const uint8_t *src0 = invert_mask ? pred : ref;
+ const uint8_t *src1 = invert_mask ? ref : pred;
+ const int stride0 = invert_mask ? width : ref_stride;
+ const int stride1 = invert_mask ? ref_stride : width;
+ assert(height % 2 == 0);
+ int i = 0;
+ if (width == 8) {
+ comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1,
+ mask, mask_stride);
+ } else if (width == 16) {
+ do {
+ comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred);
+ comp_mask_pred_16_ssse3(src0 + stride0, src1 + stride1,
+ mask + mask_stride, comp_pred + width);
+ comp_pred += (width << 1);
+ src0 += (stride0 << 1);
+ src1 += (stride1 << 1);
+ mask += (mask_stride << 1);
+ i += 2;
+ } while (i < height);
+ } else {
+ do {
+ for (int x = 0; x < width; x += 32) {
+ comp_mask_pred_16_ssse3(src0 + x, src1 + x, mask + x, comp_pred);
+ comp_mask_pred_16_ssse3(src0 + x + 16, src1 + x + 16, mask + x + 16,
+ comp_pred + 16);
+ comp_pred += 32;
+ }
+ src0 += (stride0);
+ src1 += (stride1);
+ mask += (mask_stride);
+ i += 1;
+ } while (i < height);
+ }
+}
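+
+// Note: the comp_mask_pred_{8,16}_ssse3 helpers used above (defined in
+// masked_variance_intrin_ssse3.h) blend with AOM_BLEND_A64 weights; their
+// _mm_mulhrs_epi16 against 1 << (15 - AOM_BLEND_A64_ROUND_BITS) is
+// equivalent, for the value ranges involved, to
+// ROUND_POWER_OF_TWO(m * s0 + (64 - m) * s1, AOM_BLEND_A64_ROUND_BITS).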
diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h
new file mode 100644
index 0000000000..4faa098ace
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
+#define AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
+
+#include <stdlib.h>
+#include <string.h>
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+
+static INLINE void comp_mask_pred_16_ssse3(const uint8_t *src0,
+ const uint8_t *src1,
+ const uint8_t *mask, uint8_t *dst) {
+ const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i round_offset =
+ _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+
+ const __m128i sA0 = _mm_lddqu_si128((const __m128i *)(src0));
+ const __m128i sA1 = _mm_lddqu_si128((const __m128i *)(src1));
+ const __m128i aA = _mm_load_si128((const __m128i *)(mask));
+
+ const __m128i maA = _mm_sub_epi8(alpha_max, aA);
+
+ const __m128i ssAL = _mm_unpacklo_epi8(sA0, sA1);
+ const __m128i aaAL = _mm_unpacklo_epi8(aA, maA);
+ const __m128i ssAH = _mm_unpackhi_epi8(sA0, sA1);
+ const __m128i aaAH = _mm_unpackhi_epi8(aA, maA);
+
+ const __m128i blendAL = _mm_maddubs_epi16(ssAL, aaAL);
+ const __m128i blendAH = _mm_maddubs_epi16(ssAH, aaAH);
+
+ const __m128i roundAL = _mm_mulhrs_epi16(blendAL, round_offset);
+ const __m128i roundAH = _mm_mulhrs_epi16(blendAH, round_offset);
+ _mm_store_si128((__m128i *)dst, _mm_packus_epi16(roundAL, roundAH));
+}
+
+static INLINE void comp_mask_pred_8_ssse3(uint8_t *comp_pred, int height,
+ const uint8_t *src0, int stride0,
+ const uint8_t *src1, int stride1,
+ const uint8_t *mask,
+ int mask_stride) {
+ int i = 0;
+ const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i round_offset =
+ _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ do {
+ // odd line A
+ const __m128i sA0 = _mm_loadl_epi64((const __m128i *)(src0));
+ const __m128i sA1 = _mm_loadl_epi64((const __m128i *)(src1));
+ const __m128i aA = _mm_loadl_epi64((const __m128i *)(mask));
+ // even line B
+ const __m128i sB0 = _mm_loadl_epi64((const __m128i *)(src0 + stride0));
+ const __m128i sB1 = _mm_loadl_epi64((const __m128i *)(src1 + stride1));
+ const __m128i a = _mm_castps_si128(_mm_loadh_pi(
+ _mm_castsi128_ps(aA), (const __m64 *)(mask + mask_stride)));
+
+ const __m128i ssA = _mm_unpacklo_epi8(sA0, sA1);
+ const __m128i ssB = _mm_unpacklo_epi8(sB0, sB1);
+
+ const __m128i ma = _mm_sub_epi8(alpha_max, a);
+ const __m128i aaA = _mm_unpacklo_epi8(a, ma);
+ const __m128i aaB = _mm_unpackhi_epi8(a, ma);
+
+ const __m128i blendA = _mm_maddubs_epi16(ssA, aaA);
+ const __m128i blendB = _mm_maddubs_epi16(ssB, aaB);
+ const __m128i roundA = _mm_mulhrs_epi16(blendA, round_offset);
+ const __m128i roundB = _mm_mulhrs_epi16(blendB, round_offset);
+ const __m128i round = _mm_packus_epi16(roundA, roundB);
+ // comp_pred's stride == width == 8
+ _mm_store_si128((__m128i *)(comp_pred), round);
+ comp_pred += (8 << 1);
+ src0 += (stride0 << 1);
+ src1 += (stride1 << 1);
+ mask += (mask_stride << 1);
+ i += 2;
+ } while (i < height);
+}
+
+#endif // AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
diff --git a/third_party/aom/aom_dsp/x86/mem_sse2.h b/third_party/aom/aom_dsp/x86/mem_sse2.h
new file mode 100644
index 0000000000..085a572cb1
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/mem_sse2.h
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_MEM_SSE2_H_
+#define AOM_AOM_DSP_X86_MEM_SSE2_H_
+
+#include <emmintrin.h> // SSE2
+#include <string.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+static INLINE int16_t loadu_int16(const void *src) {
+ int16_t v;
+ memcpy(&v, src, sizeof(v));
+ return v;
+}
+
+static INLINE int32_t loadu_int32(const void *src) {
+ int32_t v;
+ memcpy(&v, src, sizeof(v));
+ return v;
+}
+
+static INLINE int64_t loadu_int64(const void *src) {
+ int64_t v;
+ memcpy(&v, src, sizeof(v));
+ return v;
+}
+
+static INLINE void _mm_storeh_epi64(__m128i *const d, const __m128i s) {
+ _mm_storeh_pi((__m64 *)d, _mm_castsi128_ps(s));
+}
+
+static INLINE __m128i loadh_epi64(const void *const src, const __m128i s) {
+ return _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
+}
+
+static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src,
+ const int byte_stride) {
+ return _mm_setr_epi32(loadu_int32((int8_t *)src + 0 * byte_stride),
+ loadu_int32((int8_t *)src + 1 * byte_stride),
+ loadu_int32((int8_t *)src + 2 * byte_stride),
+ loadu_int32((int8_t *)src + 3 * byte_stride));
+}
+
+static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src,
+ const int byte_stride) {
+ __m128i dst;
+ dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride));
+ dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst);
+ return dst;
+}
+
+static INLINE void store_8bit_8x4_from_16x2(const __m128i *const s,
+ uint8_t *const d,
+ const ptrdiff_t stride) {
+ _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]);
+ _mm_storeh_epi64((__m128i *)(d + 1 * stride), s[0]);
+ _mm_storel_epi64((__m128i *)(d + 2 * stride), s[1]);
+ _mm_storeh_epi64((__m128i *)(d + 3 * stride), s[1]);
+}
+
+static INLINE void store_8bit_4x4(const __m128i *const s, uint8_t *const d,
+ const ptrdiff_t stride) {
+ *(int *)(d + 0 * stride) = _mm_cvtsi128_si32(s[0]);
+ *(int *)(d + 1 * stride) = _mm_cvtsi128_si32(s[1]);
+ *(int *)(d + 2 * stride) = _mm_cvtsi128_si32(s[2]);
+ *(int *)(d + 3 * stride) = _mm_cvtsi128_si32(s[3]);
+}
+
+static INLINE void store_8bit_4x4_sse2(const __m128i s, uint8_t *const d,
+ const ptrdiff_t stride) {
+ __m128i ss[4];
+
+ ss[0] = s;
+ ss[1] = _mm_srli_si128(s, 4);
+ ss[2] = _mm_srli_si128(s, 8);
+ ss[3] = _mm_srli_si128(s, 12);
+ store_8bit_4x4(ss, d, stride);
+}
+
+static INLINE void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride,
+ __m128i *const d) {
+ d[0] = _mm_cvtsi32_si128(*(const int *)(s + 0 * stride));
+ d[1] = _mm_cvtsi32_si128(*(const int *)(s + 1 * stride));
+ d[2] = _mm_cvtsi32_si128(*(const int *)(s + 2 * stride));
+ d[3] = _mm_cvtsi32_si128(*(const int *)(s + 3 * stride));
+}
+
+static INLINE void load_8bit_4x8(const uint8_t *const s, const ptrdiff_t stride,
+ __m128i *const d) {
+ load_8bit_4x4(s + 0 * stride, stride, &d[0]);
+ load_8bit_4x4(s + 4 * stride, stride, &d[4]);
+}
+
+static INLINE void load_8bit_8x4(const uint8_t *const s, const ptrdiff_t stride,
+ __m128i *const d) {
+ d[0] = _mm_loadl_epi64((const __m128i *)(s + 0 * stride));
+ d[1] = _mm_loadl_epi64((const __m128i *)(s + 1 * stride));
+ d[2] = _mm_loadl_epi64((const __m128i *)(s + 2 * stride));
+ d[3] = _mm_loadl_epi64((const __m128i *)(s + 3 * stride));
+}
+
+static INLINE void loadu_8bit_16x4(const uint8_t *const s,
+ const ptrdiff_t stride, __m128i *const d) {
+ d[0] = _mm_loadu_si128((const __m128i *)(s + 0 * stride));
+ d[1] = _mm_loadu_si128((const __m128i *)(s + 1 * stride));
+ d[2] = _mm_loadu_si128((const __m128i *)(s + 2 * stride));
+ d[3] = _mm_loadu_si128((const __m128i *)(s + 3 * stride));
+}
+
+static INLINE void load_8bit_8x8(const uint8_t *const s, const ptrdiff_t stride,
+ __m128i *const d) {
+ load_8bit_8x4(s + 0 * stride, stride, &d[0]);
+ load_8bit_8x4(s + 4 * stride, stride, &d[4]);
+}
+
+static INLINE void load_8bit_16x8(const uint8_t *const s,
+ const ptrdiff_t stride, __m128i *const d) {
+ d[0] = _mm_load_si128((const __m128i *)(s + 0 * stride));
+ d[1] = _mm_load_si128((const __m128i *)(s + 1 * stride));
+ d[2] = _mm_load_si128((const __m128i *)(s + 2 * stride));
+ d[3] = _mm_load_si128((const __m128i *)(s + 3 * stride));
+ d[4] = _mm_load_si128((const __m128i *)(s + 4 * stride));
+ d[5] = _mm_load_si128((const __m128i *)(s + 5 * stride));
+ d[6] = _mm_load_si128((const __m128i *)(s + 6 * stride));
+ d[7] = _mm_load_si128((const __m128i *)(s + 7 * stride));
+}
+
+static INLINE void loadu_8bit_16x8(const uint8_t *const s,
+ const ptrdiff_t stride, __m128i *const d) {
+ loadu_8bit_16x4(s + 0 * stride, stride, &d[0]);
+ loadu_8bit_16x4(s + 4 * stride, stride, &d[4]);
+}
+
+static INLINE void store_8bit_8x8(const __m128i *const s, uint8_t *const d,
+ const ptrdiff_t stride) {
+ _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]);
+ _mm_storel_epi64((__m128i *)(d + 1 * stride), s[1]);
+ _mm_storel_epi64((__m128i *)(d + 2 * stride), s[2]);
+ _mm_storel_epi64((__m128i *)(d + 3 * stride), s[3]);
+ _mm_storel_epi64((__m128i *)(d + 4 * stride), s[4]);
+ _mm_storel_epi64((__m128i *)(d + 5 * stride), s[5]);
+ _mm_storel_epi64((__m128i *)(d + 6 * stride), s[6]);
+ _mm_storel_epi64((__m128i *)(d + 7 * stride), s[7]);
+}
+
+static INLINE void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d,
+ const ptrdiff_t stride) {
+ _mm_storeu_si128((__m128i *)(d + 0 * stride), s[0]);
+ _mm_storeu_si128((__m128i *)(d + 1 * stride), s[1]);
+ _mm_storeu_si128((__m128i *)(d + 2 * stride), s[2]);
+ _mm_storeu_si128((__m128i *)(d + 3 * stride), s[3]);
+}
+
+#endif // AOM_AOM_DSP_X86_MEM_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h
new file mode 100644
index 0000000000..210f466b6f
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
+#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
+
+#include <smmintrin.h>
+
+#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
+
+static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ unsigned int *const sse, int *const sum,
+ const int h) {
+ const int pre_step = pre_stride - 4;
+ int n = 0;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_d = _mm_setzero_si128();
+
+ assert(IS_POWER_OF_TWO(h));
+
+ do {
+ const __m128i v_p_b = _mm_cvtsi32_si128(*(const int *)(pre + n));
+ const __m128i v_m_d = _mm_load_si128((const __m128i *)(mask + n));
+ const __m128i v_w_d = _mm_load_si128((const __m128i *)(wsrc + n));
+
+ const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+
+ const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+ const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
+ const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);
+
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
+ v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+ n += 4;
+
+ if (n % 4 == 0) pre += pre_step;
+ } while (n < 4 * h);
+
+ *sum = xx_hsum_epi32_si32(v_sum_d);
+ *sse = xx_hsum_epi32_si32(v_sse_d);
+}
+
+#endif // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
new file mode 100644
index 0000000000..27398ffd62
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
+#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+static INLINE int32_t xx_hsum_epi32_si32(__m128i v_d) {
+ v_d = _mm_hadd_epi32(v_d, v_d);
+ v_d = _mm_hadd_epi32(v_d, v_d);
+ return _mm_cvtsi128_si32(v_d);
+}
+
+static INLINE int64_t xx_hsum_epi64_si64(__m128i v_q) {
+ v_q = _mm_add_epi64(v_q, _mm_srli_si128(v_q, 8));
+#if AOM_ARCH_X86_64
+ return _mm_cvtsi128_si64(v_q);
+#else
+ {
+ int64_t tmp;
+ _mm_storel_epi64((__m128i *)&tmp, v_q);
+ return tmp;
+ }
+#endif
+}
+
+static INLINE int64_t xx_hsum_epi32_si64(__m128i v_d) {
+ const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128());
+ const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d);
+ const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d);
+ return xx_hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q));
+}
+
+// This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits)
+static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) {
+ const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31);
+ const __m128i v_tmp_d =
+ _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d);
+ return _mm_srai_epi32(v_tmp_d, bits);
+}
+
+#endif // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c b/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c
new file mode 100644
index 0000000000..9d1b7d4968
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
+#include "aom_dsp/x86/synonyms.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+static INLINE unsigned int obmc_sad_w4_avx2(const uint8_t *pre,
+ const int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ const int height) {
+ int n = 0;
+ __m256i v_sad_d = _mm256_setzero_si256();
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+
+ do {
+ const __m128i v_p_b_0 = xx_loadl_32(pre);
+ const __m128i v_p_b_1 = xx_loadl_32(pre + pre_stride);
+ const __m128i v_p_b = _mm_unpacklo_epi32(v_p_b_0, v_p_b_1);
+ const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n));
+ const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
+
+ const __m256i v_p_d = _mm256_cvtepu8_epi32(v_p_b);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d);
+
+ const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d);
+ const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d);
+
+ // Rounded absolute difference
+ const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d);
+ const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12);
+
+ v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d);
+
+ n += 8;
+ pre += pre_stride << 1;
+ } while (n < 8 * (height >> 1));
+
+ __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
+ __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
+ v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
+ return xx_hsum_epi32_si32(v_sad_d_0);
+}
+
+static INLINE unsigned int obmc_sad_w8n_avx2(
+ const uint8_t *pre, const int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, const int width, const int height) {
+ const int pre_step = pre_stride - width;
+ int n = 0;
+ __m256i v_sad_d = _mm256_setzero_si256();
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+ assert(width >= 8);
+ assert(IS_POWER_OF_TWO(width));
+
+ do {
+ const __m128i v_p0_b = xx_loadl_64(pre + n);
+ const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n));
+ const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
+
+ const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p0_b);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
+
+ const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
+ const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d);
+
+ // Rounded absolute difference
+ const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d);
+ const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12);
+
+ v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d);
+
+ n += 8;
+
+ if ((n & (width - 1)) == 0) pre += pre_step;
+ } while (n < width * height);
+
+ __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
+ __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
+ v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
+ return xx_hsum_epi32_si32(v_sad_d_0);
+}
+
+#define OBMCSADWXH(w, h) \
+ unsigned int aom_obmc_sad##w##x##h##_avx2( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *msk) { \
+ if (w == 4) { \
+ return obmc_sad_w4_avx2(pre, pre_stride, wsrc, msk, h); \
+ } else { \
+ return obmc_sad_w8n_avx2(pre, pre_stride, wsrc, msk, w, h); \
+ } \
+ }
+
+OBMCSADWXH(128, 128)
+OBMCSADWXH(128, 64)
+OBMCSADWXH(64, 128)
+OBMCSADWXH(64, 64)
+OBMCSADWXH(64, 32)
+OBMCSADWXH(32, 64)
+OBMCSADWXH(32, 32)
+OBMCSADWXH(32, 16)
+OBMCSADWXH(16, 32)
+OBMCSADWXH(16, 16)
+OBMCSADWXH(16, 8)
+OBMCSADWXH(8, 16)
+OBMCSADWXH(8, 8)
+OBMCSADWXH(8, 4)
+OBMCSADWXH(4, 8)
+OBMCSADWXH(4, 4)
+OBMCSADWXH(4, 16)
+OBMCSADWXH(16, 4)
+OBMCSADWXH(8, 32)
+OBMCSADWXH(32, 8)
+OBMCSADWXH(16, 64)
+OBMCSADWXH(64, 16)
+
+////////////////////////////////////////////////////////////////////////////////
+// High bit-depth
+////////////////////////////////////////////////////////////////////////////////
+
+static INLINE unsigned int hbd_obmc_sad_w4_avx2(const uint8_t *pre8,
+ const int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ const int height) {
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ int n = 0;
+ __m256i v_sad_d = _mm256_setzero_si256();
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+ do {
+ const __m128i v_p_w_0 = xx_loadl_64(pre);
+ const __m128i v_p_w_1 = xx_loadl_64(pre + pre_stride);
+ const __m128i v_p_w = _mm_unpacklo_epi64(v_p_w_0, v_p_w_1);
+ const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n));
+ const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
+
+ const __m256i v_p_d = _mm256_cvtepu16_epi32(v_p_w);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d);
+
+ const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d);
+ const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d);
+
+ // Rounded absolute difference
+ const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d);
+ const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12);
+
+ v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d);
+
+ n += 8;
+
+ pre += pre_stride << 1;
+ } while (n < 8 * (height >> 1));
+
+ __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
+ __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
+ v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
+ return xx_hsum_epi32_si32(v_sad_d_0);
+}
+
+static INLINE unsigned int hbd_obmc_sad_w8n_avx2(
+ const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, const int width, const int height) {
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ const int pre_step = pre_stride - width;
+ int n = 0;
+ __m256i v_sad_d = _mm256_setzero_si256();
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+
+ assert(width >= 8);
+ assert(IS_POWER_OF_TWO(width));
+
+ do {
+ const __m128i v_p0_w = _mm_lddqu_si128((__m128i *)(pre + n));
+ const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n));
+ const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
+
+ const __m256i v_p0_d = _mm256_cvtepu16_epi32(v_p0_w);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
+
+ const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
+ const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d);
+
+ // Rounded absolute difference
+ const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d);
+ const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12);
+
+ v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d);
+
+ n += 8;
+
+ if (n % width == 0) pre += pre_step;
+ } while (n < width * height);
+
+ __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
+ __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
+ v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
+ return xx_hsum_epi32_si32(v_sad_d_0);
+}
+
+#define HBD_OBMCSADWXH(w, h) \
+ unsigned int aom_highbd_obmc_sad##w##x##h##_avx2( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask) { \
+ if (w == 4) { \
+ return hbd_obmc_sad_w4_avx2(pre, pre_stride, wsrc, mask, h); \
+ } else { \
+ return hbd_obmc_sad_w8n_avx2(pre, pre_stride, wsrc, mask, w, h); \
+ } \
+ }
+
+HBD_OBMCSADWXH(128, 128)
+HBD_OBMCSADWXH(128, 64)
+HBD_OBMCSADWXH(64, 128)
+HBD_OBMCSADWXH(64, 64)
+HBD_OBMCSADWXH(64, 32)
+HBD_OBMCSADWXH(32, 64)
+HBD_OBMCSADWXH(32, 32)
+HBD_OBMCSADWXH(32, 16)
+HBD_OBMCSADWXH(16, 32)
+HBD_OBMCSADWXH(16, 16)
+HBD_OBMCSADWXH(16, 8)
+HBD_OBMCSADWXH(8, 16)
+HBD_OBMCSADWXH(8, 8)
+HBD_OBMCSADWXH(8, 4)
+HBD_OBMCSADWXH(4, 8)
+HBD_OBMCSADWXH(4, 4)
+HBD_OBMCSADWXH(4, 16)
+HBD_OBMCSADWXH(16, 4)
+HBD_OBMCSADWXH(8, 32)
+HBD_OBMCSADWXH(32, 8)
+HBD_OBMCSADWXH(16, 64)
+HBD_OBMCSADWXH(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c
new file mode 100644
index 0000000000..542572c761
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
+#include "aom_dsp/x86/synonyms.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+static AOM_FORCE_INLINE unsigned int obmc_sad_w4(const uint8_t *pre,
+ const int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ const int height) {
+ const int pre_step = pre_stride - 4;
+ int n = 0;
+ __m128i v_sad_d = _mm_setzero_si128();
+
+ do {
+ const __m128i v_p_b = xx_loadl_32(pre + n);
+ const __m128i v_m_d = xx_load_128(mask + n);
+ const __m128i v_w_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+
+ const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+ const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
+
+ // Rounded absolute difference
+ const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);
+
+ v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
+
+ n += 4;
+
+ if (n % 4 == 0) pre += pre_step;
+ } while (n < 4 * height);
+
+ return xx_hsum_epi32_si32(v_sad_d);
+}
+
+static AOM_FORCE_INLINE unsigned int obmc_sad_w8n(
+ const uint8_t *pre, const int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, const int width, const int height) {
+ const int pre_step = pre_stride - width;
+ int n = 0;
+ __m128i v_sad_d = _mm_setzero_si128();
+
+ assert(width >= 8);
+ assert(IS_POWER_OF_TWO(width));
+
+ do {
+ const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
+ const __m128i v_m1_d = xx_load_128(mask + n + 4);
+ const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+ const __m128i v_p0_b = xx_loadl_32(pre + n);
+ const __m128i v_m0_d = xx_load_128(mask + n);
+ const __m128i v_w0_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
+ const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
+ const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
+
+ const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
+ const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
+ const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
+ const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
+
+ // Rounded absolute difference
+ const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
+ const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);
+
+ v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
+ v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);
+
+ n += 8;
+
+ if (n % width == 0) pre += pre_step;
+ } while (n < width * height);
+
+ return xx_hsum_epi32_si32(v_sad_d);
+}
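+
+// The indexing above, written out as the scalar loops it replaces: wsrc and
+// mask are contiguous width * height arrays indexed by n, while pre is
+// strided, so pre_step re-aligns it at the end of each row. A sketch, with
+// obmc_sad_scalar_sketch as an illustrative (non-aom) name:
+//
+//   unsigned int obmc_sad_scalar_sketch(const uint8_t *pre, int pre_stride,
+//                                       const int32_t *wsrc,
+//                                       const int32_t *mask, int width,
+//                                       int height) {
+//     unsigned int sad = 0;
+//     for (int r = 0; r < height; r++) {
+//       for (int c = 0; c < width; c++) {
+//         const int diff = wsrc[r * width + c] - pre[c] * mask[r * width + c];
+//         sad += (abs(diff) + (1 << 11)) >> 12;
+//       }
+//       pre += pre_stride;
+//     }
+//     return sad;
+//   }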
+
+#define OBMCSADWXH(w, h) \
+ unsigned int aom_obmc_sad##w##x##h##_sse4_1( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *msk) { \
+ if (w == 4) { \
+ return obmc_sad_w4(pre, pre_stride, wsrc, msk, h); \
+ } else { \
+ return obmc_sad_w8n(pre, pre_stride, wsrc, msk, w, h); \
+ } \
+ }
+
+OBMCSADWXH(128, 128)
+OBMCSADWXH(128, 64)
+OBMCSADWXH(64, 128)
+OBMCSADWXH(64, 64)
+OBMCSADWXH(64, 32)
+OBMCSADWXH(32, 64)
+OBMCSADWXH(32, 32)
+OBMCSADWXH(32, 16)
+OBMCSADWXH(16, 32)
+OBMCSADWXH(16, 16)
+OBMCSADWXH(16, 8)
+OBMCSADWXH(8, 16)
+OBMCSADWXH(8, 8)
+OBMCSADWXH(8, 4)
+OBMCSADWXH(4, 8)
+OBMCSADWXH(4, 4)
+OBMCSADWXH(4, 16)
+OBMCSADWXH(16, 4)
+OBMCSADWXH(8, 32)
+OBMCSADWXH(32, 8)
+OBMCSADWXH(16, 64)
+OBMCSADWXH(64, 16)
+
+////////////////////////////////////////////////////////////////////////////////
+// High bit-depth
+////////////////////////////////////////////////////////////////////////////////
+
+static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8,
+ const int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ const int height) {
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ const int pre_step = pre_stride - 4;
+ int n = 0;
+ __m128i v_sad_d = _mm_setzero_si128();
+
+ do {
+ const __m128i v_p_w = xx_loadl_64(pre + n);
+ const __m128i v_m_d = xx_load_128(mask + n);
+ const __m128i v_w_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+
+ const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+ const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
+
+ // Rounded absolute difference
+ const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);
+
+ v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
+
+ n += 4;
+
+ if (n % 4 == 0) pre += pre_step;
+ } while (n < 4 * height);
+
+ return xx_hsum_epi32_si32(v_sad_d);
+}
+
+static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w8n(
+ const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, const int width, const int height) {
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ const int pre_step = pre_stride - width;
+ int n = 0;
+ __m128i v_sad_d = _mm_setzero_si128();
+
+ assert(width >= 8);
+ assert(IS_POWER_OF_TWO(width));
+
+ do {
+ const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
+ const __m128i v_m1_d = xx_load_128(mask + n + 4);
+ const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+ const __m128i v_p0_w = xx_loadl_64(pre + n);
+ const __m128i v_m0_d = xx_load_128(mask + n);
+ const __m128i v_w0_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
+ const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
+ const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
+
+ const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
+ const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
+ const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
+ const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
+
+ // Rounded absolute difference
+ const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
+ const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);
+
+ v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
+ v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);
+
+ n += 8;
+
+ if (n % width == 0) pre += pre_step;
+ } while (n < width * height);
+
+ return xx_hsum_epi32_si32(v_sad_d);
+}
+
+#define HBD_OBMCSADWXH(w, h) \
+ unsigned int aom_highbd_obmc_sad##w##x##h##_sse4_1( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask) { \
+ if (w == 4) { \
+ return hbd_obmc_sad_w4(pre, pre_stride, wsrc, mask, h); \
+ } else { \
+ return hbd_obmc_sad_w8n(pre, pre_stride, wsrc, mask, w, h); \
+ } \
+ }
+
+HBD_OBMCSADWXH(128, 128)
+HBD_OBMCSADWXH(128, 64)
+HBD_OBMCSADWXH(64, 128)
+HBD_OBMCSADWXH(64, 64)
+HBD_OBMCSADWXH(64, 32)
+HBD_OBMCSADWXH(32, 64)
+HBD_OBMCSADWXH(32, 32)
+HBD_OBMCSADWXH(32, 16)
+HBD_OBMCSADWXH(16, 32)
+HBD_OBMCSADWXH(16, 16)
+HBD_OBMCSADWXH(16, 8)
+HBD_OBMCSADWXH(8, 16)
+HBD_OBMCSADWXH(8, 8)
+HBD_OBMCSADWXH(8, 4)
+HBD_OBMCSADWXH(4, 8)
+HBD_OBMCSADWXH(4, 4)
+HBD_OBMCSADWXH(4, 16)
+HBD_OBMCSADWXH(16, 4)
+HBD_OBMCSADWXH(8, 32)
+HBD_OBMCSADWXH(32, 8)
+HBD_OBMCSADWXH(16, 64)
+HBD_OBMCSADWXH(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c b/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c
new file mode 100644
index 0000000000..c23d8c4eb0
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ unsigned int *const sse, int *const sum,
+ const int w, const int h) {
+ int n = 0, width, height = h;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_d = _mm_setzero_si128();
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+ __m128i v_d;
+ const uint8_t *pre_temp;
+ assert(w >= 8);
+ assert(IS_POWER_OF_TWO(w));
+ assert(IS_POWER_OF_TWO(h));
+ do {
+ width = w;
+ pre_temp = pre;
+ do {
+ const __m128i v_p_b = _mm_loadl_epi64((const __m128i *)pre_temp);
+ const __m256i v_m_d = _mm256_loadu_si256((__m256i const *)(mask + n));
+ const __m256i v_w_d = _mm256_loadu_si256((__m256i const *)(wsrc + n));
+ const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m256i v_pm_d = _mm256_madd_epi16(v_p0_d, v_m_d);
+ const __m256i v_diff0_d = _mm256_sub_epi32(v_w_d, v_pm_d);
+
+ const __m256i v_sign_d = _mm256_srai_epi32(v_diff0_d, 31);
+ const __m256i v_tmp_d =
+ _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign_d);
+ const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp_d, 12);
+ const __m128i v_rdiff_d = _mm256_castsi256_si128(v_rdiff0_d);
+ const __m128i v_rdiff1_d = _mm256_extracti128_si256(v_rdiff0_d, 1);
+
+ const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff_d, v_rdiff1_d);
+ const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
+
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
+ v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+ pre_temp += 8;
+ n += 8;
+ width -= 8;
+ } while (width > 0);
+ pre += pre_stride;
+ height -= 1;
+ } while (height > 0);
+ v_d = _mm_hadd_epi32(v_sum_d, v_sse_d);
+ v_d = _mm_hadd_epi32(v_d, v_d);
+ *sum = _mm_cvtsi128_si32(v_d);
+ *sse = (unsigned int)_mm_cvtsi128_si32(_mm_srli_si128(v_d, 4));
+}
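+
+// The v_sign_d term above makes the arithmetic shift round negative
+// differences the same way as positive ones. Per 32-bit lane this computes
+//
+//   rdiff = diff >= 0 ? (diff + 2048) >> 12 : -((-diff + 2048) >> 12);
+//
+// i.e. a symmetric round-half-away-from-zero, whereas a plain
+// (diff + 2048) >> 12 would bias negative values downwards. The rounded
+// differences feed both the running sum and, after packing to 16 bits and
+// squaring with _mm_madd_epi16, the running sse.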
+
+static INLINE void obmc_variance_w16n(const uint8_t *pre, const int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ unsigned int *const sse, int *const sum,
+ const int w, const int h) {
+ int n = 0, width, height = h;
+ __m256i v_d;
+ __m128i res0;
+ const uint8_t *pre_temp;
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+ __m256i v_sum_d = _mm256_setzero_si256();
+ __m256i v_sse_d = _mm256_setzero_si256();
+
+ assert(w >= 16);
+ assert(IS_POWER_OF_TWO(w));
+ assert(IS_POWER_OF_TWO(h));
+ do {
+ width = w;
+ pre_temp = pre;
+ do {
+ const __m128i v_p_b = _mm_loadu_si128((__m128i *)pre_temp);
+ const __m256i v_m0_d = _mm256_loadu_si256((__m256i const *)(mask + n));
+ const __m256i v_w0_d = _mm256_loadu_si256((__m256i const *)(wsrc + n));
+ const __m256i v_m1_d =
+ _mm256_loadu_si256((__m256i const *)(mask + n + 8));
+ const __m256i v_w1_d =
+ _mm256_loadu_si256((__m256i const *)(wsrc + n + 8));
+
+ const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b);
+ const __m256i v_p1_d = _mm256_cvtepu8_epi32(_mm_srli_si128(v_p_b, 8));
+
+ const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
+ const __m256i v_pm1_d = _mm256_madd_epi16(v_p1_d, v_m1_d);
+
+ const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
+ const __m256i v_diff1_d = _mm256_sub_epi32(v_w1_d, v_pm1_d);
+
+ const __m256i v_sign0_d = _mm256_srai_epi32(v_diff0_d, 31);
+ const __m256i v_sign1_d = _mm256_srai_epi32(v_diff1_d, 31);
+
+ const __m256i v_tmp0_d =
+ _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign0_d);
+ const __m256i v_tmp1_d =
+ _mm256_add_epi32(_mm256_add_epi32(v_diff1_d, v_bias_d), v_sign1_d);
+
+ const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp0_d, 12);
+ const __m256i v_rdiff2_d = _mm256_srai_epi32(v_tmp1_d, 12);
+
+ const __m256i v_rdiff1_d = _mm256_add_epi32(v_rdiff0_d, v_rdiff2_d);
+ const __m256i v_rdiff01_w = _mm256_packs_epi32(v_rdiff0_d, v_rdiff2_d);
+ const __m256i v_sqrdiff_d = _mm256_madd_epi16(v_rdiff01_w, v_rdiff01_w);
+
+ v_sum_d = _mm256_add_epi32(v_sum_d, v_rdiff1_d);
+ v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff_d);
+
+ pre_temp += 16;
+ n += 16;
+ width -= 16;
+ } while (width > 0);
+ pre += pre_stride;
+ height -= 1;
+ } while (height > 0);
+
+ v_d = _mm256_hadd_epi32(v_sum_d, v_sse_d);
+ v_d = _mm256_hadd_epi32(v_d, v_d);
+ res0 = _mm256_castsi256_si128(v_d);
+ res0 = _mm_add_epi32(res0, _mm256_extractf128_si256(v_d, 1));
+ *sum = _mm_cvtsi128_si32(res0);
+ *sse = (unsigned int)_mm_cvtsi128_si32(_mm_srli_si128(res0, 4));
+}
+
+#define OBMCVARWXH(W, H) \
+ unsigned int aom_obmc_variance##W##x##H##_avx2( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ if (W == 4) { \
+ obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H); \
+ } else if (W == 8) { \
+ obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
+ } else { \
+ obmc_variance_w16n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
+ } \
+ \
+ return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
+ }
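+
+// The wrapper above finishes the calculation: with N = W * H and sum / sse
+// the accumulated rounded differences and their squares,
+//
+//   variance = sse - (int64_t)sum * sum / N
+//
+// which is the sum of squared deviations from the block mean. The cast keeps
+// sum * sum from overflowing for the larger block sizes.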
+
+OBMCVARWXH(128, 128)
+OBMCVARWXH(128, 64)
+OBMCVARWXH(64, 128)
+OBMCVARWXH(64, 64)
+OBMCVARWXH(64, 32)
+OBMCVARWXH(32, 64)
+OBMCVARWXH(32, 32)
+OBMCVARWXH(32, 16)
+OBMCVARWXH(16, 32)
+OBMCVARWXH(16, 16)
+OBMCVARWXH(16, 8)
+OBMCVARWXH(8, 16)
+OBMCVARWXH(8, 8)
+OBMCVARWXH(8, 4)
+OBMCVARWXH(4, 8)
+OBMCVARWXH(4, 4)
+OBMCVARWXH(4, 16)
+OBMCVARWXH(16, 4)
+OBMCVARWXH(8, 32)
+OBMCVARWXH(32, 8)
+OBMCVARWXH(16, 64)
+OBMCVARWXH(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
new file mode 100644
index 0000000000..89b050eb20
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
@@ -0,0 +1,382 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
+#include "aom_dsp/x86/synonyms.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+void aom_var_filter_block2d_bil_first_pass_ssse3(
+ const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
+ unsigned int pixel_step, unsigned int output_height,
+ unsigned int output_width, const uint8_t *filter);
+
+void aom_var_filter_block2d_bil_second_pass_ssse3(
+ const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
+ unsigned int pixel_step, unsigned int output_height,
+ unsigned int output_width, const uint8_t *filter);
+
+static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ unsigned int *const sse, int *const sum,
+ const int w, const int h) {
+ const int pre_step = pre_stride - w;
+ int n = 0;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_d = _mm_setzero_si128();
+
+ assert(w >= 8);
+ assert(IS_POWER_OF_TWO(w));
+ assert(IS_POWER_OF_TWO(h));
+
+ do {
+ const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
+ const __m128i v_m1_d = xx_load_128(mask + n + 4);
+ const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+ const __m128i v_p0_b = xx_loadl_32(pre + n);
+ const __m128i v_m0_d = xx_load_128(mask + n);
+ const __m128i v_w0_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
+ const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
+ const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
+
+ const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
+ const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
+
+ const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
+ const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
+ const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
+ const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
+
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
+ v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+ n += 8;
+
+ if (n % w == 0) pre += pre_step;
+ } while (n < w * h);
+
+ *sum = xx_hsum_epi32_si32(v_sum_d);
+ *sse = xx_hsum_epi32_si32(v_sse_d);
+}
+
+#define OBMCVARWXH(W, H) \
+ unsigned int aom_obmc_variance##W##x##H##_sse4_1( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ if (W == 4) { \
+ obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H); \
+ } else { \
+ obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
+ } \
+ return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
+ }
+
+OBMCVARWXH(128, 128)
+OBMCVARWXH(128, 64)
+OBMCVARWXH(64, 128)
+OBMCVARWXH(64, 64)
+OBMCVARWXH(64, 32)
+OBMCVARWXH(32, 64)
+OBMCVARWXH(32, 32)
+OBMCVARWXH(32, 16)
+OBMCVARWXH(16, 32)
+OBMCVARWXH(16, 16)
+OBMCVARWXH(16, 8)
+OBMCVARWXH(8, 16)
+OBMCVARWXH(8, 8)
+OBMCVARWXH(8, 4)
+OBMCVARWXH(4, 8)
+OBMCVARWXH(4, 4)
+OBMCVARWXH(4, 16)
+OBMCVARWXH(16, 4)
+OBMCVARWXH(8, 32)
+OBMCVARWXH(32, 8)
+OBMCVARWXH(16, 64)
+OBMCVARWXH(64, 16)
+
+#include "config/aom_dsp_rtcd.h"
+
+#define OBMC_SUBPIX_VAR(W, H) \
+ uint32_t aom_obmc_sub_pixel_variance##W##x##H##_sse4_1( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ \
+ aom_var_filter_block2d_bil_first_pass_ssse3( \
+ pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_var_filter_block2d_bil_second_pass_ssse3( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_obmc_variance##W##x##H##_sse4_1(temp2, W, wsrc, mask, sse); \
+ }
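+
+// The sub-pel wrapper above builds the shifted prediction in two separable
+// passes before reusing the full-pel kernel: the first pass filters H + 1
+// rows horizontally (pixel_step 1) into fdata3, the second filters them
+// vertically (pixel_step W) into temp2. Each pass is a 2-tap bilinear
+// filter; a scalar sketch of one output sample, assuming the standard form
+// of the ssse3 helpers declared above:
+//
+//   out = ROUND_POWER_OF_TWO(src[0] * filter[0] + src[pixel_step] * filter[1],
+//                            FILTER_BITS);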
+
+OBMC_SUBPIX_VAR(128, 128)
+OBMC_SUBPIX_VAR(128, 64)
+OBMC_SUBPIX_VAR(64, 128)
+OBMC_SUBPIX_VAR(64, 64)
+OBMC_SUBPIX_VAR(64, 32)
+OBMC_SUBPIX_VAR(32, 64)
+OBMC_SUBPIX_VAR(32, 32)
+OBMC_SUBPIX_VAR(32, 16)
+OBMC_SUBPIX_VAR(16, 32)
+OBMC_SUBPIX_VAR(16, 16)
+OBMC_SUBPIX_VAR(16, 8)
+OBMC_SUBPIX_VAR(8, 16)
+OBMC_SUBPIX_VAR(8, 8)
+OBMC_SUBPIX_VAR(8, 4)
+OBMC_SUBPIX_VAR(4, 8)
+OBMC_SUBPIX_VAR(4, 4)
+OBMC_SUBPIX_VAR(4, 16)
+OBMC_SUBPIX_VAR(16, 4)
+OBMC_SUBPIX_VAR(8, 32)
+OBMC_SUBPIX_VAR(32, 8)
+OBMC_SUBPIX_VAR(16, 64)
+OBMC_SUBPIX_VAR(64, 16)
+
+////////////////////////////////////////////////////////////////////////////////
+// High bit-depth
+////////////////////////////////////////////////////////////////////////////////
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void hbd_obmc_variance_w4(
+ const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int h) {
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ const int pre_step = pre_stride - 4;
+ int n = 0;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_d = _mm_setzero_si128();
+
+ assert(IS_POWER_OF_TWO(h));
+
+ do {
+ const __m128i v_p_w = xx_loadl_64(pre + n);
+ const __m128i v_m_d = xx_load_128(mask + n);
+ const __m128i v_w_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+
+ const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+ const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
+ const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);
+
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
+ v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+ n += 4;
+
+ if (n % 4 == 0) pre += pre_step;
+ } while (n < 4 * h);
+
+ *sum = xx_hsum_epi32_si32(v_sum_d);
+ *sse = xx_hsum_epi32_si32(v_sse_d);
+}
+
+static INLINE void hbd_obmc_variance_w8n(
+ const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int w,
+ const int h) {
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ const int pre_step = pre_stride - w;
+ int n = 0;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_d = _mm_setzero_si128();
+
+ assert(w >= 8);
+ assert(IS_POWER_OF_TWO(w));
+ assert(IS_POWER_OF_TWO(h));
+
+ do {
+ const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
+ const __m128i v_m1_d = xx_load_128(mask + n + 4);
+ const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+ const __m128i v_p0_w = xx_loadl_64(pre + n);
+ const __m128i v_m0_d = xx_load_128(mask + n);
+ const __m128i v_w0_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
+ const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
+ const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
+
+ const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
+ const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
+
+ const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
+ const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
+ const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
+ const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
+
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
+ v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+ n += 8;
+
+ if (n % w == 0) pre += pre_step;
+ } while (n < w * h);
+
+ *sum += xx_hsum_epi32_si64(v_sum_d);
+ *sse += xx_hsum_epi32_si64(v_sse_d);
+}
+
+static INLINE void highbd_8_obmc_variance(const uint8_t *pre8, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int w, int h,
+ unsigned int *sse, int *sum) {
+ int64_t sum64 = 0;
+ uint64_t sse64 = 0;
+ if (w == 4) {
+ hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
+ } else {
+ hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
+ }
+ *sum = (int)sum64;
+ *sse = (unsigned int)sse64;
+}
+
+static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int w, int h,
+ unsigned int *sse, int *sum) {
+ int64_t sum64 = 0;
+ uint64_t sse64 = 0;
+ if (w == 4) {
+ hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
+ } else if (w < 128 || h < 128) {
+ hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
+ } else {
+ assert(w == 128 && h == 128);
+
+ do {
+ hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w,
+ 64);
+ pre8 += 64 * pre_stride;
+ wsrc += 64 * w;
+ mask += 64 * w;
+ h -= 64;
+ } while (h > 0);
+ }
+ *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
+}
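+
+// 10-bit statistics are renormalised to the 8-bit scale before the variance
+// is formed: ROUND_POWER_OF_TWO(sum64, 2) and ROUND_POWER_OF_TWO(sse64, 4)
+// divide out the two extra bits of source precision. The 128x128 case is fed
+// to the helper 64 rows at a time, accumulating into the 64-bit sum64/sse64
+// totals, presumably to keep the helper's 32-bit SIMD accumulators in range.
+// The HBD_OBMCVARWXH wrapper below then computes, per block,
+//
+//   variance = sse - (int64_t)sum * sum / (w * h)
+//
+// clamped to zero if the subtraction goes negative.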
+
+static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int w, int h,
+ unsigned int *sse, int *sum) {
+ int64_t sum64 = 0;
+ uint64_t sse64 = 0;
+ int max_pel_allowed_per_ovf = 512;
+ if (w == 4) {
+ hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
+ } else if (w * h <= max_pel_allowed_per_ovf) {
+ hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
+ } else {
+ int h_per_ovf = max_pel_allowed_per_ovf / w;
+
+ assert(max_pel_allowed_per_ovf % w == 0);
+ do {
+ hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w,
+ h_per_ovf);
+ pre8 += h_per_ovf * pre_stride;
+ wsrc += h_per_ovf * w;
+ mask += h_per_ovf * w;
+ h -= h_per_ovf;
+ } while (h > 0);
+ }
+ *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
+}
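+
+// For 12-bit input each call into the helper is capped at
+// max_pel_allowed_per_ovf (512) pixels, presumably so the 32-bit SIMD
+// accumulators are widened to sum64/sse64 by xx_hsum_epi32_si64() before
+// they can overflow. Larger blocks are processed in strips of
+//
+//   h_per_ovf = 512 / w;  // e.g. w == 64 gives 8 rows per call
+//
+// and the final ROUND_POWER_OF_TWO(sum64, 4) / ROUND_POWER_OF_TWO(sse64, 8)
+// renormalise the four extra bits of precision, mirroring the 10-bit path.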
+
+#define HBD_OBMCVARWXH(W, H) \
+ unsigned int aom_highbd_8_obmc_variance##W##x##H##_sse4_1( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ highbd_8_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
+ } \
+ \
+ unsigned int aom_highbd_10_obmc_variance##W##x##H##_sse4_1( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ unsigned int aom_highbd_12_obmc_variance##W##x##H##_sse4_1( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+HBD_OBMCVARWXH(128, 128)
+HBD_OBMCVARWXH(128, 64)
+HBD_OBMCVARWXH(64, 128)
+HBD_OBMCVARWXH(64, 64)
+HBD_OBMCVARWXH(64, 32)
+HBD_OBMCVARWXH(32, 64)
+HBD_OBMCVARWXH(32, 32)
+HBD_OBMCVARWXH(32, 16)
+HBD_OBMCVARWXH(16, 32)
+HBD_OBMCVARWXH(16, 16)
+HBD_OBMCVARWXH(16, 8)
+HBD_OBMCVARWXH(8, 16)
+HBD_OBMCVARWXH(8, 8)
+HBD_OBMCVARWXH(8, 4)
+HBD_OBMCVARWXH(4, 8)
+HBD_OBMCVARWXH(4, 4)
+HBD_OBMCVARWXH(4, 16)
+HBD_OBMCVARWXH(16, 4)
+HBD_OBMCVARWXH(8, 32)
+HBD_OBMCVARWXH(32, 8)
+HBD_OBMCVARWXH(16, 64)
+HBD_OBMCVARWXH(64, 16)
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/quantize_avx2.c b/third_party/aom/aom_dsp/x86/quantize_avx2.c
new file mode 100644
index 0000000000..b808d46778
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/quantize_avx2.c
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+static INLINE void load_b_values_avx2(const int16_t *zbin_ptr, __m256i *zbin,
+ const int16_t *round_ptr, __m256i *round,
+ const int16_t *quant_ptr, __m256i *quant,
+ const int16_t *dequant_ptr,
+ __m256i *dequant,
+ const int16_t *shift_ptr, __m256i *shift,
+ int log_scale) {
+ *zbin = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)zbin_ptr));
+ *zbin = _mm256_permute4x64_epi64(*zbin, 0x54);
+ if (log_scale > 0) {
+ const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1)));
+ *zbin = _mm256_add_epi16(*zbin, rnd);
+ *zbin = _mm256_srai_epi16(*zbin, log_scale);
+ }
+  // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when
+  // calculating the zbin mask. (See quantize_b_logscale0_16() and
+  // quantize_b_logscale_16().)
+ *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1));
+
+ *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr));
+ *round = _mm256_permute4x64_epi64(*round, 0x54);
+ if (log_scale > 0) {
+ const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1)));
+ *round = _mm256_add_epi16(*round, rnd);
+ *round = _mm256_srai_epi16(*round, log_scale);
+ }
+
+ *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr));
+ *quant = _mm256_permute4x64_epi64(*quant, 0x54);
+ *dequant =
+ _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr));
+ *dequant = _mm256_permute4x64_epi64(*dequant, 0x54);
+ *shift = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)shift_ptr));
+ *shift = _mm256_permute4x64_epi64(*shift, 0x54);
+}
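+
+// Why the "subtract 1" above removes an instruction: with zbin biased down
+// by one, a single signed greater-than implements the usual dead-zone test,
+//
+//   _mm256_cmpgt_epi16(abs_coeff, zbin - 1)  ==  (abs_coeff >= zbin)
+//
+// so the kernels below need only _mm256_cmpgt_epi16() rather than a
+// cmpgt/cmpeq pair.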
+
+static INLINE __m256i load_coefficients_avx2(const tran_low_t *coeff_ptr) {
+ const __m256i coeff1 = _mm256_load_si256((__m256i *)coeff_ptr);
+ const __m256i coeff2 = _mm256_load_si256((__m256i *)(coeff_ptr + 8));
+ return _mm256_packs_epi32(coeff1, coeff2);
+}
+
+static INLINE void store_coefficients_avx2(__m256i coeff_vals,
+ tran_low_t *coeff_ptr) {
+ __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15);
+ __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign);
+ __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign);
+ _mm256_store_si256((__m256i *)coeff_ptr, coeff_vals_lo);
+ _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi);
+}
+
+static AOM_FORCE_INLINE __m256i quantize_b_logscale0_16(
+ const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, __m256i *v_quant, __m256i *v_dequant,
+ __m256i *v_round, __m256i *v_zbin, __m256i *v_quant_shift) {
+ const __m256i v_coeff = load_coefficients_avx2(coeff_ptr);
+ const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff);
+ const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin);
+
+ if (_mm256_movemask_epi8(v_zbin_mask) == 0) {
+ _mm256_store_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256());
+ _mm256_store_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256());
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256());
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256());
+ return _mm256_setzero_si256();
+ }
+
+ // tmp = v_zbin_mask ? (int64_t)abs_coeff + log_scaled_round : 0
+ const __m256i v_tmp_rnd =
+ _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask);
+ // tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+ // quant_shift_ptr[rc != 0]) >>
+ // (16 - log_scale + AOM_QM_BITS));
+ const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant);
+ const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd);
+ const __m256i v_tmp32 = _mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift);
+ const __m256i v_nz_mask = _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256());
+ const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff);
+ const __m256i v_dqcoeff = _mm256_mullo_epi16(v_qcoeff, *v_dequant);
+ store_coefficients_avx2(v_qcoeff, qcoeff_ptr);
+ store_coefficients_avx2(v_dqcoeff, dqcoeff_ptr);
+ return v_nz_mask;
+}
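+
+// Per coefficient, and ignoring the 16-bit saturation of
+// _mm256_adds_epi16(), the 16-lane kernel above is equivalent to the scalar
+// sketch below, where [rc != 0] selects the DC or AC entry of each table:
+//
+//   const int abs_coeff = abs(coeff[rc]);
+//   if (abs_coeff >= zbin[rc != 0]) {
+//     const int tmp = abs_coeff + round[rc != 0];
+//     const int tmp32 =
+//         ((((tmp * quant[rc != 0]) >> 16) + tmp) * quant_shift[rc != 0]) >> 16;
+//     qcoeff[rc] = coeff[rc] < 0 ? -tmp32 : tmp32;
+//     dqcoeff[rc] = qcoeff[rc] * dequant[rc != 0];
+//   } else {
+//     qcoeff[rc] = dqcoeff[rc] = 0;
+//   }
+//
+// The early return stores a whole group of 16 zeros when no lane clears the
+// dead zone.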
+
+static INLINE __m256i get_max_lane_eob(const int16_t *iscan, __m256i v_eobmax,
+ __m256i v_mask) {
+ const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan);
+ const __m256i v_iscan_perm = _mm256_permute4x64_epi64(v_iscan, 0xD8);
+ const __m256i v_iscan_plus1 = _mm256_sub_epi16(v_iscan_perm, v_mask);
+ const __m256i v_nz_iscan = _mm256_and_si256(v_iscan_plus1, v_mask);
+ return _mm256_max_epi16(v_eobmax, v_nz_iscan);
+}
+
+static INLINE int16_t accumulate_eob256(__m256i eob256) {
+ const __m128i eob_lo = _mm256_castsi256_si128(eob256);
+ const __m128i eob_hi = _mm256_extractf128_si256(eob256, 1);
+ __m128i eob = _mm_max_epi16(eob_lo, eob_hi);
+ __m128i eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ return _mm_extract_epi16(eob, 1);
+}
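+
+// Together the two helpers above produce the end-of-block value: every
+// non-zero lane contributes iscan[i] + 1 (subtracting the all-ones mask adds
+// one), zero lanes contribute 0, and the horizontal maximum over all lanes is
+//
+//   eob = maximum of (iscan[i] + 1) over i with qcoeff[i] != 0, else 0,
+//
+// matching the usual "index of the last non-zero coefficient in scan order,
+// plus one" convention. The 0xD8 permute re-orders iscan so its lanes line
+// up with the lane-crossing _mm256_packs_epi32() layout used for the
+// coefficients.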
+
+void aom_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
+ (void)scan;
+ __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift;
+ __m256i v_eobmax = _mm256_setzero_si256();
+
+ load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr,
+ &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr,
+ &v_quant_shift, 0);
+
+ // Do DC and first 15 AC.
+ __m256i v_nz_mask =
+ quantize_b_logscale0_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant,
+ &v_dequant, &v_round, &v_zbin, &v_quant_shift);
+
+ v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask);
+
+ v_round = _mm256_unpackhi_epi64(v_round, v_round);
+ v_quant = _mm256_unpackhi_epi64(v_quant, v_quant);
+ v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant);
+ v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift);
+ v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin);
+
+ for (intptr_t count = n_coeffs - 16; count > 0; count -= 16) {
+ coeff_ptr += 16;
+ qcoeff_ptr += 16;
+ dqcoeff_ptr += 16;
+ iscan += 16;
+ v_nz_mask =
+ quantize_b_logscale0_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant,
+ &v_dequant, &v_round, &v_zbin, &v_quant_shift);
+
+ v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask);
+ }
+
+ *eob_ptr = accumulate_eob256(v_eobmax);
+}
+
+static AOM_FORCE_INLINE __m256i quantize_b_logscale_16(
+ const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, __m256i *v_quant, __m256i *v_dequant,
+ __m256i *v_round, __m256i *v_zbin, __m256i *v_quant_shift, int log_scale) {
+ const __m256i v_coeff = load_coefficients_avx2(coeff_ptr);
+ const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff);
+ const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin);
+
+ if (_mm256_movemask_epi8(v_zbin_mask) == 0) {
+ _mm256_store_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256());
+ _mm256_store_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256());
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256());
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256());
+ return _mm256_setzero_si256();
+ }
+
+ // tmp = v_zbin_mask ? (int64_t)abs_coeff + log_scaled_round : 0
+ const __m256i v_tmp_rnd =
+ _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask);
+ // tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+ // quant_shift_ptr[rc != 0]) >>
+ // (16 - log_scale + AOM_QM_BITS));
+ const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant);
+ const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd);
+ const __m256i v_tmp32_hi = _mm256_slli_epi16(
+ _mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift), log_scale);
+ const __m256i v_tmp32_lo = _mm256_srli_epi16(
+ _mm256_mullo_epi16(v_tmp32_b, *v_quant_shift), 16 - log_scale);
+ const __m256i v_tmp32 = _mm256_or_si256(v_tmp32_hi, v_tmp32_lo);
+ const __m256i v_dqcoeff_hi = _mm256_slli_epi16(
+ _mm256_mulhi_epi16(v_tmp32, *v_dequant), 16 - log_scale);
+ const __m256i v_dqcoeff_lo =
+ _mm256_srli_epi16(_mm256_mullo_epi16(v_tmp32, *v_dequant), log_scale);
+ const __m256i v_dqcoeff =
+ _mm256_sign_epi16(_mm256_or_si256(v_dqcoeff_hi, v_dqcoeff_lo), v_coeff);
+ const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff);
+ const __m256i v_nz_mask = _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256());
+ store_coefficients_avx2(v_qcoeff, qcoeff_ptr);
+ store_coefficients_avx2(v_dqcoeff, dqcoeff_ptr);
+ return v_nz_mask;
+}
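+
+// The mulhi/mullo pairs above splice a wide product shift out of 16-bit
+// multiplies: with P = a * b (full 32-bit product), hi = P >> 16 and
+// lo = P & 0xffff,
+//
+//   (uint16_t)(P >> s) == (uint16_t)((hi << (16 - s)) | (lo >> s))
+//
+// The quantised value uses s = 16 - log_scale and the dequantised value uses
+// s = log_scale, so neither needs 32-bit lanes.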
+
+static AOM_FORCE_INLINE void quantize_b_no_qmatrix_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *iscan, int log_scale) {
+ __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift;
+ __m256i v_eobmax = _mm256_setzero_si256();
+
+ load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr,
+ &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr,
+ &v_quant_shift, log_scale);
+
+ // Do DC and first 15 AC.
+ __m256i v_nz_mask = quantize_b_logscale_16(
+ coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant, &v_dequant, &v_round,
+ &v_zbin, &v_quant_shift, log_scale);
+
+ v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask);
+
+ v_round = _mm256_unpackhi_epi64(v_round, v_round);
+ v_quant = _mm256_unpackhi_epi64(v_quant, v_quant);
+ v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant);
+ v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift);
+ v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin);
+
+ for (intptr_t count = n_coeffs - 16; count > 0; count -= 16) {
+ coeff_ptr += 16;
+ qcoeff_ptr += 16;
+ dqcoeff_ptr += 16;
+ iscan += 16;
+ v_nz_mask = quantize_b_logscale_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
+ &v_quant, &v_dequant, &v_round, &v_zbin,
+ &v_quant_shift, log_scale);
+
+ v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask);
+ }
+
+ *eob_ptr = accumulate_eob256(v_eobmax);
+}
+
+void aom_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ (void)scan;
+ quantize_b_no_qmatrix_avx2(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, iscan, 1);
+}
+
+void aom_quantize_b_64x64_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ (void)scan;
+ quantize_b_no_qmatrix_avx2(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, iscan, 2);
+}
diff --git a/third_party/aom/aom_dsp/x86/quantize_sse2.c b/third_party/aom/aom_dsp/x86/quantize_sse2.c
new file mode 100644
index 0000000000..ebef1fbac2
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/quantize_sse2.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan_ptr,
+ const int16_t *iscan_ptr) {
+ const __m128i zero = _mm_setzero_si128();
+ int index = 16;
+
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i eob, eob0;
+
+ (void)scan_ptr;
+
+ // Setup global values.
+ load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+ dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_coefficients(coeff_ptr);
+ coeff1 = load_coefficients(coeff_ptr + 8);
+
+ // Poor man's abs().
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ // Reinsert signs
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr);
+ store_coefficients(qcoeff1, qcoeff_ptr + 8);
+
+ coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+ store_coefficients(coeff0, dqcoeff_ptr);
+ store_coefficients(coeff1, dqcoeff_ptr + 8);
+
+ eob =
+ scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero);
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = load_coefficients(coeff_ptr + index);
+ coeff1 = load_coefficients(coeff_ptr + index + 8);
+
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr + index);
+ store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
+
+ coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+ coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+ store_coefficients(coeff0, dqcoeff_ptr + index);
+ store_coefficients(coeff1, dqcoeff_ptr + index + 8);
+
+ eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
+ index, zero);
+ eob = _mm_max_epi16(eob, eob0);
+
+ index += 16;
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
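+
+// The "poor man's abs()" above relies on the two's-complement identity, with
+// s = x >> 15 (all ones when x is negative) and ignoring the INT16_MIN
+// corner case:
+//
+//   abs(x) == (x ^ s) - s
+//
+// invert_sign_sse2() presumably applies the same xor/subtract, which is why
+// re-running it with the saved coeff0_sign/coeff1_sign restores the original
+// signs on the quantised output.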
diff --git a/third_party/aom/aom_dsp/x86/quantize_ssse3.c b/third_party/aom/aom_dsp/x86/quantize_ssse3.c
new file mode 100644
index 0000000000..25980a055a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/quantize_ssse3.c
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <tmmintrin.h>
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+static INLINE void calculate_qcoeff_64x64(__m128i *coeff, const __m128i round,
+ const __m128i quant,
+ const __m128i *shift) {
+ __m128i tmp, qcoeff, tmp1;
+ qcoeff = _mm_adds_epi16(*coeff, round);
+ tmp = _mm_mulhi_epi16(qcoeff, quant);
+ qcoeff = _mm_add_epi16(tmp, qcoeff);
+ tmp = _mm_mullo_epi16(qcoeff, *shift);
+ tmp = _mm_srli_epi16(tmp, 14);
+ tmp1 = _mm_mulhi_epi16(qcoeff, *shift);
+ tmp1 = _mm_slli_epi16(tmp1, 2);
+ *coeff = _mm_or_si128(tmp, tmp1);
+}
+
+static INLINE void calculate_dqcoeff_and_store_64x64(const __m128i qcoeff,
+ const __m128i dequant,
+ const __m128i zero,
+ tran_low_t *dqcoeff) {
+ // Un-sign to bias rounding like C.
+ const __m128i coeff = _mm_abs_epi16(qcoeff);
+
+ const __m128i sign_0 = _mm_unpacklo_epi16(zero, qcoeff);
+ const __m128i sign_1 = _mm_unpackhi_epi16(zero, qcoeff);
+
+ const __m128i low = _mm_mullo_epi16(coeff, dequant);
+ const __m128i high = _mm_mulhi_epi16(coeff, dequant);
+ __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high);
+ __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high);
+
+ // "Divide" by 4.
+ dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, 2);
+ dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, 2);
+
+ dqcoeff32_0 = _mm_sign_epi32(dqcoeff32_0, sign_0);
+ dqcoeff32_1 = _mm_sign_epi32(dqcoeff32_1, sign_1);
+
+ _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0);
+ _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1);
+}
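+
+// Both helpers above implement the log_scale == 2 (64x64) arithmetic in
+// 16-bit lanes. With tmp = ((abs_coeff + round) * quant >> 16) +
+// (abs_coeff + round), per coefficient this amounts to
+//
+//   qcoeff  = (uint16_t)((tmp * shift) >> 14);                // >> (16 - log_scale)
+//   dqcoeff = sign(qcoeff) * ((abs(qcoeff) * dequant) >> 2);  // >> log_scale
+//
+// calculate_qcoeff_64x64() rebuilds the >> 14 from mullo >> 14 and mulhi << 2,
+// and calculate_dqcoeff_and_store_64x64() shifts the magnitude so the >> 2
+// truncates toward zero, as its "Un-sign" comment notes. The caller below
+// pre-scales zbin and round with (x + 2) >> 2 to match.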
+
+void aom_quantize_b_64x64_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i two = _mm_set1_epi16(2);
+ int index;
+
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1, all_zero;
+ __m128i eob = zero, eob0;
+
+ (void)scan;
+ (void)n_coeffs;
+
+ // Setup global values.
+ zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+
+ // Shift with rounding.
+ zbin = _mm_add_epi16(zbin, two);
+ round = _mm_add_epi16(round, two);
+ zbin = _mm_srli_epi16(zbin, 2);
+ round = _mm_srli_epi16(round, 2);
+ zbin = _mm_sub_epi16(zbin, one);
+ // Do DC and first 15 AC.
+ coeff0 = load_coefficients(coeff_ptr);
+ coeff1 = load_coefficients(coeff_ptr + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ calculate_qcoeff_64x64(&qcoeff0, round, quant, &shift);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ calculate_qcoeff_64x64(&qcoeff1, round, quant, &shift);
+
+ // Reinsert signs.
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ // Mask out zbin threshold coeffs.
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr);
+ store_coefficients(qcoeff1, qcoeff_ptr + 8);
+
+ calculate_dqcoeff_and_store_64x64(qcoeff0, dequant, zero, dqcoeff_ptr);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ calculate_dqcoeff_and_store_64x64(qcoeff1, dequant, zero, dqcoeff_ptr + 8);
+
+ eob =
+ scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ }
+
+ // AC only loop.
+ for (index = 16; index < 1024; index += 16) {
+ coeff0 = load_coefficients(coeff_ptr + index);
+ coeff1 = load_coefficients(coeff_ptr + index + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
+ continue;
+ }
+ calculate_qcoeff_64x64(&qcoeff0, round, quant, &shift);
+ calculate_qcoeff_64x64(&qcoeff1, round, quant, &shift);
+
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr + index);
+ store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
+
+ calculate_dqcoeff_and_store_64x64(qcoeff0, dequant, zero,
+ dqcoeff_ptr + index);
+ calculate_dqcoeff_and_store_64x64(qcoeff1, dequant, zero,
+ dqcoeff_ptr + 8 + index);
+
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
+ zero);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
diff --git a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
new file mode 100644
index 0000000000..fa616a6f1a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
@@ -0,0 +1,302 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_1: times 8 dw 1
+
+SECTION .text
+
+%macro QUANTIZE_FN 2
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \
+ shift, qcoeff, dqcoeff, dequant, \
+ eob, scan, iscan
+
+ ; actual quantize loop - setup pointers, rounders, etc.
+ movifnidn coeffq, coeffmp
+ movifnidn ncoeffq, ncoeffmp
+ movifnidn zbinq, zbinmp
+ movifnidn roundq, roundmp
+ movifnidn quantq, quantmp
+ movifnidn dequantq, dequantmp
+ mova m0, [zbinq] ; m0 = zbin
+ mova m1, [roundq] ; m1 = round
+ mova m2, [quantq] ; m2 = quant
+%ifidn %1, b_32x32
+ pcmpeqw m5, m5
+ psrlw m5, 15
+ paddw m0, m5
+ paddw m1, m5
+ psrlw m0, 1 ; m0 = (m0 + 1) / 2
+ psrlw m1, 1 ; m1 = (m1 + 1) / 2
+%endif
+ mova m3, [dequantq] ; m3 = dequant
+ mov r2, shiftmp
+ psubw m0, [GLOBAL(pw_1)]
+ mova m4, [r2] ; m4 = shift
+ mov r3, qcoeffmp
+ mov r4, dqcoeffmp
+ mov r5, iscanmp
+ pxor m5, m5 ; m5 = dedicated zero
+ DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob
+ lea coeffq, [ coeffq+ncoeffq*4]
+ lea qcoeffq, [ qcoeffq+ncoeffq*4]
+ lea dqcoeffq, [dqcoeffq+ncoeffq*4]
+ lea iscanq, [ iscanq+ncoeffq*2]
+ neg ncoeffq
+
+ ; get DC and first 15 AC coeffs
+ ; coeff stored as 32bit numbers & require 16bit numbers
+ mova m9, [ coeffq+ncoeffq*4+ 0]
+ packssdw m9, [ coeffq+ncoeffq*4+16]
+ mova m10, [ coeffq+ncoeffq*4+32]
+ packssdw m10, [ coeffq+ncoeffq*4+48]
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+ pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
+ punpckhqdq m0, m0
+ pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
+ paddsw m6, m1 ; m6 += round
+ punpckhqdq m1, m1
+ paddsw m11, m1 ; m11 += round
+ pmulhw m8, m6, m2 ; m8 = m6*q>>16
+ punpckhqdq m2, m2
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ paddw m8, m6 ; m8 += m6
+ paddw m13, m11 ; m13 += m11
+ %ifidn %1, b_32x32
+ pmullw m5, m8, m4 ; store the lower 16 bits of m8*qsh
+ %endif
+ pmulhw m8, m4 ; m8 = m8*qsh>>16
+ %ifidn %1, b_32x32
+ psllw m8, 1
+ psrlw m5, 15
+ por m8, m5
+ %endif
+ punpckhqdq m4, m4
+ %ifidn %1, b_32x32
+ pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh
+ %endif
+ pmulhw m13, m4 ; m13 = m13*qsh>>16
+ %ifidn %1, b_32x32
+ psllw m13, 1
+ psrlw m5, 15
+ por m13, m5
+ pxor m5, m5 ; reset m5 to zero register
+ %endif
+ psignw m8, m9 ; m8 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ pand m8, m7
+ pand m13, m12
+
+ ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+ mova m11, m8
+ mova m6, m8
+ pcmpgtw m5, m8
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [qcoeffq+ncoeffq*4+ 0], m11
+ mova [qcoeffq+ncoeffq*4+16], m6
+ pxor m5, m5
+ mova m11, m13
+ mova m6, m13
+ pcmpgtw m5, m13
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [qcoeffq+ncoeffq*4+32], m11
+ mova [qcoeffq+ncoeffq*4+48], m6
+ pxor m5, m5 ; reset m5 to zero register
+
+%ifidn %1, b_32x32
+ pabsw m8, m8
+ pabsw m13, m13
+%endif
+ pmullw m8, m3 ; dqc[i] = qc[i] * q
+ punpckhqdq m3, m3
+ pmullw m13, m3 ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+ psrlw m8, 1
+ psrlw m13, 1
+ psignw m8, m9
+ psignw m13, m10
+%endif
+ ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+ mova m11, m8
+ mova m6, m8
+ pcmpgtw m5, m8
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [dqcoeffq+ncoeffq*4+ 0], m11
+ mova [dqcoeffq+ncoeffq*4+16], m6
+ pxor m5, m5
+ mova m11, m13
+ mova m6, m13
+ pcmpgtw m5, m13
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [dqcoeffq+ncoeffq*4+32], m11
+ mova [dqcoeffq+ncoeffq*4+48], m6
+ pxor m5, m5 ; reset m5 to zero register
+ pcmpeqw m8, m5 ; m8 = c[i] == 0
+ pcmpeqw m13, m5 ; m13 = c[i] == 0
+ mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
+ mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i]
+ psubw m6, m7 ; m6 = scan[i] + 1
+ psubw m11, m12 ; m11 = scan[i] + 1
+ pandn m8, m6 ; m8 = max(eob)
+ pandn m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m13
+ add ncoeffq, mmsize
+ jz .accumulate_eob
+
+.ac_only_loop:
+ ; pack coeff from 32bit to 16bit array
+ mova m9, [ coeffq+ncoeffq*4+ 0]
+ packssdw m9, [ coeffq+ncoeffq*4+16]
+ mova m10, [ coeffq+ncoeffq*4+32]
+ packssdw m10, [ coeffq+ncoeffq*4+48]
+
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+ pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
+ pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
+%ifidn %1, b_32x32
+ pmovmskb r6d, m7
+ pmovmskb r2d, m12
+ or r6, r2
+ jz .skip_iter
+%endif
+ paddsw m6, m1 ; m6 += round
+ paddsw m11, m1 ; m11 += round
+ pmulhw m14, m6, m2 ; m14 = m6*q>>16
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ paddw m14, m6 ; m14 += m6
+ paddw m13, m11 ; m13 += m11
+ %ifidn %1, b_32x32
+ pmullw m5, m14, m4 ; store the lower 16 bits of m14*qsh
+ %endif
+ pmulhw m14, m4 ; m14 = m14*qsh>>16
+ %ifidn %1, b_32x32
+ psllw m14, 1
+ psrlw m5, 15
+ por m14, m5
+ pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh
+ %endif
+ pmulhw m13, m4 ; m13 = m13*qsh>>16
+ %ifidn %1, b_32x32
+ psllw m13, 1
+ psrlw m5, 15
+ por m13, m5
+ pxor m5, m5 ; reset m5 to zero register
+ %endif
+ psignw m14, m9 ; m14 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ pand m14, m7
+ pand m13, m12
+
+ ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+ pxor m11, m11
+ mova m11, m14
+ mova m6, m14
+ pcmpgtw m5, m14
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [qcoeffq+ncoeffq*4+ 0], m11
+ mova [qcoeffq+ncoeffq*4+16], m6
+ pxor m5, m5
+ mova m11, m13
+ mova m6, m13
+ pcmpgtw m5, m13
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [qcoeffq+ncoeffq*4+32], m11
+ mova [qcoeffq+ncoeffq*4+48], m6
+ pxor m5, m5 ; reset m5 to zero register
+
+%ifidn %1, b_32x32
+ pabsw m14, m14
+ pabsw m13, m13
+%endif
+ pmullw m14, m3 ; dqc[i] = qc[i] * q
+ pmullw m13, m3 ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+ psrlw m14, 1
+ psrlw m13, 1
+ psignw m14, m9
+ psignw m13, m10
+%endif
+
+ ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+ mova m11, m14
+ mova m6, m14
+ pcmpgtw m5, m14
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [dqcoeffq+ncoeffq*4+ 0], m11
+ mova [dqcoeffq+ncoeffq*4+16], m6
+ pxor m5, m5
+ mova m11, m13
+ mova m6, m13
+ pcmpgtw m5, m13
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [dqcoeffq+ncoeffq*4+32], m11
+ mova [dqcoeffq+ncoeffq*4+48], m6
+ pxor m5, m5
+
+ pcmpeqw m14, m5 ; m14 = c[i] == 0
+ pcmpeqw m13, m5 ; m13 = c[i] == 0
+ mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
+ mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i]
+ psubw m6, m7 ; m6 = scan[i] + 1
+ psubw m11, m12 ; m11 = scan[i] + 1
+ pandn m14, m6 ; m14 = max(eob)
+ pandn m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m14
+ pmaxsw m8, m13
+ add ncoeffq, mmsize
+ jl .ac_only_loop
+
+%ifidn %1, b_32x32
+ jmp .accumulate_eob
+.skip_iter:
+ mova [qcoeffq+ncoeffq*4+ 0], m5
+ mova [qcoeffq+ncoeffq*4+16], m5
+ mova [qcoeffq+ncoeffq*4+32], m5
+ mova [qcoeffq+ncoeffq*4+48], m5
+ mova [dqcoeffq+ncoeffq*4+ 0], m5
+ mova [dqcoeffq+ncoeffq*4+16], m5
+ mova [dqcoeffq+ncoeffq*4+32], m5
+ mova [dqcoeffq+ncoeffq*4+48], m5
+ add ncoeffq, mmsize
+ jl .ac_only_loop
+%endif
+
+.accumulate_eob:
+ ; horizontally accumulate/max eobs and write into [eob] memory pointer
+ mov r2, eobmp
+ pshufd m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0x1
+ pmaxsw m8, m7
+ pextrw r6, m8, 0
+ mov [r2], r6
+ RET
+%endmacro
+
+INIT_XMM ssse3
+QUANTIZE_FN b, 9
+QUANTIZE_FN b_32x32, 9
diff --git a/third_party/aom/aom_dsp/x86/quantize_x86.h b/third_party/aom/aom_dsp/x86/quantize_x86.h
new file mode 100644
index 0000000000..5b040a278a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/quantize_x86.h
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "aom/aom_integer.h"
+
+static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin,
+ const int16_t *round_ptr, __m128i *round,
+ const int16_t *quant_ptr, __m128i *quant,
+ const int16_t *dequant_ptr, __m128i *dequant,
+ const int16_t *shift_ptr, __m128i *shift) {
+ *zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ *round = _mm_load_si128((const __m128i *)round_ptr);
+ *quant = _mm_load_si128((const __m128i *)quant_ptr);
+ *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1));
+ *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ *shift = _mm_load_si128((const __m128i *)shift_ptr);
+}
+
+// With SSSE3 and later, abs() and sign() are preferred.
+static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) {
+ a = _mm_xor_si128(a, sign);
+ return _mm_sub_epi16(a, sign);
+}
+
+static INLINE __m128i invert_sign_32_sse2(__m128i a, __m128i sign) {
+ a = _mm_xor_si128(a, sign);
+ return _mm_sub_epi32(a, sign);
+}
+
+static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round,
+ const __m128i quant, const __m128i shift) {
+ __m128i tmp, qcoeff;
+ qcoeff = _mm_adds_epi16(*coeff, round);
+ tmp = _mm_mulhi_epi16(qcoeff, quant);
+ qcoeff = _mm_add_epi16(tmp, qcoeff);
+ *coeff = _mm_mulhi_epi16(qcoeff, shift);
+}
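+// Roughly the scalar equivalent of the above (a sketch; the rounding add
+// saturates to 16 bits and the other 16-bit operations wrap):
+//   tmp    = sat16(coeff + round);
+//   *coeff = ((((tmp * quant) >> 16) + tmp) * shift) >> 16;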
+
+static INLINE void calculate_qcoeff_log_scale(__m128i *coeff,
+ const __m128i round,
+ const __m128i quant,
+ const __m128i *shift,
+ const int *log_scale) {
+ __m128i tmp, tmp1, qcoeff;
+ qcoeff = _mm_adds_epi16(*coeff, round);
+ tmp = _mm_mulhi_epi16(qcoeff, quant);
+ qcoeff = _mm_add_epi16(tmp, qcoeff);
+ tmp = _mm_mullo_epi16(qcoeff, *shift);
+ tmp = _mm_srli_epi16(tmp, (16 - *log_scale));
+ tmp1 = _mm_mulhi_epi16(qcoeff, *shift);
+ tmp1 = _mm_slli_epi16(tmp1, *log_scale);
+ *coeff = _mm_or_si128(tmp, tmp1);
+}
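+// Same computation as calculate_qcoeff() above, except that the final
+// multiply by 'shift' is shifted right by (16 - log_scale) rather than 16
+// (the 32x32 and 64x64 quantizers use log_scale 1 and 2 respectively).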
+
+static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) {
+ return _mm_mullo_epi16(qcoeff, dequant);
+}
+
+static INLINE void calculate_dqcoeff_and_store_log_scale(__m128i qcoeff,
+ __m128i dequant,
+ const __m128i zero,
+ tran_low_t *dqcoeff,
+ const int *log_scale) {
+ // calculate abs
+ __m128i coeff_sign = _mm_srai_epi16(qcoeff, 15);
+ __m128i coeff = invert_sign_sse2(qcoeff, coeff_sign);
+
+ const __m128i sign_0 = _mm_unpacklo_epi16(coeff_sign, zero);
+ const __m128i sign_1 = _mm_unpackhi_epi16(coeff_sign, zero);
+
+ const __m128i low = _mm_mullo_epi16(coeff, dequant);
+ const __m128i high = _mm_mulhi_epi16(coeff, dequant);
+ __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high);
+ __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high);
+
+ dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, *log_scale);
+ dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, *log_scale);
+
+ dqcoeff32_0 = invert_sign_32_sse2(dqcoeff32_0, sign_0);
+ dqcoeff32_1 = invert_sign_32_sse2(dqcoeff32_1, sign_1);
+
+ _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0);
+ _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1);
+}
+
+// Scan 16 values for eob reference in scan_ptr. Use masks (-1) from comparing
+// to zbin to add 1 to the index in 'scan'.
+static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1,
+ const __m128i zbin_mask0,
+ const __m128i zbin_mask1,
+ const int16_t *scan_ptr, const int index,
+ const __m128i zero) {
+ const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero);
+ const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero);
+ __m128i scan0 = _mm_load_si128((const __m128i *)(scan_ptr + index));
+ __m128i scan1 = _mm_load_si128((const __m128i *)(scan_ptr + index + 8));
+ __m128i eob0, eob1;
+ // Add one to convert from indices to counts
+ scan0 = _mm_sub_epi16(scan0, zbin_mask0);
+ scan1 = _mm_sub_epi16(scan1, zbin_mask1);
+ eob0 = _mm_andnot_si128(zero_coeff0, scan0);
+ eob1 = _mm_andnot_si128(zero_coeff1, scan1);
+ return _mm_max_epi16(eob0, eob1);
+}
+
+static INLINE int16_t accumulate_eob(__m128i eob) {
+ __m128i eob_shuffled;
+ eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ return _mm_extract_epi16(eob, 1);
+}
+
+static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
+ assert(sizeof(tran_low_t) == 4);
+ const __m128i coeff1 = _mm_load_si128((__m128i *)(coeff_ptr));
+ const __m128i coeff2 = _mm_load_si128((__m128i *)(coeff_ptr + 4));
+ return _mm_packs_epi32(coeff1, coeff2);
+}
+
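+// Sign-extend eight 16-bit coefficients to 32 bits before storing: mulhi
+// and mullo by 1 yield the sign word (0 or -1) and the value word, which
+// are then interleaved into 32-bit lanes.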
+static INLINE void store_coefficients(__m128i coeff_vals,
+ tran_low_t *coeff_ptr) {
+ assert(sizeof(tran_low_t) == 4);
+
+ __m128i one = _mm_set1_epi16(1);
+ __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
+ __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
+ __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
+ __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
+ _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);
+ _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
+}
+
+static INLINE void update_mask1(__m128i *cmp_mask0, __m128i *cmp_mask1,
+ const int16_t *iscan_ptr, int *is_found,
+ __m128i *mask) {
+ __m128i all_zero;
+ __m128i temp_mask = _mm_setzero_si128();
+ all_zero = _mm_or_si128(*cmp_mask0, *cmp_mask1);
+ if (_mm_movemask_epi8(all_zero)) {
+ __m128i iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr));
+ __m128i mask0 = _mm_and_si128(*cmp_mask0, iscan0);
+ __m128i iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + 8));
+ __m128i mask1 = _mm_and_si128(*cmp_mask1, iscan1);
+ temp_mask = _mm_max_epi16(mask0, mask1);
+ *is_found = 1;
+ }
+ *mask = _mm_max_epi16(temp_mask, *mask);
+}
+
+static INLINE void update_mask0(__m128i *qcoeff0, __m128i *qcoeff1,
+ __m128i *threshold, const int16_t *iscan_ptr,
+ int *is_found, __m128i *mask) {
+ __m128i zero = _mm_setzero_si128();
+ __m128i coeff[4], cmp_mask0, cmp_mask1, cmp_mask2, cmp_mask3;
+
+ coeff[0] = _mm_unpacklo_epi16(*qcoeff0, zero);
+ coeff[1] = _mm_unpackhi_epi16(*qcoeff0, zero);
+ coeff[2] = _mm_unpacklo_epi16(*qcoeff1, zero);
+ coeff[3] = _mm_unpackhi_epi16(*qcoeff1, zero);
+
+ coeff[0] = _mm_slli_epi32(coeff[0], AOM_QM_BITS);
+ cmp_mask0 = _mm_cmpgt_epi32(coeff[0], threshold[0]);
+ coeff[1] = _mm_slli_epi32(coeff[1], AOM_QM_BITS);
+ cmp_mask1 = _mm_cmpgt_epi32(coeff[1], threshold[1]);
+ coeff[2] = _mm_slli_epi32(coeff[2], AOM_QM_BITS);
+ cmp_mask2 = _mm_cmpgt_epi32(coeff[2], threshold[1]);
+ coeff[3] = _mm_slli_epi32(coeff[3], AOM_QM_BITS);
+ cmp_mask3 = _mm_cmpgt_epi32(coeff[3], threshold[1]);
+
+ cmp_mask0 = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+ cmp_mask1 = _mm_packs_epi32(cmp_mask2, cmp_mask3);
+
+ update_mask1(&cmp_mask0, &cmp_mask1, iscan_ptr, is_found, mask);
+}
+
+static INLINE int calculate_non_zero_count(__m128i mask) {
+ __m128i mask0, mask1;
+ int non_zero_count = 0;
+ mask0 = _mm_unpackhi_epi64(mask, mask);
+ mask1 = _mm_max_epi16(mask0, mask);
+ mask0 = _mm_shuffle_epi32(mask1, 1);
+ mask0 = _mm_max_epi16(mask0, mask1);
+ mask1 = _mm_srli_epi32(mask0, 16);
+ mask0 = _mm_max_epi16(mask0, mask1);
+ non_zero_count = _mm_extract_epi16(mask0, 0) + 1;
+
+ return non_zero_count;
+}
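+// Taken together, a quantizer built from these helpers (a sketch mirroring
+// the SSSE3 quantize assembly above) calls load_b_values() once, then for
+// each group of 16 coefficients: load_coefficients(), compare the absolute
+// values against zbin, calculate_qcoeff(), restore the signs,
+// store_coefficients() into qcoeff, calculate_dqcoeff() into dqcoeff, and
+// scan_for_eob(); accumulate_eob() then produces the final *eob_ptr.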
diff --git a/third_party/aom/aom_dsp/x86/sad4d_avx2.c b/third_party/aom/aom_dsp/x86/sad4d_avx2.c
new file mode 100644
index 0000000000..0fea6ddfd3
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sad4d_avx2.c
@@ -0,0 +1,326 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h> // AVX2
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
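+// Gather the low 32 bits of each 64-bit psadbw lane from the four
+// accumulators, horizontally add them, and fold the two 128-bit halves so
+// that res[i] ends up holding the total SAD against ref[i].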
+static AOM_FORCE_INLINE void aggregate_and_store_sum(uint32_t res[4],
+ const __m256i *sum_ref0,
+ const __m256i *sum_ref1,
+ const __m256i *sum_ref2,
+ const __m256i *sum_ref3) {
+  // In each 64-bit lane of sum_ref<i> the SAD is held in the low 4 bytes;
+  // the high 4 bytes are zero.
+  // Merge sum_ref0 with sum_ref1, and sum_ref2 with sum_ref3.
+ // 0, 0, 1, 1
+ __m256i sum_ref01 = _mm256_castps_si256(_mm256_shuffle_ps(
+ _mm256_castsi256_ps(*sum_ref0), _mm256_castsi256_ps(*sum_ref1),
+ _MM_SHUFFLE(2, 0, 2, 0)));
+ // 2, 2, 3, 3
+ __m256i sum_ref23 = _mm256_castps_si256(_mm256_shuffle_ps(
+ _mm256_castsi256_ps(*sum_ref2), _mm256_castsi256_ps(*sum_ref3),
+ _MM_SHUFFLE(2, 0, 2, 0)));
+
+ // sum adjacent 32 bit integers
+ __m256i sum_ref0123 = _mm256_hadd_epi32(sum_ref01, sum_ref23);
+
+  // add the low 128 bits to the high 128 bits
+ __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(sum_ref0123),
+ _mm256_extractf128_si256(sum_ref0123, 1));
+
+ _mm_storeu_si128((__m128i *)(res), sum);
+}
+
+static AOM_FORCE_INLINE void aom_sadMxNx4d_avx2(
+ int M, int N, const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride, uint32_t res[4]) {
+ __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
+ __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
+ int i, j;
+ const uint8_t *ref0, *ref1, *ref2, *ref3;
+
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ ref3 = ref[3];
+ sum_ref0 = _mm256_setzero_si256();
+ sum_ref2 = _mm256_setzero_si256();
+ sum_ref1 = _mm256_setzero_si256();
+ sum_ref3 = _mm256_setzero_si256();
+
+ for (i = 0; i < N; i++) {
+ for (j = 0; j < M; j += 32) {
+ // load src and all refs
+ src_reg = _mm256_loadu_si256((const __m256i *)(src + j));
+ ref0_reg = _mm256_loadu_si256((const __m256i *)(ref0 + j));
+ ref1_reg = _mm256_loadu_si256((const __m256i *)(ref1 + j));
+ ref2_reg = _mm256_loadu_si256((const __m256i *)(ref2 + j));
+ ref3_reg = _mm256_loadu_si256((const __m256i *)(ref3 + j));
+
+      // sum of the absolute differences between every ref-i and src
+ ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
+ ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
+ ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
+ ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
+ // sum every ref-i
+ sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
+ sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
+ sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
+ sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
+ }
+ src += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
+ ref3 += ref_stride;
+ }
+
+ aggregate_and_store_sum(res, &sum_ref0, &sum_ref1, &sum_ref2, &sum_ref3);
+}
+
+static AOM_FORCE_INLINE void aom_sadMxNx3d_avx2(
+ int M, int N, const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride, uint32_t res[4]) {
+ __m256i src_reg, ref0_reg, ref1_reg, ref2_reg;
+ __m256i sum_ref0, sum_ref1, sum_ref2;
+ int i, j;
+ const uint8_t *ref0, *ref1, *ref2;
+ const __m256i zero = _mm256_setzero_si256();
+
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ sum_ref0 = _mm256_setzero_si256();
+ sum_ref2 = _mm256_setzero_si256();
+ sum_ref1 = _mm256_setzero_si256();
+
+ for (i = 0; i < N; i++) {
+ for (j = 0; j < M; j += 32) {
+ // load src and all refs
+ src_reg = _mm256_loadu_si256((const __m256i *)(src + j));
+ ref0_reg = _mm256_loadu_si256((const __m256i *)(ref0 + j));
+ ref1_reg = _mm256_loadu_si256((const __m256i *)(ref1 + j));
+ ref2_reg = _mm256_loadu_si256((const __m256i *)(ref2 + j));
+
+      // sum of the absolute differences between every ref-i and src
+ ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
+ ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
+ ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
+ // sum every ref-i
+ sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
+ sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
+ sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
+ }
+ src += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
+ }
+ aggregate_and_store_sum(res, &sum_ref0, &sum_ref1, &sum_ref2, &zero);
+}
+
+#define SADMXN_AVX2(m, n) \
+ void aom_sad##m##x##n##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], int ref_stride, \
+ uint32_t res[4]) { \
+ aom_sadMxNx4d_avx2(m, n, src, src_stride, ref, ref_stride, res); \
+ } \
+ void aom_sad##m##x##n##x3d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], int ref_stride, \
+ uint32_t res[4]) { \
+ aom_sadMxNx3d_avx2(m, n, src, src_stride, ref, ref_stride, res); \
+ }
+
+SADMXN_AVX2(32, 8)
+SADMXN_AVX2(32, 16)
+SADMXN_AVX2(32, 32)
+SADMXN_AVX2(32, 64)
+
+SADMXN_AVX2(64, 16)
+SADMXN_AVX2(64, 32)
+SADMXN_AVX2(64, 64)
+SADMXN_AVX2(64, 128)
+
+SADMXN_AVX2(128, 64)
+SADMXN_AVX2(128, 128)
+
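+// The skip variants approximate the SAD by sampling every other row: the
+// strides are doubled, the row count is halved, and the result is scaled
+// back up by a factor of 2.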
+#define SAD_SKIP_MXN_AVX2(m, n) \
+ void aom_sad_skip_##m##x##n##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], \
+ int ref_stride, uint32_t res[4]) { \
+ aom_sadMxNx4d_avx2(m, ((n) >> 1), src, 2 * src_stride, ref, \
+ 2 * ref_stride, res); \
+ res[0] <<= 1; \
+ res[1] <<= 1; \
+ res[2] <<= 1; \
+ res[3] <<= 1; \
+ }
+
+SAD_SKIP_MXN_AVX2(32, 8)
+SAD_SKIP_MXN_AVX2(32, 16)
+SAD_SKIP_MXN_AVX2(32, 32)
+SAD_SKIP_MXN_AVX2(32, 64)
+
+SAD_SKIP_MXN_AVX2(64, 16)
+SAD_SKIP_MXN_AVX2(64, 32)
+SAD_SKIP_MXN_AVX2(64, 64)
+SAD_SKIP_MXN_AVX2(64, 128)
+
+SAD_SKIP_MXN_AVX2(128, 64)
+SAD_SKIP_MXN_AVX2(128, 128)
+
+static AOM_FORCE_INLINE void aom_sad16xNx3d_avx2(int N, const uint8_t *src,
+ int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride,
+ uint32_t res[4]) {
+ __m256i src_reg, ref0_reg, ref1_reg, ref2_reg;
+ __m256i sum_ref0, sum_ref1, sum_ref2;
+ const uint8_t *ref0, *ref1, *ref2;
+ const __m256i zero = _mm256_setzero_si256();
+ assert(N % 2 == 0);
+
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ sum_ref0 = _mm256_setzero_si256();
+ sum_ref2 = _mm256_setzero_si256();
+ sum_ref1 = _mm256_setzero_si256();
+
+ for (int i = 0; i < N; i += 2) {
+ // load src and all refs
+ src_reg = yy_loadu2_128(src + src_stride, src);
+ ref0_reg = yy_loadu2_128(ref0 + ref_stride, ref0);
+ ref1_reg = yy_loadu2_128(ref1 + ref_stride, ref1);
+ ref2_reg = yy_loadu2_128(ref2 + ref_stride, ref2);
+
+    // sum of the absolute differences between every ref-i and src
+ ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
+ ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
+ ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
+
+ // sum every ref-i
+ sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
+ sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
+ sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
+
+ src += 2 * src_stride;
+ ref0 += 2 * ref_stride;
+ ref1 += 2 * ref_stride;
+ ref2 += 2 * ref_stride;
+ }
+
+ aggregate_and_store_sum(res, &sum_ref0, &sum_ref1, &sum_ref2, &zero);
+}
+
+static AOM_FORCE_INLINE void aom_sad16xNx4d_avx2(int N, const uint8_t *src,
+ int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride,
+ uint32_t res[4]) {
+ __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
+ __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
+ const uint8_t *ref0, *ref1, *ref2, *ref3;
+ assert(N % 2 == 0);
+
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ ref3 = ref[3];
+
+ sum_ref0 = _mm256_setzero_si256();
+ sum_ref2 = _mm256_setzero_si256();
+ sum_ref1 = _mm256_setzero_si256();
+ sum_ref3 = _mm256_setzero_si256();
+
+ for (int i = 0; i < N; i += 2) {
+ // load src and all refs
+ src_reg = yy_loadu2_128(src + src_stride, src);
+ ref0_reg = yy_loadu2_128(ref0 + ref_stride, ref0);
+ ref1_reg = yy_loadu2_128(ref1 + ref_stride, ref1);
+ ref2_reg = yy_loadu2_128(ref2 + ref_stride, ref2);
+ ref3_reg = yy_loadu2_128(ref3 + ref_stride, ref3);
+
+    // sum of the absolute differences between every ref-i and src
+ ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
+ ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
+ ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
+ ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
+
+ // sum every ref-i
+ sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
+ sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
+ sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
+ sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
+
+ src += 2 * src_stride;
+ ref0 += 2 * ref_stride;
+ ref1 += 2 * ref_stride;
+ ref2 += 2 * ref_stride;
+ ref3 += 2 * ref_stride;
+ }
+
+ aggregate_and_store_sum(res, &sum_ref0, &sum_ref1, &sum_ref2, &sum_ref3);
+}
+
+#define SAD16XNX3_AVX2(n) \
+ void aom_sad16x##n##x3d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], int ref_stride, \
+ uint32_t res[4]) { \
+ aom_sad16xNx3d_avx2(n, src, src_stride, ref, ref_stride, res); \
+ }
+#define SAD16XNX4_AVX2(n) \
+ void aom_sad16x##n##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], int ref_stride, \
+ uint32_t res[4]) { \
+ aom_sad16xNx4d_avx2(n, src, src_stride, ref, ref_stride, res); \
+ }
+
+SAD16XNX4_AVX2(32)
+SAD16XNX4_AVX2(16)
+SAD16XNX4_AVX2(8)
+
+SAD16XNX3_AVX2(32)
+SAD16XNX3_AVX2(16)
+SAD16XNX3_AVX2(8)
+
+#if !CONFIG_REALTIME_ONLY
+SAD16XNX3_AVX2(64)
+SAD16XNX3_AVX2(4)
+
+SAD16XNX4_AVX2(64)
+SAD16XNX4_AVX2(4)
+
+#endif // !CONFIG_REALTIME_ONLY
+
+#define SAD_SKIP_16XN_AVX2(n) \
+ void aom_sad_skip_16x##n##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], \
+ int ref_stride, uint32_t res[4]) { \
+ aom_sad16xNx4d_avx2(((n) >> 1), src, 2 * src_stride, ref, 2 * ref_stride, \
+ res); \
+ res[0] <<= 1; \
+ res[1] <<= 1; \
+ res[2] <<= 1; \
+ res[3] <<= 1; \
+ }
+
+SAD_SKIP_16XN_AVX2(32)
+SAD_SKIP_16XN_AVX2(16)
+SAD_SKIP_16XN_AVX2(8)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_SKIP_16XN_AVX2(64)
+SAD_SKIP_16XN_AVX2(4)
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm
new file mode 100644
index 0000000000..6edad99516
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm
@@ -0,0 +1,437 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; 'spill_src_stride' strongly affects how the code works.
+;
+; When 'spill_src_stride' is false, 'src_strideq' resides in a register,
+; so the addressing form [srcq + src_strideq + offset] is allowed and we
+; can use it to access src memory without updating 'srcq' on every line.
+; We only update 'srcq' every two lines, using a compact LEA instruction
+; like [srcq+src_strideq*2].
+;
+; When 'spill_src_stride' is true, 'src_strideq' resides in memory. We
+; cannot use the addressing form above, so we have to update 'srcq' at
+; each line break. As we process two parts (first, second) together in
+; each macro function, the second part may also sit on the next line,
+; which means we may also need to add one 'src_strideq' to 'srcq' before
+; processing the second part.
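+;
+; For example, with spill_src_stride=0 the second line is simply addressed
+; as [srcq + src_strideq], whereas with spill_src_stride=1 the macros below
+; first do 'add srcq, src_strideq' and then address it with offset 0
+; (see HANDLE_SECOND_OFFSET).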
+
+%macro HANDLE_SECOND_OFFSET 0
+ %if spill_src_stride
+ %define second_offset 0
+ add srcq, src_strideq
+ %else
+ %define second_offset (src_strideq)
+ %endif
+%endmacro
+
+; This is specifically designed to handle the case where src_strideq is a
+; memory location; in that case we cannot do complex address calculation
+; with LEA, and fall back to a simple ADD instruction at the end of each
+; line.
+%macro ADVANCE_END_OF_TWO_LINES 0
+ %if spill_src_stride
+ add srcq, src_strideq
+ %else
+ lea srcq, [srcq+src_strideq*2]
+ %endif
+
+; note: ref_stride is never spilled when processing two lines
+ lea ref1q, [ref1q+ref_strideq*2]
+ lea ref2q, [ref2q+ref_strideq*2]
+ lea ref3q, [ref3q+ref_strideq*2]
+ lea ref4q, [ref4q+ref_strideq*2]
+%endmacro
+
+; PROCESS_4x2x4 first
+%macro PROCESS_4x2x4 1
+ movd m0, [srcq]
+ HANDLE_SECOND_OFFSET
+%if %1 == 1
+ movd m6, [ref1q]
+ movd m4, [ref2q]
+ movd m7, [ref3q]
+ movd m5, [ref4q]
+
+ movd m1, [srcq + second_offset]
+ movd m2, [ref1q+ref_strideq]
+ punpckldq m0, m1
+ punpckldq m6, m2
+ movd m1, [ref2q+ref_strideq]
+ movd m2, [ref3q+ref_strideq]
+ movd m3, [ref4q+ref_strideq]
+ punpckldq m4, m1
+ punpckldq m7, m2
+ punpckldq m5, m3
+ movlhps m0, m0
+ movlhps m6, m4
+ movlhps m7, m5
+ psadbw m6, m0
+ psadbw m7, m0
+%else
+ movd m1, [ref1q]
+ movd m5, [ref1q+ref_strideq]
+ movd m2, [ref2q]
+ movd m4, [ref2q+ref_strideq]
+ punpckldq m1, m5
+ punpckldq m2, m4
+ movd m3, [ref3q]
+ movd m5, [ref3q+ref_strideq]
+ punpckldq m3, m5
+ movd m4, [ref4q]
+ movd m5, [ref4q+ref_strideq]
+ punpckldq m4, m5
+ movd m5, [srcq + second_offset]
+ punpckldq m0, m5
+ movlhps m0, m0
+ movlhps m1, m2
+ movlhps m3, m4
+ psadbw m1, m0
+ psadbw m3, m0
+ paddd m6, m1
+ paddd m7, m3
+%endif
+%endmacro
+
+; PROCESS_8x2x4 first
+%macro PROCESS_8x2x4 1
+ movh m0, [srcq]
+ HANDLE_SECOND_OFFSET
+%if %1 == 1
+ movh m4, [ref1q]
+ movh m5, [ref2q]
+ movh m6, [ref3q]
+ movh m7, [ref4q]
+ movhps m0, [srcq + second_offset]
+ movhps m4, [ref1q+ref_strideq]
+ movhps m5, [ref2q+ref_strideq]
+ movhps m6, [ref3q+ref_strideq]
+ movhps m7, [ref4q+ref_strideq]
+ psadbw m4, m0
+ psadbw m5, m0
+ psadbw m6, m0
+ psadbw m7, m0
+%else
+ movh m1, [ref1q]
+ movh m2, [ref2q]
+ movhps m0, [srcq + second_offset]
+ movhps m1, [ref1q+ref_strideq]
+ movhps m2, [ref2q+ref_strideq]
+ psadbw m1, m0
+ psadbw m2, m0
+ paddd m4, m1
+ paddd m5, m2
+
+ movh m1, [ref3q]
+ movhps m1, [ref3q+ref_strideq]
+ movh m2, [ref4q]
+ movhps m2, [ref4q+ref_strideq]
+ psadbw m1, m0
+ psadbw m2, m0
+ paddd m6, m1
+ paddd m7, m2
+%endif
+%endmacro
+
+; PROCESS_FIRST_MMSIZE
+%macro PROCESS_FIRST_MMSIZE 0
+ mova m0, [srcq]
+ movu m4, [ref1q]
+ movu m5, [ref2q]
+ movu m6, [ref3q]
+ movu m7, [ref4q]
+ psadbw m4, m0
+ psadbw m5, m0
+ psadbw m6, m0
+ psadbw m7, m0
+%endmacro
+
+; PROCESS_16x1x4 offset
+%macro PROCESS_16x1x4 1
+ mova m0, [srcq + %1]
+ movu m1, [ref1q + ref_offsetq + %1]
+ movu m2, [ref2q + ref_offsetq + %1]
+ psadbw m1, m0
+ psadbw m2, m0
+ paddd m4, m1
+ paddd m5, m2
+
+ movu m1, [ref3q + ref_offsetq + %1]
+ movu m2, [ref4q + ref_offsetq + %1]
+ psadbw m1, m0
+ psadbw m2, m0
+ paddd m6, m1
+ paddd m7, m2
+%endmacro
+
+; void aom_sadNxNx4d_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref[4], int ref_stride,
+; uint32_t res[4]);
+; Macro Arguments:
+; 1: Width
+; 2: Height
+; 3: If 0, then normal sad, else skip rows
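+; e.g. 'SADNXN4D 16, 8' emits sad16x8x4d and 'SADNXN4D 16, 8, 1' emits
+; sad_skip_16x8x4d.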
+%macro SADNXN4D 2-3 0
+
+%define spill_src_stride 0
+%define spill_ref_stride 0
+%define spill_cnt 0
+
+; Whether a shared offset should be used instead of adding strides to
+; each reference array. With this option, only one line will be processed
+; per loop iteration.
+%define use_ref_offset (%1 >= mmsize)
+
+; Remove loops in the 4x4 and 8x4 case
+%define use_loop (use_ref_offset || %2 > 4)
+
+%if %3 == 1 ; skip rows
+%if AOM_ARCH_X86_64
+%if use_ref_offset
+cglobal sad_skip_%1x%2x4d, 5, 10, 8, src, src_stride, ref1, ref_stride, res, \
+ ref2, ref3, ref4, cnt, ref_offset
+%elif use_loop
+cglobal sad_skip_%1x%2x4d, 5, 9, 8, src, src_stride, ref1, ref_stride, res, \
+ ref2, ref3, ref4, cnt
+%else
+cglobal sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, res, \
+ ref2, ref3, ref4
+%endif
+%else
+%if use_ref_offset
+cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, ref_offset, ref1, cnt, ref2, ref3, \
+ ref4
+%define spill_src_stride 1
+%define spill_ref_stride 1
+%elif use_loop
+cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, cnt, ref1, ref_stride, ref2, \
+ ref3, ref4
+%define spill_src_stride 1
+%else
+cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, ref2, \
+ ref3, ref4
+%endif
+%endif
+%else ; normal sad
+%if AOM_ARCH_X86_64
+%if use_ref_offset
+cglobal sad%1x%2x4d, 5, 10, 8, src, src_stride, ref1, ref_stride, res, ref2, \
+ ref3, ref4, cnt, ref_offset
+%elif use_loop
+cglobal sad%1x%2x4d, 5, 9, 8, src, src_stride, ref1, ref_stride, res, ref2, \
+ ref3, ref4, cnt
+%else
+cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, res, ref2, \
+ ref3, ref4
+%endif
+%else
+%if use_ref_offset
+cglobal sad%1x%2x4d, 4, 7, 8, src, ref_offset, ref1, cnt, ref2, ref3, ref4
+ %define spill_src_stride 1
+ %define spill_ref_stride 1
+%elif use_loop
+cglobal sad%1x%2x4d, 4, 7, 8, src, cnt, ref1, ref_stride, ref2, ref3, ref4
+ %define spill_src_stride 1
+%else
+cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, ref2, ref3, \
+ ref4
+%endif
+%endif
+%endif
+
+%if spill_src_stride
+ %define src_strideq r1mp
+ %define src_strided r1mp
+%endif
+%if spill_ref_stride
+ %define ref_strideq r3mp
+ %define ref_strided r3mp
+%endif
+
+%if spill_cnt
+ SUB rsp, 4
+ %define cntd word [rsp]
+%endif
+
+%if %3 == 1
+ sal src_strided, 1
+ sal ref_strided, 1
+%endif
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+
+ mov ref2q, [ref1q+gprsize*1]
+ mov ref3q, [ref1q+gprsize*2]
+ mov ref4q, [ref1q+gprsize*3]
+ mov ref1q, [ref1q+gprsize*0]
+
+; Is the loop for this wxh in another function?
+; If so, we jump into that function for the loop and return from there.
+%define external_loop (use_ref_offset && %1 > mmsize && %1 != %2)
+
+%if use_ref_offset
+ PROCESS_FIRST_MMSIZE
+%if %1 > mmsize
+ mov ref_offsetq, 0
+ mov cntd, %2 >> %3
+; Jump part way into the loop for the square version of this width
+%if %3 == 1
+ jmp mangle(private_prefix %+ _sad_skip_%1x%1x4d %+ SUFFIX).midloop
+%else
+ jmp mangle(private_prefix %+ _sad%1x%1x4d %+ SUFFIX).midloop
+%endif
+%else
+ mov ref_offsetq, ref_strideq
+ add srcq, src_strideq
+ mov cntd, (%2 >> %3) - 1
+%endif
+%if external_loop == 0
+.loop:
+; Unrolled horizontal loop
+%assign h_offset 0
+%rep %1/mmsize
+ PROCESS_16x1x4 h_offset
+%if h_offset == 0
+; The first mmsize-wide column of the first row is done outside the loop and jumps here
+.midloop:
+%endif
+%assign h_offset h_offset+mmsize
+%endrep
+
+ add srcq, src_strideq
+ add ref_offsetq, ref_strideq
+ sub cntd, 1
+ jnz .loop
+%endif
+%else
+ PROCESS_%1x2x4 1
+ ADVANCE_END_OF_TWO_LINES
+%if use_loop
+ mov cntd, (%2/2 >> %3) - 1
+.loop:
+%endif
+ PROCESS_%1x2x4 0
+%if use_loop
+ ADVANCE_END_OF_TWO_LINES
+ sub cntd, 1
+ jnz .loop
+%endif
+%endif
+
+%if spill_cnt
+; Undo stack allocation for cnt
+ ADD rsp, 4
+%endif
+
+%if external_loop == 0
+%if %3 == 0
+ %define resultq r4
+ %define resultmp r4mp
+%endif
+
+; Undo modifications on parameters on the stack
+%if %3 == 1
+%if spill_src_stride
+ shr src_strided, 1
+%endif
+%if spill_ref_stride
+ shr ref_strided, 1
+%endif
+%endif
+
+%if %1 > 4
+ pslldq m5, 4
+ pslldq m7, 4
+ por m4, m5
+ por m6, m7
+ mova m5, m4
+ mova m7, m6
+ punpcklqdq m4, m6
+ punpckhqdq m5, m7
+ paddd m4, m5
+%if %3 == 1
+ pslld m4, 1
+%endif
+ movifnidn resultq, resultmp
+ movu [resultq], m4
+ RET
+%else
+ pshufd m6, m6, 0x08
+ pshufd m7, m7, 0x08
+%if %3 == 1
+ pslld m6, 1
+ pslld m7, 1
+%endif
+ movifnidn resultq, resultmp
+ movq [resultq+0], m6
+ movq [resultq+8], m7
+ RET
+%endif
+%endif ; external_loop == 0
+%endmacro
+
+INIT_XMM sse2
+SADNXN4D 128, 128
+SADNXN4D 128, 64
+SADNXN4D 64, 128
+SADNXN4D 64, 64
+SADNXN4D 64, 32
+SADNXN4D 32, 64
+SADNXN4D 32, 32
+SADNXN4D 32, 16
+SADNXN4D 16, 32
+SADNXN4D 16, 16
+SADNXN4D 16, 8
+SADNXN4D 8, 16
+SADNXN4D 8, 8
+SADNXN4D 8, 4
+SADNXN4D 4, 8
+SADNXN4D 4, 4
+%if CONFIG_REALTIME_ONLY==0
+SADNXN4D 4, 16
+SADNXN4D 16, 4
+SADNXN4D 8, 32
+SADNXN4D 32, 8
+SADNXN4D 16, 64
+SADNXN4D 64, 16
+%endif
+SADNXN4D 128, 128, 1
+SADNXN4D 128, 64, 1
+SADNXN4D 64, 128, 1
+SADNXN4D 64, 64, 1
+SADNXN4D 64, 32, 1
+SADNXN4D 32, 64, 1
+SADNXN4D 32, 32, 1
+SADNXN4D 32, 16, 1
+SADNXN4D 16, 32, 1
+SADNXN4D 16, 16, 1
+SADNXN4D 16, 8, 1
+SADNXN4D 8, 16, 1
+SADNXN4D 8, 8, 1
+SADNXN4D 4, 8, 1
+%if CONFIG_REALTIME_ONLY==0
+SADNXN4D 4, 16, 1
+SADNXN4D 8, 32, 1
+SADNXN4D 32, 8, 1
+SADNXN4D 16, 64, 1
+SADNXN4D 64, 16, 1
+%endif
+
+; Different assembly is needed when the height gets subsampled to 2
+; SADNXN4D 16, 4, 1
+; SADNXN4D 8, 4, 1
+; SADNXN4D 4, 4, 1
diff --git a/third_party/aom/aom_dsp/x86/sad_avx2.c b/third_party/aom/aom_dsp/x86/sad_avx2.c
new file mode 100644
index 0000000000..24cea76b37
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sad_avx2.c
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+
+static INLINE unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ int i;
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
+ __m256i sum_sad = _mm256_setzero_si256();
+ __m256i sum_sad_h;
+ __m128i sum_sad128;
+ for (i = 0; i < h; i++) {
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));
+ sad1_reg =
+ _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
+ sad2_reg = _mm256_sad_epu8(
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));
+ sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
+ ref_ptr += ref_stride;
+ src_ptr += src_stride;
+ }
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8);
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
+ unsigned int res = (unsigned int)_mm_cvtsi128_si32(sum_sad128);
+ _mm256_zeroupper();
+ return res;
+}
+
+static INLINE unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ int i;
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
+ __m256i sum_sad = _mm256_setzero_si256();
+ __m256i sum_sad_h;
+ __m128i sum_sad128;
+ int ref2_stride = ref_stride << 1;
+ int src2_stride = src_stride << 1;
+ int max = h >> 1;
+ for (i = 0; i < max; i++) {
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride));
+ sad1_reg =
+ _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
+ sad2_reg = _mm256_sad_epu8(
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));
+ sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
+ ref_ptr += ref2_stride;
+ src_ptr += src2_stride;
+ }
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8);
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
+ unsigned int res = (unsigned int)_mm_cvtsi128_si32(sum_sad128);
+ _mm256_zeroupper();
+ return res;
+}
+
+#define FSAD64_H(h) \
+ unsigned int aom_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \
+ }
+
+#define FSADS64_H(h) \
+ unsigned int aom_sad_skip_64x##h##_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+ h / 2); \
+ }
+
+#define FSAD32_H(h) \
+ unsigned int aom_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \
+ }
+
+#define FSADS32_H(h) \
+ unsigned int aom_sad_skip_32x##h##_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+ h / 2); \
+ }
+
+#define FSAD64 \
+ FSAD64_H(64) \
+ FSAD64_H(32) \
+ FSADS64_H(64) \
+ FSADS64_H(32)
+
+#define FSAD32 \
+ FSAD32_H(64) \
+ FSAD32_H(32) \
+ FSAD32_H(16) \
+ FSADS32_H(64) \
+ FSADS32_H(32) \
+ FSADS32_H(16)
+
+/* clang-format off */
+FSAD64
+FSAD32
+/* clang-format on */
+
+#undef FSAD64
+#undef FSAD32
+#undef FSAD64_H
+#undef FSAD32_H
+
+#define FSADAVG64_H(h) \
+ unsigned int aom_sad64x##h##_avg_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ int i; \
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+ __m256i sum_sad = _mm256_setzero_si256(); \
+ __m256i sum_sad_h; \
+ __m128i sum_sad128; \
+ for (i = 0; i < h; i++) { \
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \
+ ref1_reg = _mm256_avg_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \
+ ref2_reg = _mm256_avg_epu8( \
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
+ sad1_reg = _mm256_sad_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
+ sad2_reg = _mm256_sad_epu8( \
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \
+ sum_sad = \
+ _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+ ref_ptr += ref_stride; \
+ src_ptr += src_stride; \
+ second_pred += 64; \
+ } \
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+ unsigned int res = (unsigned int)_mm_cvtsi128_si32(sum_sad128); \
+ _mm256_zeroupper(); \
+ return res; \
+ }
+
+#define FSADAVG32_H(h) \
+ unsigned int aom_sad32x##h##_avg_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ int i; \
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+ __m256i sum_sad = _mm256_setzero_si256(); \
+ __m256i sum_sad_h; \
+ __m128i sum_sad128; \
+ int ref2_stride = ref_stride << 1; \
+ int src2_stride = src_stride << 1; \
+ int max = h >> 1; \
+ for (i = 0; i < max; i++) { \
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
+ ref1_reg = _mm256_avg_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \
+ ref2_reg = _mm256_avg_epu8( \
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
+ sad1_reg = _mm256_sad_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
+ sad2_reg = _mm256_sad_epu8( \
+ ref2_reg, \
+ _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \
+ sum_sad = \
+ _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+ ref_ptr += ref2_stride; \
+ src_ptr += src2_stride; \
+ second_pred += 64; \
+ } \
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+ unsigned int res = (unsigned int)_mm_cvtsi128_si32(sum_sad128); \
+ _mm256_zeroupper(); \
+ return res; \
+ }
+
+#define FSADAVG64 \
+ FSADAVG64_H(64) \
+ FSADAVG64_H(32)
+
+#define FSADAVG32 \
+ FSADAVG32_H(64) \
+ FSADAVG32_H(32) \
+ FSADAVG32_H(16)
+
+/* clang-format off */
+FSADAVG64
+FSADAVG32
+/* clang-format on */
+
+#undef FSADAVG64
+#undef FSADAVG32
+#undef FSADAVG64_H
+#undef FSADAVG32_H
diff --git a/third_party/aom/aom_dsp/x86/sad_impl_avx2.c b/third_party/aom/aom_dsp/x86/sad_impl_avx2.c
new file mode 100644
index 0000000000..c5da6e9ab3
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sad_impl_avx2.c
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
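+// The large block sizes below are built by tiling a 32x32 AVX2 kernel:
+// 64x32 is two 32x32 tiles side by side, 64x64 stacks two 64x32 tiles,
+// 128x64 is two 64x64 tiles side by side, and so on.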
+static unsigned int sad32x32(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ __m256i s1, s2, r1, r2;
+ __m256i sum = _mm256_setzero_si256();
+ __m128i sum_i128;
+ int i;
+
+ for (i = 0; i < 16; ++i) {
+ r1 = _mm256_loadu_si256((__m256i const *)ref_ptr);
+ r2 = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride));
+ s1 = _mm256_sad_epu8(r1, _mm256_loadu_si256((__m256i const *)src_ptr));
+ s2 = _mm256_sad_epu8(
+ r2, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));
+ sum = _mm256_add_epi32(sum, _mm256_add_epi32(s1, s2));
+ ref_ptr += ref_stride << 1;
+ src_ptr += src_stride << 1;
+ }
+
+ sum = _mm256_add_epi32(sum, _mm256_srli_si256(sum, 8));
+ sum_i128 = _mm_add_epi32(_mm256_extracti128_si256(sum, 1),
+ _mm256_castsi256_si128(sum));
+ return (unsigned int)_mm_cvtsi128_si32(sum_i128);
+}
+
+static unsigned int sad64x32(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ unsigned int half_width = 32;
+ uint32_t sum = sad32x32(src_ptr, src_stride, ref_ptr, ref_stride);
+ src_ptr += half_width;
+ ref_ptr += half_width;
+ sum += sad32x32(src_ptr, src_stride, ref_ptr, ref_stride);
+ return sum;
+}
+
+static unsigned int sad64x64(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ uint32_t sum = sad64x32(src_ptr, src_stride, ref_ptr, ref_stride);
+ src_ptr += src_stride << 5;
+ ref_ptr += ref_stride << 5;
+ sum += sad64x32(src_ptr, src_stride, ref_ptr, ref_stride);
+ return sum;
+}
+
+unsigned int aom_sad128x64_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ unsigned int half_width = 64;
+ uint32_t sum = sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
+ src_ptr += half_width;
+ ref_ptr += half_width;
+ sum += sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
+ return sum;
+}
+
+unsigned int aom_sad64x128_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ uint32_t sum = sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
+ src_ptr += src_stride << 6;
+ ref_ptr += ref_stride << 6;
+ sum += sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
+ return sum;
+}
+
+unsigned int aom_sad128x128_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ uint32_t sum = aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride);
+ src_ptr += src_stride << 6;
+ ref_ptr += ref_stride << 6;
+ sum += aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride);
+ return sum;
+}
+
+unsigned int aom_sad_skip_128x64_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ const uint32_t half_width = 64;
+ uint32_t sum = sad64x32(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2);
+ src_ptr += half_width;
+ ref_ptr += half_width;
+ sum += sad64x32(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2);
+ return 2 * sum;
+}
+
+unsigned int aom_sad_skip_64x128_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ const uint32_t sum =
+ sad64x64(src_ptr, 2 * src_stride, ref_ptr, 2 * ref_stride);
+ return 2 * sum;
+}
+
+unsigned int aom_sad_skip_128x128_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ const uint32_t sum =
+ aom_sad128x64_avx2(src_ptr, 2 * src_stride, ref_ptr, 2 * ref_stride);
+ return 2 * sum;
+}
+
+static unsigned int sad_w64_avg_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const int h, const uint8_t *second_pred,
+ const int second_pred_stride) {
+ int i;
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
+ __m256i sum_sad = _mm256_setzero_si256();
+ __m256i sum_sad_h;
+ __m128i sum_sad128;
+ for (i = 0; i < h; i++) {
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));
+ ref1_reg = _mm256_avg_epu8(
+ ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));
+ ref2_reg = _mm256_avg_epu8(
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32)));
+ sad1_reg =
+ _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
+ sad2_reg = _mm256_sad_epu8(
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));
+ sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
+ ref_ptr += ref_stride;
+ src_ptr += src_stride;
+ second_pred += second_pred_stride;
+ }
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8);
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
+ return (unsigned int)_mm_cvtsi128_si32(sum_sad128);
+}
+
+unsigned int aom_sad64x128_avg_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred) {
+ uint32_t sum = sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64,
+ second_pred, 64);
+ src_ptr += src_stride << 6;
+ ref_ptr += ref_stride << 6;
+ second_pred += 64 << 6;
+ sum += sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64,
+ second_pred, 64);
+ return sum;
+}
+
+unsigned int aom_sad128x64_avg_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred) {
+ unsigned int half_width = 64;
+ uint32_t sum = sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64,
+ second_pred, 128);
+ src_ptr += half_width;
+ ref_ptr += half_width;
+ second_pred += half_width;
+ sum += sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64,
+ second_pred, 128);
+ return sum;
+}
+
+unsigned int aom_sad128x128_avg_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred) {
+ uint32_t sum = aom_sad128x64_avg_avx2(src_ptr, src_stride, ref_ptr,
+ ref_stride, second_pred);
+ src_ptr += src_stride << 6;
+ ref_ptr += ref_stride << 6;
+ second_pred += 128 << 6;
+ sum += aom_sad128x64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride,
+ second_pred);
+ return sum;
+}
diff --git a/third_party/aom/aom_dsp/x86/sad_sse2.asm b/third_party/aom/aom_dsp/x86/sad_sse2.asm
new file mode 100644
index 0000000000..dbe8ca3161
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sad_sse2.asm
@@ -0,0 +1,432 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; Macro Arguments
+; Arg 1: Width
+; Arg 2: Height
+; Arg 3: Number of general purpose registers: 5 for 32-bit build, 6 for 64-bit
+; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
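+; e.g. 'SAD_FN 64, 32, 5, 2' (as used by 'SAD64XN 32, 2' below) emits the
+; prologue for sad_skip_64x32.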
+%macro SAD_FN 4
+%if %4 == 0 ; normal sad
+%if %3 == 5
+cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+
+%elif %4 == 2 ; skip
+%if %3 == 5
+cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+
+%else
+%if %3 == 5
+cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
+ second_pred, n_rows
+%else ; %3 == 7
+cglobal sad%1x%2_avg, 5, AOM_ARCH_X86_64 + %3, 6, src, src_stride, \
+ ref, ref_stride, \
+ second_pred, \
+ src_stride3, ref_stride3
+%if AOM_ARCH_X86_64
+%define n_rowsd r7d
+%else ; x86-32
+%define n_rowsd dword r0m
+%endif ; x86-32/64
+%endif ; %3 == 5/7
+%endif ; sad/avg/skip
+%if %4 == 2; skip rows so double the stride
+lea src_strided, [src_strided*2]
+lea ref_strided, [ref_strided*2]
+%endif ; %4 skip
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+%if %3 == 7
+ lea src_stride3q, [src_strideq*3]
+ lea ref_stride3q, [ref_strideq*3]
+%endif ; %3 == 7
+%endmacro
+
+; unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD128XN 1-2 0
+ SAD_FN 128, %1, 5, %2
+%if %2 == 2
+ mov n_rowsd, %1/2
+%else
+ mov n_rowsd, %1
+%endif
+ pxor m0, m0
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+%endif
+ psadbw m1, [srcq]
+ psadbw m2, [srcq+16]
+ psadbw m3, [srcq+32]
+ psadbw m4, [srcq+48]
+
+ paddd m1, m2
+ paddd m3, m4
+ paddd m0, m1
+ paddd m0, m3
+
+ movu m1, [refq+64]
+ movu m2, [refq+80]
+ movu m3, [refq+96]
+ movu m4, [refq+112]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*4]
+ pavgb m2, [second_predq+mmsize*5]
+ pavgb m3, [second_predq+mmsize*6]
+ pavgb m4, [second_predq+mmsize*7]
+ lea second_predq, [second_predq+mmsize*8]
+%endif
+ psadbw m1, [srcq+64]
+ psadbw m2, [srcq+80]
+ psadbw m3, [srcq+96]
+ psadbw m4, [srcq+112]
+
+ add refq, ref_strideq
+ add srcq, src_strideq
+
+ paddd m1, m2
+ paddd m3, m4
+ paddd m0, m1
+ paddd m0, m3
+
+ sub n_rowsd, 1
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD128XN 128 ; sad128x128_sse2
+SAD128XN 128, 1 ; sad128x128_avg_sse2
+SAD128XN 128, 2 ; sad128x128_skip_sse2
+SAD128XN 64 ; sad128x64_sse2
+SAD128XN 64, 1 ; sad128x64_avg_sse2
+SAD128XN 64, 2 ; sad128x64_skip_sse2
+
+
+; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD64XN 1-2 0
+ SAD_FN 64, %1, 5, %2
+%if %2 == 2
+ mov n_rowsd, %1/2
+%else
+ mov n_rowsd, %1
+%endif
+ pxor m0, m0
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ psadbw m1, [srcq]
+ psadbw m2, [srcq+16]
+ psadbw m3, [srcq+32]
+ psadbw m4, [srcq+48]
+ paddd m1, m2
+ paddd m3, m4
+ add refq, ref_strideq
+ paddd m0, m1
+ add srcq, src_strideq
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD64XN 128 ; sad64x128_sse2
+SAD64XN 64 ; sad64x64_sse2
+SAD64XN 32 ; sad64x32_sse2
+SAD64XN 16 ; sad64x16_sse2
+SAD64XN 128, 1 ; sad64x128_avg_sse2
+SAD64XN 64, 1 ; sad64x64_avg_sse2
+SAD64XN 32, 1 ; sad64x32_avg_sse2
+SAD64XN 16, 1 ; sad64x16_avg_sse2
+SAD64XN 128, 2 ; sad64x128_skip_sse2
+SAD64XN 64, 2 ; sad64x64_skip_sse2
+SAD64XN 32, 2 ; sad64x32_skip_sse2
+SAD64XN 16, 2 ; sad64x16_skip_sse2
+
+; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD32XN 1-2 0
+ SAD_FN 32, %1, 5, %2
+%if %2 == 2
+ mov n_rowsd, %1/4
+%else
+ mov n_rowsd, %1/2
+%endif
+ pxor m0, m0
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+ref_strideq]
+ movu m4, [refq+ref_strideq+16]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ psadbw m1, [srcq]
+ psadbw m2, [srcq+16]
+ psadbw m3, [srcq+src_strideq]
+ psadbw m4, [srcq+src_strideq+16]
+ paddd m1, m2
+ paddd m3, m4
+ lea refq, [refq+ref_strideq*2]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*2]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD32XN 64 ; sad32x64_sse2
+SAD32XN 32 ; sad32x32_sse2
+SAD32XN 16 ; sad32x16_sse2
+SAD32XN 8 ; sad_32x8_sse2
+SAD32XN 64, 1 ; sad32x64_avg_sse2
+SAD32XN 32, 1 ; sad32x32_avg_sse2
+SAD32XN 16, 1 ; sad32x16_avg_sse2
+SAD32XN 8, 1 ; sad_32x8_avg_sse2
+SAD32XN 64, 2 ; sad32x64_skip_sse2
+SAD32XN 32, 2 ; sad32x32_skip_sse2
+SAD32XN 16, 2 ; sad32x16_skip_sse2
+SAD32XN 8, 2 ; sad_32x8_skip_sse2
+
+; unsigned int aom_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD16XN 1-2 0
+ SAD_FN 16, %1, 7, %2
+%if %2 == 2
+ mov n_rowsd, %1/8
+%else
+ mov n_rowsd, %1/4
+%endif
+ pxor m0, m0
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+ref_strideq]
+ movu m3, [refq+ref_strideq*2]
+ movu m4, [refq+ref_stride3q]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ psadbw m1, [srcq]
+ psadbw m2, [srcq+src_strideq]
+ psadbw m3, [srcq+src_strideq*2]
+ psadbw m4, [srcq+src_stride3q]
+ paddd m1, m2
+ paddd m3, m4
+ lea refq, [refq+ref_strideq*4]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD16XN 64 ; sad_16x64_sse2
+SAD16XN 32 ; sad16x32_sse2
+SAD16XN 16 ; sad16x16_sse2
+SAD16XN 8 ; sad16x8_sse2
+SAD16XN 4 ; sad_16x4_sse2
+SAD16XN 64, 1 ; sad_16x64_avg_sse2
+SAD16XN 32, 1 ; sad16x32_avg_sse2
+SAD16XN 16, 1 ; sad16x16_avg_sse2
+SAD16XN 8, 1 ; sad16x8_avg_sse2
+SAD16XN 4, 1 ; sad_16x4_avg_sse2
+SAD16XN 64, 2 ; sad_16x64_skip_sse2
+SAD16XN 32, 2 ; sad16x32_skip_sse2
+SAD16XN 16, 2 ; sad16x16_skip_sse2
+SAD16XN 8, 2 ; sad16x8_skip_sse2
+
+; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD8XN 1-2 0
+ SAD_FN 8, %1, 7, %2
+%if %2 == 2
+ mov n_rowsd, %1/8
+%else
+ mov n_rowsd, %1/4
+%endif
+ pxor m0, m0
+
+.loop:
+ movh m1, [refq]
+ movhps m1, [refq+ref_strideq]
+ movh m2, [refq+ref_strideq*2]
+ movhps m2, [refq+ref_stride3q]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ lea second_predq, [second_predq+mmsize*2]
+%endif
+ movh m3, [srcq]
+ movhps m3, [srcq+src_strideq]
+ movh m4, [srcq+src_strideq*2]
+ movhps m4, [srcq+src_stride3q]
+ psadbw m1, m3
+ psadbw m2, m4
+ lea refq, [refq+ref_strideq*4]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ paddd m0, m2
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD8XN 32 ; sad_8x32_sse2
+SAD8XN 16 ; sad8x16_sse2
+SAD8XN 8 ; sad8x8_sse2
+SAD8XN 4 ; sad8x4_sse2
+SAD8XN 32, 1 ; sad_8x32_avg_sse2
+SAD8XN 16, 1 ; sad8x16_avg_sse2
+SAD8XN 8, 1 ; sad8x8_avg_sse2
+SAD8XN 4, 1 ; sad8x4_avg_sse2
+SAD8XN 32, 2 ; sad_8x32_skip_sse2
+SAD8XN 16, 2 ; sad8x16_skip_sse2
+SAD8XN 8, 2 ; sad8x8_skip_sse2
+
+; unsigned int aom_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD4XN 1-2 0
+ SAD_FN 4, %1, 7, %2
+%if %2 == 2
+ mov n_rowsd, %1/8
+%else
+ mov n_rowsd, %1/4
+%endif
+ pxor m0, m0
+
+.loop:
+ movd m1, [refq]
+ movd m2, [refq+ref_strideq]
+ movd m3, [refq+ref_strideq*2]
+ movd m4, [refq+ref_stride3q]
+ punpckldq m1, m2
+ punpckldq m3, m4
+ movlhps m1, m3
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ lea second_predq, [second_predq+mmsize*1]
+%endif
+ movd m2, [srcq]
+ movd m5, [srcq+src_strideq]
+ movd m4, [srcq+src_strideq*2]
+ movd m3, [srcq+src_stride3q]
+ punpckldq m2, m5
+ punpckldq m4, m3
+ movlhps m2, m4
+ psadbw m1, m2
+ lea refq, [refq+ref_strideq*4]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD4XN 16 ; sad4x16_sse2
+SAD4XN 8 ; sad4x8_sse2
+SAD4XN 4 ; sad4x4_sse2
+SAD4XN 16, 1 ; sad4x16_avg_sse2
+SAD4XN 8, 1 ; sad4x8_avg_sse2
+SAD4XN 4, 1 ; sad4x4_avg_sse2
+SAD4XN 16, 2 ; sad4x16_skip_sse2
+SAD4XN 8, 2 ; sad4x8_skip_sse2
diff --git a/third_party/aom/aom_dsp/x86/sse_avx2.c b/third_party/aom/aom_dsp/x86/sse_avx2.c
new file mode 100644
index 0000000000..c5a5f5c234
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sse_avx2.c
@@ -0,0 +1,389 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a,
+ const uint8_t *b) {
+ const __m256i v_a0 = yy_loadu_256(a);
+ const __m256i v_b0 = yy_loadu_256(b);
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i v_a00_w = _mm256_unpacklo_epi8(v_a0, zero);
+ const __m256i v_a01_w = _mm256_unpackhi_epi8(v_a0, zero);
+ const __m256i v_b00_w = _mm256_unpacklo_epi8(v_b0, zero);
+ const __m256i v_b01_w = _mm256_unpackhi_epi8(v_b0, zero);
+ const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w);
+ const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w);
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w));
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w));
+}
+
+static INLINE int64_t summary_all_avx2(const __m256i *sum_all) {
+ int64_t sum;
+ __m256i zero = _mm256_setzero_si256();
+ const __m256i sum0_4x64 = _mm256_unpacklo_epi32(*sum_all, zero);
+ const __m256i sum1_4x64 = _mm256_unpackhi_epi32(*sum_all, zero);
+ const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64);
+ const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
+ _mm256_extracti128_si256(sum_4x64, 1));
+ const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+ xx_storel_64(&sum, sum_1x64);
+ return sum;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void summary_32_avx2(const __m256i *sum32, __m256i *sum) {
+ const __m256i sum0_4x64 =
+ _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum32));
+ const __m256i sum1_4x64 =
+ _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum32, 1));
+ const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64);
+ *sum = _mm256_add_epi64(*sum, sum_4x64);
+}
+
+static INLINE int64_t summary_4x64_avx2(const __m256i sum_4x64) {
+ int64_t sum;
+ const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
+ _mm256_extracti128_si256(sum_4x64, 1));
+ const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+
+ xx_storel_64(&sum, sum_1x64);
+ return sum;
+}
+#endif
+
+static INLINE void sse_w4x4_avx2(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, __m256i *sum) {
+ const __m128i v_a0 = xx_loadl_32(a);
+ const __m128i v_a1 = xx_loadl_32(a + a_stride);
+ const __m128i v_a2 = xx_loadl_32(a + a_stride * 2);
+ const __m128i v_a3 = xx_loadl_32(a + a_stride * 3);
+ const __m128i v_b0 = xx_loadl_32(b);
+ const __m128i v_b1 = xx_loadl_32(b + b_stride);
+ const __m128i v_b2 = xx_loadl_32(b + b_stride * 2);
+ const __m128i v_b3 = xx_loadl_32(b + b_stride * 3);
+ const __m128i v_a0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_a0, v_a1),
+ _mm_unpacklo_epi32(v_a2, v_a3));
+ const __m128i v_b0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_b0, v_b1),
+ _mm_unpacklo_epi32(v_b2, v_b3));
+ const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123);
+ const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123);
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+static INLINE void sse_w8x2_avx2(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, __m256i *sum) {
+ const __m128i v_a0 = xx_loadl_64(a);
+ const __m128i v_a1 = xx_loadl_64(a + a_stride);
+ const __m128i v_b0 = xx_loadl_64(b);
+ const __m128i v_b1 = xx_loadl_64(b + b_stride);
+ const __m256i v_a_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1));
+ const __m256i v_b_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1));
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
+ int32_t y = 0;
+ int64_t sse = 0;
+ __m256i sum = _mm256_setzero_si256();
+ __m256i zero = _mm256_setzero_si256();
+ switch (width) {
+ case 4:
+ do {
+ sse_w4x4_avx2(a, a_stride, b, b_stride, &sum);
+ a += a_stride << 2;
+ b += b_stride << 2;
+ y += 4;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 8:
+ do {
+ sse_w8x2_avx2(a, a_stride, b, b_stride, &sum);
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 16:
+ do {
+ const __m128i v_a0 = xx_loadu_128(a);
+ const __m128i v_a1 = xx_loadu_128(a + a_stride);
+ const __m128i v_b0 = xx_loadu_128(b);
+ const __m128i v_b1 = xx_loadu_128(b + b_stride);
+ const __m256i v_a =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(v_a0), v_a1, 0x01);
+ const __m256i v_b =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(v_b0), v_b1, 0x01);
+ const __m256i v_al = _mm256_unpacklo_epi8(v_a, zero);
+ const __m256i v_au = _mm256_unpackhi_epi8(v_a, zero);
+ const __m256i v_bl = _mm256_unpacklo_epi8(v_b, zero);
+ const __m256i v_bu = _mm256_unpackhi_epi8(v_b, zero);
+ const __m256i v_asub = _mm256_sub_epi16(v_al, v_bl);
+ const __m256i v_bsub = _mm256_sub_epi16(v_au, v_bu);
+ const __m256i temp =
+ _mm256_add_epi32(_mm256_madd_epi16(v_asub, v_asub),
+ _mm256_madd_epi16(v_bsub, v_bsub));
+ sum = _mm256_add_epi32(sum, temp);
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 32:
+ do {
+ sse_w32_avx2(&sum, a, b);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 64:
+ do {
+ sse_w32_avx2(&sum, a, b);
+ sse_w32_avx2(&sum, a + 32, b + 32);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 128:
+ do {
+ sse_w32_avx2(&sum, a, b);
+ sse_w32_avx2(&sum, a + 32, b + 32);
+ sse_w32_avx2(&sum, a + 64, b + 64);
+ sse_w32_avx2(&sum, a + 96, b + 96);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ default:
+ if ((width & 0x07) == 0) {
+ do {
+ int i = 0;
+ do {
+ sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum);
+ i += 8;
+ } while (i < width);
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ } else {
+ do {
+ int i = 0;
+ do {
+ sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum);
+ const uint8_t *a2 = a + i + (a_stride << 1);
+ const uint8_t *b2 = b + i + (b_stride << 1);
+ sse_w8x2_avx2(a2, a_stride, b2, b_stride, &sum);
+ i += 8;
+ } while (i + 4 < width);
+ sse_w4x4_avx2(a + i, a_stride, b + i, b_stride, &sum);
+ a += a_stride << 2;
+ b += b_stride << 2;
+ y += 4;
+ } while (y < height);
+ }
+ sse = summary_all_avx2(&sum);
+ break;
+ }
+
+ return sse;
+}
+
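All of the width-specialised paths in `aom_sse_avx2()` above compute the same quantity; a plain scalar equivalent (an illustrative sketch, not upstream code) is:

```c
#include <stdint.h>

// Sum of squared differences between two 8-bit blocks.
static int64_t sse_sketch(const uint8_t *a, int a_stride, const uint8_t *b,
                          int b_stride, int width, int height) {
  int64_t sse = 0;
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      const int diff = a[y * a_stride + x] - b[y * b_stride + x];
      sse += diff * diff;
    }
  }
  return sse;
}
```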
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a,
+ const uint16_t *b) {
+ const __m256i v_a_w = yy_loadu_256(a);
+ const __m256i v_b_w = yy_loadu_256(b);
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+static INLINE void highbd_sse_w4x4_avx2(__m256i *sum, const uint16_t *a,
+ int a_stride, const uint16_t *b,
+ int b_stride) {
+ const __m128i v_a0 = xx_loadl_64(a);
+ const __m128i v_a1 = xx_loadl_64(a + a_stride);
+ const __m128i v_a2 = xx_loadl_64(a + a_stride * 2);
+ const __m128i v_a3 = xx_loadl_64(a + a_stride * 3);
+ const __m128i v_b0 = xx_loadl_64(b);
+ const __m128i v_b1 = xx_loadl_64(b + b_stride);
+ const __m128i v_b2 = xx_loadl_64(b + b_stride * 2);
+ const __m128i v_b3 = xx_loadl_64(b + b_stride * 3);
+ const __m256i v_a_w = yy_set_m128i(_mm_unpacklo_epi64(v_a0, v_a1),
+ _mm_unpacklo_epi64(v_a2, v_a3));
+ const __m256i v_b_w = yy_set_m128i(_mm_unpacklo_epi64(v_b0, v_b1),
+ _mm_unpacklo_epi64(v_b2, v_b3));
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+static INLINE void highbd_sse_w8x2_avx2(__m256i *sum, const uint16_t *a,
+ int a_stride, const uint16_t *b,
+ int b_stride) {
+ const __m256i v_a_w = yy_loadu2_128(a + a_stride, a);
+ const __m256i v_b_w = yy_loadu2_128(b + b_stride, b);
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+int64_t aom_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8,
+ int b_stride, int width, int height) {
+ int32_t y = 0;
+ int64_t sse = 0;
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ __m256i sum = _mm256_setzero_si256();
+ switch (width) {
+ case 4:
+ do {
+ highbd_sse_w4x4_avx2(&sum, a, a_stride, b, b_stride);
+ a += a_stride << 2;
+ b += b_stride << 2;
+ y += 4;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 8:
+ do {
+ highbd_sse_w8x2_avx2(&sum, a, a_stride, b, b_stride);
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 16:
+ do {
+ highbd_sse_w16_avx2(&sum, a, b);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 32:
+ do {
+ int l = 0;
+ __m256i sum32 = _mm256_setzero_si256();
+ do {
+ highbd_sse_w16_avx2(&sum32, a, b);
+ highbd_sse_w16_avx2(&sum32, a + 16, b + 16);
+ a += a_stride;
+ b += b_stride;
+ l += 1;
+ } while (l < 64 && l < (height - y));
+ summary_32_avx2(&sum32, &sum);
+ y += 64;
+ } while (y < height);
+ sse = summary_4x64_avx2(sum);
+ break;
+ case 64:
+ do {
+ int l = 0;
+ __m256i sum32 = _mm256_setzero_si256();
+ do {
+ highbd_sse_w16_avx2(&sum32, a, b);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3);
+ a += a_stride;
+ b += b_stride;
+ l += 1;
+ } while (l < 32 && l < (height - y));
+ summary_32_avx2(&sum32, &sum);
+ y += 32;
+ } while (y < height);
+ sse = summary_4x64_avx2(sum);
+ break;
+ case 128:
+ do {
+ int l = 0;
+ __m256i sum32 = _mm256_setzero_si256();
+ do {
+ highbd_sse_w16_avx2(&sum32, a, b);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 4, b + 16 * 4);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 5, b + 16 * 5);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 6, b + 16 * 6);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 7, b + 16 * 7);
+ a += a_stride;
+ b += b_stride;
+ l += 1;
+ } while (l < 16 && l < (height - y));
+ summary_32_avx2(&sum32, &sum);
+ y += 16;
+ } while (y < height);
+ sse = summary_4x64_avx2(sum);
+ break;
+ default:
+ if (width & 0x7) {
+ do {
+ int i = 0;
+ __m256i sum32 = _mm256_setzero_si256();
+ do {
+ highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride);
+ const uint16_t *a2 = a + i + (a_stride << 1);
+ const uint16_t *b2 = b + i + (b_stride << 1);
+ highbd_sse_w8x2_avx2(&sum32, a2, a_stride, b2, b_stride);
+ i += 8;
+ } while (i + 4 < width);
+ highbd_sse_w4x4_avx2(&sum32, a + i, a_stride, b + i, b_stride);
+ summary_32_avx2(&sum32, &sum);
+ a += a_stride << 2;
+ b += b_stride << 2;
+ y += 4;
+ } while (y < height);
+ } else {
+ do {
+ int l = 0;
+ __m256i sum32 = _mm256_setzero_si256();
+ do {
+ int i = 0;
+ do {
+ highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride);
+ i += 8;
+ } while (i < width);
+ a += a_stride << 1;
+ b += b_stride << 1;
+ l += 2;
+ } while (l < 8 && l < (height - y));
+ summary_32_avx2(&sum32, &sum);
+ y += 8;
+ } while (y < height);
+ }
+ sse = summary_4x64_avx2(sum);
+ break;
+ }
+ return sse;
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
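The high-bit-depth paths above (widths 32/64/128 and the default case) flush the 32-bit `sum32` accumulator into 64-bit lanes via `summary_32_avx2()` after a bounded number of rows (64, 32, 16 or 8, depending on width) so the per-lane `_mm256_madd_epi16` partial sums of squared 12-bit differences cannot overflow. A scalar sketch of that chunking pattern, illustrative only (`rows_per_chunk` stands in for the per-width limits):

```c
#include <stdint.h>

// Accumulate squared differences in a bounded 32-bit partial sum, then
// widen into a 64-bit total once per chunk of rows.
static int64_t highbd_sse_chunked_sketch(const uint16_t *a, int a_stride,
                                         const uint16_t *b, int b_stride,
                                         int width, int height,
                                         int rows_per_chunk) {
  int64_t total = 0;
  for (int y = 0; y < height; y += rows_per_chunk) {
    const int rows = (height - y < rows_per_chunk) ? (height - y) : rows_per_chunk;
    uint32_t partial = 0;  // bounded so it cannot overflow 32 bits
    for (int r = 0; r < rows; ++r) {
      for (int x = 0; x < width; ++x) {
        const int diff = a[(y + r) * a_stride + x] - b[(y + r) * b_stride + x];
        partial += (uint32_t)(diff * diff);
      }
    }
    total += partial;  // the widening step done by summary_32_avx2()
  }
  return total;
}
```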
diff --git a/third_party/aom/aom_dsp/x86/sse_sse4.c b/third_party/aom/aom_dsp/x86/sse_sse4.c
new file mode 100644
index 0000000000..7e74554d75
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sse_sse4.c
@@ -0,0 +1,355 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+static INLINE int64_t summary_all_sse4(const __m128i *sum_all) {
+ int64_t sum;
+ const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all);
+ const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8));
+ const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1);
+ const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+ xx_storel_64(&sum, sum_1x64);
+ return sum;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void summary_32_sse4(const __m128i *sum32, __m128i *sum64) {
+ const __m128i sum0 = _mm_cvtepu32_epi64(*sum32);
+ const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum32, 8));
+ *sum64 = _mm_add_epi64(sum0, *sum64);
+ *sum64 = _mm_add_epi64(sum1, *sum64);
+}
+#endif
+
+static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a,
+ const uint8_t *b) {
+ const __m128i v_a0 = xx_loadu_128(a);
+ const __m128i v_b0 = xx_loadu_128(b);
+ const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0);
+ const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8));
+ const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0);
+ const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8));
+ const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w);
+ const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w);
+ *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w));
+ *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w));
+}
+
+static INLINE void sse4x2_sse4_1(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, __m128i *sum) {
+ const __m128i v_a0 = xx_loadl_32(a);
+ const __m128i v_a1 = xx_loadl_32(a + a_stride);
+ const __m128i v_b0 = xx_loadl_32(b);
+ const __m128i v_b1 = xx_loadl_32(b + b_stride);
+ const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1));
+ const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1));
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
+}
+
+static INLINE void sse8_sse4_1(const uint8_t *a, const uint8_t *b,
+ __m128i *sum) {
+ const __m128i v_a0 = xx_loadl_64(a);
+ const __m128i v_b0 = xx_loadl_64(b);
+ const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0);
+ const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0);
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
+}
+
+int64_t aom_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
+ int y = 0;
+ int64_t sse = 0;
+ __m128i sum = _mm_setzero_si128();
+ switch (width) {
+ case 4:
+ do {
+ sse4x2_sse4_1(a, a_stride, b, b_stride, &sum);
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 8:
+ do {
+ sse8_sse4_1(a, b, &sum);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 16:
+ do {
+ sse_w16_sse4_1(&sum, a, b);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 32:
+ do {
+ sse_w16_sse4_1(&sum, a, b);
+ sse_w16_sse4_1(&sum, a + 16, b + 16);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 64:
+ do {
+ sse_w16_sse4_1(&sum, a, b);
+ sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1);
+ sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2);
+ sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 128:
+ do {
+ sse_w16_sse4_1(&sum, a, b);
+ sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1);
+ sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2);
+ sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3);
+ sse_w16_sse4_1(&sum, a + 16 * 4, b + 16 * 4);
+ sse_w16_sse4_1(&sum, a + 16 * 5, b + 16 * 5);
+ sse_w16_sse4_1(&sum, a + 16 * 6, b + 16 * 6);
+ sse_w16_sse4_1(&sum, a + 16 * 7, b + 16 * 7);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ default:
+ if (width & 0x07) {
+ do {
+ int i = 0;
+ do {
+ sse8_sse4_1(a + i, b + i, &sum);
+ sse8_sse4_1(a + i + a_stride, b + i + b_stride, &sum);
+ i += 8;
+ } while (i + 4 < width);
+ sse4x2_sse4_1(a + i, a_stride, b + i, b_stride, &sum);
+ a += (a_stride << 1);
+ b += (b_stride << 1);
+ y += 2;
+ } while (y < height);
+ } else {
+ do {
+ int i = 0;
+ do {
+ sse8_sse4_1(a + i, b + i, &sum);
+ i += 8;
+ } while (i < width);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ }
+ sse = summary_all_sse4(&sum);
+ break;
+ }
+
+ return sse;
+}
+
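In the `default:` branch of `aom_sse_sse4_1()` above, a width that is a multiple of 4 but not of 8 is walked two rows at a time: an 8-pixel body handled by `sse8_sse4_1()` followed by a final 4-wide column handled by `sse4x2_sse4_1()`. A scalar sketch of that traversal, illustrative only and not upstream code:

```c
#include <stdint.h>

// Odd-width traversal: per row pair, an 8-wide body plus a 4-wide tail.
// Assumes height is even and width is a multiple of 4 but not of 8.
static int64_t sse_odd_width_sketch(const uint8_t *a, int a_stride,
                                    const uint8_t *b, int b_stride,
                                    int width, int height) {
  int64_t sse = 0;
  for (int y = 0; y < height; y += 2) {
    for (int r = 0; r < 2; ++r) {
      int x = 0;
      for (; x + 4 < width; ++x) {  // body (done 8 pixels at a time in the kernel)
        const int d = a[(y + r) * a_stride + x] - b[(y + r) * b_stride + x];
        sse += d * d;
      }
      for (; x < width; ++x) {  // the last 4 columns (sse4x2_sse4_1)
        const int d = a[(y + r) * a_stride + x] - b[(y + r) * b_stride + x];
        sse += d * d;
      }
    }
  }
  return sse;
}
```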
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void highbd_sse_w4x2_sse4_1(__m128i *sum, const uint16_t *a,
+ int a_stride, const uint16_t *b,
+ int b_stride) {
+ const __m128i v_a0 = xx_loadl_64(a);
+ const __m128i v_a1 = xx_loadl_64(a + a_stride);
+ const __m128i v_b0 = xx_loadl_64(b);
+ const __m128i v_b1 = xx_loadl_64(b + b_stride);
+ const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1);
+ const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1);
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
+}
+
+static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a,
+ const uint16_t *b) {
+ const __m128i v_a_w = xx_loadu_128(a);
+ const __m128i v_b_w = xx_loadu_128(b);
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
+}
+
+int64_t aom_highbd_sse_sse4_1(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int width,
+ int height) {
+ int32_t y = 0;
+ int64_t sse = 0;
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ __m128i sum = _mm_setzero_si128();
+ switch (width) {
+ case 4:
+ do {
+ highbd_sse_w4x2_sse4_1(&sum, a, a_stride, b, b_stride);
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 8:
+ do {
+ highbd_sse_w8_sse4_1(&sum, a, b);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 16:
+ do {
+ int l = 0;
+ __m128i sum32 = _mm_setzero_si128();
+ do {
+ highbd_sse_w8_sse4_1(&sum32, a, b);
+ highbd_sse_w8_sse4_1(&sum32, a + 8, b + 8);
+ a += a_stride;
+ b += b_stride;
+ l += 1;
+ } while (l < 64 && l < (height - y));
+ summary_32_sse4(&sum32, &sum);
+ y += 64;
+ } while (y < height);
+ xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
+ break;
+ case 32:
+ do {
+ int l = 0;
+ __m128i sum32 = _mm_setzero_si128();
+ do {
+ highbd_sse_w8_sse4_1(&sum32, a, b);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3);
+ a += a_stride;
+ b += b_stride;
+ l += 1;
+ } while (l < 32 && l < (height - y));
+ summary_32_sse4(&sum32, &sum);
+ y += 32;
+ } while (y < height);
+ xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
+ break;
+ case 64:
+ do {
+ int l = 0;
+ __m128i sum32 = _mm_setzero_si128();
+ do {
+ highbd_sse_w8_sse4_1(&sum32, a, b);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7);
+ a += a_stride;
+ b += b_stride;
+ l += 1;
+ } while (l < 16 && l < (height - y));
+ summary_32_sse4(&sum32, &sum);
+ y += 16;
+ } while (y < height);
+ xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
+ break;
+ case 128:
+ do {
+ int l = 0;
+ __m128i sum32 = _mm_setzero_si128();
+ do {
+ highbd_sse_w8_sse4_1(&sum32, a, b);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 8, b + 8 * 8);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 9, b + 8 * 9);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 10, b + 8 * 10);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 11, b + 8 * 11);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 12, b + 8 * 12);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 13, b + 8 * 13);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 14, b + 8 * 14);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 15, b + 8 * 15);
+ a += a_stride;
+ b += b_stride;
+ l += 1;
+ } while (l < 8 && l < (height - y));
+ summary_32_sse4(&sum32, &sum);
+ y += 8;
+ } while (y < height);
+ xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
+ break;
+ default:
+ if (width & 0x7) {
+ do {
+ __m128i sum32 = _mm_setzero_si128();
+ int i = 0;
+ do {
+ highbd_sse_w8_sse4_1(&sum32, a + i, b + i);
+ highbd_sse_w8_sse4_1(&sum32, a + i + a_stride, b + i + b_stride);
+ i += 8;
+ } while (i + 4 < width);
+ highbd_sse_w4x2_sse4_1(&sum32, a + i, a_stride, b + i, b_stride);
+ a += (a_stride << 1);
+ b += (b_stride << 1);
+ y += 2;
+ summary_32_sse4(&sum32, &sum);
+ } while (y < height);
+ } else {
+ do {
+ int l = 0;
+ __m128i sum32 = _mm_setzero_si128();
+ do {
+ int i = 0;
+ do {
+ highbd_sse_w8_sse4_1(&sum32, a + i, b + i);
+ i += 8;
+ } while (i < width);
+ a += a_stride;
+ b += b_stride;
+ l += 1;
+ } while (l < 8 && l < (height - y));
+ summary_32_sse4(&sum32, &sum);
+ y += 8;
+ } while (y < height);
+ }
+ xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
+ break;
+ }
+ return sse;
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/ssim_sse2_x86_64.asm b/third_party/aom/aom_dsp/x86/ssim_sse2_x86_64.asm
new file mode 100644
index 0000000000..49bc655336
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/ssim_sse2_x86_64.asm
@@ -0,0 +1,222 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+; tabulate_ssim - sums sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr
+%macro TABULATE_SSIM 0
+ paddusw xmm15, xmm3 ; sum_s
+ paddusw xmm14, xmm4 ; sum_r
+ movdqa xmm1, xmm3
+ pmaddwd xmm1, xmm1
+ paddd xmm13, xmm1 ; sum_sq_s
+ movdqa xmm2, xmm4
+ pmaddwd xmm2, xmm2
+ paddd xmm12, xmm2 ; sum_sq_r
+ pmaddwd xmm3, xmm4
+ paddd xmm11, xmm3 ; sum_sxr
+%endmacro
+
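In C terms, the two aom_ssim_parms_*_sse2 routines below accumulate five sums over an 8x8 or 16x16 block. A scalar sketch of what TABULATE_SSIM builds up, illustrative only and not upstream code:

```c
#include <stdint.h>

// Five block sums used by the SSIM/DSSIM computation: sums of source and
// reference pixels, their squares, and the cross term.
static void ssim_parms_sketch(const uint8_t *s, int sp, const uint8_t *r, int rp,
                              int size,  // 8 or 16
                              uint32_t *sum_s, uint32_t *sum_r,
                              uint32_t *sum_sq_s, uint32_t *sum_sq_r,
                              uint32_t *sum_sxr) {
  *sum_s = *sum_r = *sum_sq_s = *sum_sq_r = *sum_sxr = 0;
  for (int y = 0; y < size; ++y) {
    for (int x = 0; x < size; ++x) {
      const uint32_t sv = s[y * sp + x], rv = r[y * rp + x];
      *sum_s += sv;          // paddusw xmm15
      *sum_r += rv;          // paddusw xmm14
      *sum_sq_s += sv * sv;  // pmaddwd + paddd xmm13
      *sum_sq_r += rv * rv;  // pmaddwd + paddd xmm12
      *sum_sxr += sv * rv;   // pmaddwd + paddd xmm11
    }
  }
}
```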
+; Sum across the register %1 starting with q words
+%macro SUM_ACROSS_Q 1
+ movdqa xmm2,%1
+ punpckldq %1,xmm0
+ punpckhdq xmm2,xmm0
+ paddq %1,xmm2
+ movdqa xmm2,%1
+ punpcklqdq %1,xmm0
+ punpckhqdq xmm2,xmm0
+ paddq %1,xmm2
+%endmacro
+
+; Sum across the register %1 starting with words
+%macro SUM_ACROSS_W 1
+ movdqa xmm1, %1
+ punpcklwd %1,xmm0
+ punpckhwd xmm1,xmm0
+ paddd %1, xmm1
+ SUM_ACROSS_Q %1
+%endmacro
+
+SECTION .text
+
+;void aom_ssim_parms_16x16_sse2(
+; unsigned char *s,
+; int sp,
+; unsigned char *r,
+; int rp,
+; uint32_t *sum_s,
+; uint32_t *sum_r,
+; uint32_t *sum_sq_s,
+; uint32_t *sum_sq_r,
+; uint32_t *sum_sxr);
+;
+; TODO: Use parameter passing through a structure; we probably don't need the
+; pxors (the calling app will initialize to 0). Everything could easily fit in
+; SSE2 without too much hassle, and better estimates could probably be made
+; with psadbw or pavgb. At this point this is just meant to be a first pass for
+; calculating all the params needed for 16x16 SSIM so we can play with DSSIM as
+; the distortion metric in the mode selection code.
+globalsym(aom_ssim_parms_16x16_sse2)
+sym(aom_ssim_parms_16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rcx, arg(1) ;sp
+ mov rdi, arg(2) ;r
+ mov rax, arg(3) ;rp
+
+ pxor xmm0, xmm0
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 16 ;row counter
+.NextRow:
+
+ ;grab source and reference pixels
+ movdqu xmm5, [rsi]
+ movdqu xmm6, [rdi]
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpckhbw xmm3, xmm0 ; high_s
+ punpckhbw xmm4, xmm0 ; high_r
+
+ TABULATE_SSIM
+
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz .NextRow
+
+ SUM_ACROSS_W xmm15
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movd [rdi], xmm15;
+ mov rdi,arg(5)
+ movd [rdi], xmm14;
+ mov rdi,arg(6)
+ movd [rdi], xmm13;
+ mov rdi,arg(7)
+ movd [rdi], xmm12;
+ mov rdi,arg(8)
+ movd [rdi], xmm11;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_ssim_parms_8x8_sse2(
+; unsigned char *s,
+; int sp,
+; unsigned char *r,
+; int rp,
+; uint32_t *sum_s,
+; uint32_t *sum_r,
+; uint32_t *sum_sq_s,
+; uint32_t *sum_sq_r,
+; uint32_t *sum_sxr);
+;
+; TODO: Use parameter passing through a structure; we probably don't need the
+; pxors (the calling app will initialize to 0). Everything could easily fit in
+; SSE2 without too much hassle, and better estimates could probably be made
+; with psadbw or pavgb. At this point this is just meant to be a first pass for
+; calculating all the params needed for 8x8 SSIM so we can play with DSSIM as
+; the distortion metric in the mode selection code.
+globalsym(aom_ssim_parms_8x8_sse2)
+sym(aom_ssim_parms_8x8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rcx, arg(1) ;sp
+ mov rdi, arg(2) ;r
+ mov rax, arg(3) ;rp
+
+ pxor xmm0, xmm0
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 8 ;row counter
+.NextRow:
+
+ ;grab source and reference pixels
+ movq xmm3, [rsi]
+ movq xmm4, [rdi]
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz .NextRow
+
+ SUM_ACROSS_W xmm15
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movd [rdi], xmm15;
+ mov rdi,arg(5)
+ movd [rdi], xmm14;
+ mov rdi,arg(6)
+ movd [rdi], xmm13;
+ mov rdi,arg(7)
+ movd [rdi], xmm12;
+ mov rdi,arg(8)
+ movd [rdi], xmm11;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm b/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm
new file mode 100644
index 0000000000..d1d8373456
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm
@@ -0,0 +1,1470 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_8: times 8 dw 8
+bilin_filter_m_sse2: times 8 dw 16
+ times 8 dw 0
+ times 8 dw 14
+ times 8 dw 2
+ times 8 dw 12
+ times 8 dw 4
+ times 8 dw 10
+ times 8 dw 6
+ times 16 dw 8
+ times 8 dw 6
+ times 8 dw 10
+ times 8 dw 4
+ times 8 dw 12
+ times 8 dw 2
+ times 8 dw 14
+
+bilin_filter_m_ssse3: times 8 db 16, 0
+ times 8 db 14, 2
+ times 8 db 12, 4
+ times 8 db 10, 6
+ times 16 db 8
+ times 8 db 6, 10
+ times 8 db 4, 12
+ times 8 db 2, 14
+
+SECTION .text
+
+; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
+; int x_offset, int y_offset,
+; const uint8_t *dst, ptrdiff_t dst_stride,
+; int height, unsigned int *sse);
+;
+; This function returns the sum of pixel differences (SE) and stores the sum
+; of squared differences (SSE) in the given pointer.
+
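A scalar model of what the sub-pixel variance kernels in this file compute may help: the source block is bilinearly filtered at (x_offset, y_offset), each offset being 0..7 in 1/8-pel steps with taps (16 - 2*offset, 2*offset), a rounding constant of 8 and a shift of 4; the sum of differences against dst is returned and the sum of squared differences goes to *sse. This is an illustrative sketch only, not upstream code; it ignores the half-pel `pavgb` shortcuts (arithmetically equivalent) and always reads one row below the block even when y_offset is 0, which the real kernels avoid.

```c
#include <stdint.h>

static int sub_pixel_variance_sketch(const uint8_t *src, int src_stride,
                                     int x_offset, int y_offset,
                                     const uint8_t *dst, int dst_stride,
                                     int width, int height, unsigned int *sse) {
  const int fx0 = 16 - 2 * x_offset, fx1 = 2 * x_offset;  // horizontal taps
  const int fy0 = 16 - 2 * y_offset, fy1 = 2 * y_offset;  // vertical taps
  int sum = 0;
  unsigned int sq = 0;
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      const uint8_t *p = src + y * src_stride + x;
      // Horizontal pass on the two rows straddling the sample, then a
      // vertical pass; each pass rounds with +8 and shifts right by 4.
      const int h0 = (fx0 * p[0] + fx1 * p[1] + 8) >> 4;
      const int h1 = (fx0 * p[src_stride] + fx1 * p[src_stride + 1] + 8) >> 4;
      const int pred = (fy0 * h0 + fy1 * h1 + 8) >> 4;
      const int diff = pred - dst[y * dst_stride + x];
      sum += diff;
      sq += (unsigned int)(diff * diff);
    }
  }
  *sse = sq;
  return sum;  // the wrapper then derives the variance from sse and sum
}
```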
+%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
+ psubw %3, %4
+ psubw %1, %2
+ paddw %5, %3
+ pmaddwd %3, %3
+ paddw %5, %1
+ pmaddwd %1, %1
+ paddd %6, %3
+ paddd %6, %1
+%endmacro
+
+%macro STORE_AND_RET 1
+%if %1 > 4
+ ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
+ ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
+ ; We have to sign-extend it before adding the words within the register
+  ; and outputting to a dword.
+ pcmpgtw m5, m6 ; mask for 0 > x
+ movhlps m3, m7
+ punpcklwd m4, m6, m5
+ punpckhwd m6, m5 ; sign-extend m6 word->dword
+ paddd m7, m3
+ paddd m6, m4
+ pshufd m3, m7, 0x1
+ movhlps m4, m6
+ paddd m7, m3
+ paddd m6, m4
+ mov r1, ssem ; r1 = unsigned int *sse
+ pshufd m4, m6, 0x1
+ movd [r1], m7 ; store sse
+ paddd m6, m4
+ movd raxd, m6 ; store sum as return value
+%else ; 4xh
+ pshuflw m4, m6, 0xe
+ pshuflw m3, m7, 0xe
+ paddw m6, m4
+ paddd m7, m3
+ pcmpgtw m5, m6 ; mask for 0 > x
+ mov r1, ssem ; r1 = unsigned int *sse
+ punpcklwd m6, m5 ; sign-extend m6 word->dword
+ movd [r1], m7 ; store sse
+ pshuflw m4, m6, 0xe
+ paddd m6, m4
+ movd raxd, m6 ; store sum as return value
+%endif
+ RET
+%endmacro
+
+%macro INC_SRC_BY_SRC_STRIDE 0
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+ add srcq, src_stridemp
+%else
+ add srcq, src_strideq
+%endif
+%endmacro
+
+%macro SUBPEL_VARIANCE 1-2 0 ; W
+%if cpuflag(ssse3)
+%define bilin_filter_m bilin_filter_m_ssse3
+%define filter_idx_shift 4
+%else
+%define bilin_filter_m bilin_filter_m_sse2
+%define filter_idx_shift 5
+%endif
+; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
+; 11, not 13, if the registers are ordered correctly. May make a minor speed
+; difference on Win64
+
+%if AOM_ARCH_X86_64
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+ x_offset, y_offset, dst, dst_stride, \
+ sec, sec_stride, height, sse
+ %define sec_str sec_strideq
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
+ x_offset, y_offset, dst, dst_stride, \
+ height, sse
+ %endif
+ %define block_height heightd
+ %define bilin_filter sseq
+%else
+ %if CONFIG_PIC=1
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, dst, dst_stride, \
+ sec, sec_stride, height, sse
+ %define block_height dword heightm
+ %define sec_str sec_stridemp
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, dst, dst_stride, \
+ height, sse
+ %define block_height heightd
+ %endif
+
+ ; reuse argument stack space
+ %define g_bilin_filterm x_offsetm
+ %define g_pw_8m y_offsetm
+
+ ;Store bilin_filter and pw_8 location in stack
+ %if GET_GOT_DEFINED == 1
+ GET_GOT eax
+ add esp, 4 ; restore esp
+ %endif
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %else
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, sec, sec_stride, \
+ height, sse
+ %define block_height dword heightm
+ %define sec_str sec_stridemp
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, dst, dst_stride, \
+ height, sse
+ %define block_height heightd
+ %endif
+ %define bilin_filter bilin_filter_m
+ %endif
+%endif
+
+%if %1 == 4
+ %define movx movd
+%else
+ %define movx movh
+%endif
+
+ ASSERT %1 <= 16 ; m6 overflows if w > 16
+ pxor m6, m6 ; sum
+ pxor m7, m7 ; sse
+ ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
+  ; could perhaps use it for something more productive in that case
+ pxor m5, m5 ; dedicated zero register
+%if %1 < 16
+ sar block_height, 1
+%if %2 == 1 ; avg
+ shl sec_str, 1
+%endif
+%endif
+
+ ; FIXME(rbultje) replace by jumptable?
+ test x_offsetd, x_offsetd
+ jnz .x_nonzero
+ ; x_offset == 0
+ test y_offsetd, y_offsetd
+ jnz .x_zero_y_nonzero
+
+ ; x_offset == 0 && y_offset == 0
+.x_zero_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ mova m1, [dstq]
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%endif
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+
+%if %2 == 0 ; !avg
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+%if %2 == 1 ; avg
+%if %1 > 4
+ movhps m0, [srcq+src_strideq]
+%else ; 4xh
+ movx m1, [srcq+src_strideq]
+ punpckldq m0, m1
+%endif
+%else ; !avg
+ movx m2, [srcq+src_strideq]
+%endif
+
+ movx m1, [dstq]
+ movx m3, [dstq+dst_strideq]
+
+%if %2 == 1 ; avg
+%if %1 > 4
+ pavgb m0, [secq]
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+%endif
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%if %1 > 4
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; 4xh
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%else ; !avg
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_zero_y_zero_loop
+ STORE_AND_RET %1
+
+.x_zero_y_nonzero:
+ cmp y_offsetd, 4
+ jne .x_zero_y_nonhalf
+
+ ; x_offset == 0 && y_offset == 0.5
+.x_zero_y_half_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+src_strideq]
+ mova m1, [dstq]
+ pavgb m0, m4
+ punpckhbw m3, m1, m5
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+%endif
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m2, [srcq+src_strideq]
+%if %2 == 1 ; avg
+%if %1 > 4
+ movhps m2, [srcq+src_strideq*2]
+%else ; 4xh
+ movx m1, [srcq+src_strideq*2]
+ punpckldq m2, m1
+%endif
+ movx m1, [dstq]
+%if %1 > 4
+ movlhps m0, m2
+%else ; 4xh
+ punpckldq m0, m2
+%endif
+ movx m3, [dstq+dst_strideq]
+ pavgb m0, m2
+ punpcklbw m1, m5
+%if %1 > 4
+ pavgb m0, [secq]
+ punpcklbw m3, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; 4xh
+ movh m4, [secq]
+ pavgb m0, m4
+ punpcklbw m3, m5
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%else ; !avg
+ movx m4, [srcq+src_strideq*2]
+ movx m1, [dstq]
+ pavgb m0, m2
+ movx m3, [dstq+dst_strideq]
+ pavgb m2, m4
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_zero_y_half_loop
+ STORE_AND_RET %1
+
+.x_zero_y_nonhalf:
+ ; x_offset == 0 && y_offset == bilin interpolation
+%if AOM_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if AOM_ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+y_offsetq+16]
+%endif
+ mova m10, [GLOBAL(pw_8)]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32 or mmx
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+.x_zero_y_other_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+src_strideq]
+ mova m1, [dstq]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ punpcklbw m0, m5
+ punpcklbw m4, m5
+ ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
+ ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
+ ; instructions is the same (5), but it is 1 mul instead of 2, so might be
+ ; slightly faster because of pmullw latency. It would also cut our rodata
+ ; tables in half for this function, and save 1-2 registers on x86-64.
+ pmullw m2, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m2, m3
+ paddw m0, m4
+%endif
+ psraw m2, 4
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m2, [srcq+src_strideq]
+ movx m4, [srcq+src_strideq*2]
+ movx m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+ movx m1, [dstq]
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_y_a
+ pmullw m1, m2, filter_y_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, m1
+ paddw m2, filter_rnd
+ movx m1, [dstq]
+ paddw m2, m4
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; 4xh
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_zero_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET %1
+
+.x_nonzero:
+ cmp x_offsetd, 4
+ jne .x_nonhalf
+ ; x_offset == 0.5
+ test y_offsetd, y_offsetd
+ jnz .x_half_y_nonzero
+
+ ; x_offset == 0.5 && y_offset == 0
+.x_half_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+1]
+ mova m1, [dstq]
+ pavgb m0, m4
+ punpckhbw m3, m1, m5
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+%endif
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m4, [srcq+1]
+%if %2 == 1 ; avg
+%if %1 > 4
+ movhps m0, [srcq+src_strideq]
+ movhps m4, [srcq+src_strideq+1]
+%else ; 4xh
+ movx m1, [srcq+src_strideq]
+ punpckldq m0, m1
+ movx m2, [srcq+src_strideq+1]
+ punpckldq m4, m2
+%endif
+ movx m1, [dstq]
+ movx m3, [dstq+dst_strideq]
+ pavgb m0, m4
+ punpcklbw m3, m5
+%if %1 > 4
+ pavgb m0, [secq]
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; 4xh
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m1, m5
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%else ; !avg
+ movx m2, [srcq+src_strideq]
+ movx m1, [dstq]
+ pavgb m0, m4
+ movx m4, [srcq+src_strideq+1]
+ movx m3, [dstq+dst_strideq]
+ pavgb m2, m4
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_half_y_zero_loop
+ STORE_AND_RET %1
+
+.x_half_y_nonzero:
+ cmp y_offsetd, 4
+ jne .x_half_y_nonhalf
+
+ ; x_offset == 0.5 && y_offset == 0.5
+%if %1 == 16
+ movu m0, [srcq]
+ movu m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+.x_half_y_half_loop:
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+ mova m1, [dstq]
+ pavgb m4, m3
+ punpckhbw m3, m1, m5
+ pavgb m0, m4
+%if %2 == 1 ; avg
+ punpcklbw m1, m5
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+.x_half_y_half_loop:
+ movx m2, [srcq]
+ movx m3, [srcq+1]
+%if %2 == 1 ; avg
+%if %1 > 4
+ movhps m2, [srcq+src_strideq]
+ movhps m3, [srcq+src_strideq+1]
+%else
+ movx m1, [srcq+src_strideq]
+ punpckldq m2, m1
+ movx m1, [srcq+src_strideq+1]
+ punpckldq m3, m1
+%endif
+ pavgb m2, m3
+%if %1 > 4
+ movlhps m0, m2
+ movhlps m4, m2
+%else ; 4xh
+ punpckldq m0, m2
+ pshuflw m4, m2, 0xe
+%endif
+ movx m1, [dstq]
+ pavgb m0, m2
+ movx m3, [dstq+dst_strideq]
+%if %1 > 4
+ pavgb m0, [secq]
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+%endif
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%if %1 > 4
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%else ; !avg
+ movx m4, [srcq+src_strideq]
+ movx m1, [srcq+src_strideq+1]
+ pavgb m2, m3
+ pavgb m4, m1
+ pavgb m0, m2
+ pavgb m2, m4
+ movx m1, [dstq]
+ movx m3, [dstq+dst_strideq]
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_half_y_half_loop
+ STORE_AND_RET %1
+
+.x_half_y_nonhalf:
+ ; x_offset == 0.5 && y_offset == bilin interpolation
+%if AOM_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if AOM_ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+y_offsetq+16]
+%endif
+ mova m10, [GLOBAL(pw_8)]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ;x86_32
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+.x_half_y_other_loop:
+ movu m4, [srcq]
+ movu m2, [srcq+1]
+ mova m1, [dstq]
+ pavgb m4, m2
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+ psraw m2, 4
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ pmullw m2, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, filter_rnd
+ punpcklbw m0, m5
+ paddw m2, m3
+ punpcklbw m3, m4, m5
+ pmullw m0, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, filter_rnd
+ psraw m2, 4
+ paddw m0, m3
+%endif
+ punpckhbw m3, m1, m5
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+%if notcpuflag(ssse3)
+ punpcklbw m0, m5
+%endif
+.x_half_y_other_loop:
+ movx m2, [srcq]
+ movx m1, [srcq+1]
+ movx m4, [srcq+src_strideq]
+ movx m3, [srcq+src_strideq+1]
+ pavgb m2, m1
+ pavgb m4, m3
+ movx m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+ movx m1, [dstq]
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+%else
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_y_a
+ pmullw m1, m2, filter_y_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ paddw m0, m1
+ pmullw m1, m4, filter_y_b
+ paddw m2, filter_rnd
+ paddw m2, m1
+ movx m1, [dstq]
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_half_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET %1
+
+.x_nonhalf:
+ test y_offsetd, y_offsetd
+ jnz .x_nonhalf_y_nonzero
+
+ ; x_offset == bilin interpolation && y_offset == 0
+%if AOM_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if AOM_ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0. We can reuse the y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+.x_other_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+1]
+ mova m1, [dstq]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ punpcklbw m0, m5
+ punpcklbw m4, m5
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m0, filter_rnd
+ paddw m2, m3
+ paddw m0, m4
+%endif
+ psraw m2, 4
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m1, [srcq+1]
+ movx m2, [srcq+src_strideq]
+ movx m4, [srcq+src_strideq+1]
+ movx m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ movx m1, [dstq]
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_x_a
+ pmaddubsw m2, filter_x_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m0, m1
+ paddw m2, filter_rnd
+ movx m1, [dstq]
+ paddw m2, m4
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_other_y_zero_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET %1
+
+.x_nonhalf_y_nonzero:
+ cmp y_offsetd, 4
+ jne .x_nonhalf_y_nonhalf
+
+ ; x_offset == bilin interpolation && y_offset == 0.5
+%if AOM_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if AOM_ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m1, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ paddw m0, m1
+ paddw m2, m3
+%endif
+ psraw m0, 4
+ psraw m2, 4
+ add srcq, src_strideq
+ packuswb m0, m2
+.x_other_y_half_loop:
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+%if cpuflag(ssse3)
+ mova m1, [dstq]
+ punpckhbw m2, m4, m3
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m4, m2
+ pavgb m0, m4
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%else
+ punpckhbw m2, m4, m5
+ punpckhbw m1, m3, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ paddw m4, m3
+ paddw m2, m1
+ mova m1, [dstq]
+ psraw m4, 4
+ psraw m2, 4
+ punpckhbw m3, m1, m5
+  ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
+  ; are one register short of being able to store the backup of the bilin
+  ; filtered second line as words, as a cache for the next line. Packing into
+  ; a byte costs 1 pack and 2 unpacks, but saves a register.
+ packuswb m4, m2
+ punpcklbw m1, m5
+ pavgb m0, m4
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ pavgb m0, [secq]
+%endif
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ pmaddubsw m0, filter_x_a
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m1
+%endif
+ add srcq, src_strideq
+ psraw m0, 4
+.x_other_y_half_loop:
+ movx m2, [srcq]
+ movx m1, [srcq+1]
+ movx m4, [srcq+src_strideq]
+ movx m3, [srcq+src_strideq+1]
+%if cpuflag(ssse3)
+ punpcklbw m2, m1
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ movx m1, [dstq]
+ movx m3, [dstq+dst_strideq]
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+%else
+ punpcklbw m2, m5
+ punpcklbw m1, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ paddw m2, m1
+ movx m1, [dstq]
+ paddw m4, m3
+ movx m3, [dstq+dst_strideq]
+%endif
+ psraw m2, 4
+ psraw m4, 4
+ pavgw m0, m2
+ pavgw m2, m4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline - also consider going to bytes here
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_other_y_half_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET %1
+
+.x_nonhalf_y_nonhalf:
+%if AOM_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift
+ shl y_offsetd, filter_idx_shift
+%if AOM_ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m11, [bilin_filter+y_offsetq+16]
+%endif
+ mova m12, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_y_a m10
+%define filter_y_b m11
+%define filter_rnd m12
+%else ; x86-32
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+; In this case, there is NO unused register. We use the src_stride register;
+; later, src_stride has to be reloaded from the stack when it is needed.
+%define tempq src_strideq
+ mov tempq, g_bilin_filterm
+ add x_offsetq, tempq
+ add y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+ add y_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+ ; x_offset == bilin interpolation && y_offset == bilin interpolation
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m1, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ paddw m0, m1
+ paddw m2, m3
+%endif
+ psraw m0, 4
+ psraw m2, 4
+
+ INC_SRC_BY_SRC_STRIDE
+
+ packuswb m0, m2
+.x_other_y_other_loop:
+%if cpuflag(ssse3)
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+ mova m1, [dstq]
+ punpckhbw m2, m4, m3
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ punpckhbw m3, m1, m5
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m4, m2
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ punpcklbw m1, m5
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+ psraw m2, 4
+ psraw m0, 4
+%else
+ movu m3, [srcq]
+ movu m4, [srcq+1]
+ punpckhbw m1, m3, m5
+ punpckhbw m2, m4, m5
+ punpcklbw m3, m5
+ punpcklbw m4, m5
+ pmullw m3, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m3, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m1, filter_rnd
+ paddw m3, m4
+ paddw m1, m2
+ psraw m3, 4
+ psraw m1, 4
+ packuswb m4, m3, m1
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ pmullw m2, filter_y_a
+ pmullw m1, filter_y_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, m1
+ mova m1, [dstq]
+ paddw m0, filter_rnd
+ psraw m2, 4
+ paddw m0, m3
+ punpckhbw m3, m1, m5
+ psraw m0, 4
+ punpcklbw m1, m5
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ INC_SRC_BY_SRC_STRIDE
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ pmaddubsw m0, filter_x_a
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m1
+%endif
+ psraw m0, 4
+%if cpuflag(ssse3)
+ packuswb m0, m0
+%endif
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movx m2, [srcq]
+ movx m1, [srcq+1]
+
+ INC_SRC_BY_SRC_STRIDE
+ movx m4, [srcq]
+ movx m3, [srcq+1]
+
+%if cpuflag(ssse3)
+ punpcklbw m2, m1
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ movx m3, [dstq+dst_strideq]
+ movx m1, [dstq]
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m2, m2
+ packuswb m4, m4
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+ psraw m0, 4
+ psraw m2, 4
+ punpcklbw m1, m5
+%else
+ punpcklbw m2, m5
+ punpcklbw m1, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ paddw m2, m1
+ paddw m4, m3
+ psraw m2, 4
+ psraw m4, 4
+ pmullw m0, filter_y_a
+ pmullw m3, m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ pmullw m1, m4, filter_y_b
+ paddw m2, filter_rnd
+ paddw m0, m3
+ movx m3, [dstq+dst_strideq]
+ paddw m2, m1
+ movx m1, [dstq]
+ psraw m0, 4
+ psraw m2, 4
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ INC_SRC_BY_SRC_STRIDE
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_other_y_other_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+%undef movx
+ STORE_AND_RET %1
+%endmacro
+
+; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8 && y=0,8) are identical
+; between the ssse3 and non-ssse3 versions. It may make sense to merge their
+; code, in the sense that the ssse3 version would jump to the appropriate
+; location in the sse2 version rather than duplicating that code in the
+; binary.
+
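+; Instantiate the plain and second-predictor-averaging (avg, %2 == 1) variants
+; for 4-, 8- and 16-wide blocks, for both sse2 and ssse3.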
+INIT_XMM sse2
+SUBPEL_VARIANCE 4
+SUBPEL_VARIANCE 8
+SUBPEL_VARIANCE 16
+
+INIT_XMM ssse3
+SUBPEL_VARIANCE 4
+SUBPEL_VARIANCE 8
+SUBPEL_VARIANCE 16
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 4, 1
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
+
+INIT_XMM ssse3
+SUBPEL_VARIANCE 4, 1
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
diff --git a/third_party/aom/aom_dsp/x86/subtract_avx2.c b/third_party/aom/aom_dsp/x86/subtract_avx2.c
new file mode 100644
index 0000000000..b4c5cc7c7b
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/subtract_avx2.c
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void subtract32_avx2(int16_t *diff_ptr, const uint8_t *src_ptr,
+ const uint8_t *pred_ptr) {
+ __m256i s = _mm256_lddqu_si256((__m256i *)(src_ptr));
+ __m256i p = _mm256_lddqu_si256((__m256i *)(pred_ptr));
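+  // 0xff01ff01 is the signed byte pattern {+1, -1, +1, -1, ...}, so the
+  // maddubs below computes src - pred for each interleaved byte pair.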
+ __m256i set_one_minusone = _mm256_set1_epi32((int)0xff01ff01);
+ __m256i diff0 = _mm256_unpacklo_epi8(s, p);
+ __m256i diff1 = _mm256_unpackhi_epi8(s, p);
+ diff0 = _mm256_maddubs_epi16(diff0, set_one_minusone);
+ diff1 = _mm256_maddubs_epi16(diff1, set_one_minusone);
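+  // unpacklo/unpackhi interleave within 128-bit lanes, so recombine the two
+  // halves with permute2x128 to store the differences in source order.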
+ _mm256_store_si256((__m256i *)(diff_ptr),
+ _mm256_permute2x128_si256(diff0, diff1, 0x20));
+ _mm256_store_si256((__m256i *)(diff_ptr + 16),
+ _mm256_permute2x128_si256(diff0, diff1, 0x31));
+}
+
+static INLINE void subtract_block_16xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ for (int32_t j = 0; j < rows; ++j) {
+ __m128i s = _mm_lddqu_si128((__m128i *)(src_ptr));
+ __m128i p = _mm_lddqu_si128((__m128i *)(pred_ptr));
+ __m256i s_0 = _mm256_cvtepu8_epi16(s);
+ __m256i p_0 = _mm256_cvtepu8_epi16(p);
+ const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
+ _mm256_store_si256((__m256i *)(diff_ptr), d_0);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+static INLINE void subtract_block_32xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ for (int32_t j = 0; j < rows; ++j) {
+ subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+static INLINE void subtract_block_64xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ for (int32_t j = 0; j < rows; ++j) {
+ subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+ subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+static INLINE void subtract_block_128xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ for (int32_t j = 0; j < rows; ++j) {
+ subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+ subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
+ subtract32_avx2(diff_ptr + 64, src_ptr + 64, pred_ptr + 64);
+ subtract32_avx2(diff_ptr + 96, src_ptr + 96, pred_ptr + 96);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+void aom_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr,
+ ptrdiff_t pred_stride) {
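+  // Dispatch on block width; widths without an AVX2 path fall back to the
+  // SSE2 implementation.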
+ switch (cols) {
+ case 16:
+ subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+ pred_ptr, pred_stride);
+ break;
+ case 32:
+ subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+ pred_ptr, pred_stride);
+ break;
+ case 64:
+ subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+ pred_ptr, pred_stride);
+ break;
+ case 128:
+ subtract_block_128xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
+ src_stride, pred_ptr, pred_stride);
+ break;
+ default:
+ aom_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr,
+ src_stride, pred_ptr, pred_stride);
+ break;
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/subtract_sse2.asm b/third_party/aom/aom_dsp/x86/subtract_sse2.asm
new file mode 100644
index 0000000000..fd508c0916
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/subtract_sse2.asm
@@ -0,0 +1,147 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; void aom_subtract_block(int rows, int cols,
+; int16_t *diff, ptrdiff_t diff_stride,
+; const uint8_t *src, ptrdiff_t src_stride,
+; const uint8_t *pred, ptrdiff_t pred_stride)
+
+INIT_XMM sse2
+cglobal subtract_block, 7, 7, 8, \
+ rows, cols, diff, diff_stride, src, src_stride, \
+ pred, pred_stride
+%define pred_str colsq
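+; The cols register is reused to hold pred_stride once the width dispatch
+; below is done; it is reloaded from its stack slot in each case.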
+ pxor m7, m7 ; dedicated zero register
+ cmp colsd, 4
+ je .case_4
+ cmp colsd, 8
+ je .case_8
+ cmp colsd, 16
+ je .case_16
+ cmp colsd, 32
+ je .case_32
+ cmp colsd, 64
+ je .case_64
+
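+; loop16 subtracts two 16-pixel groups per invocation: %1/%2 are byte offsets
+; into src, %3/%4 into pred, and %5/%6 into diff (which is written as 16-bit
+; values, hence two mmsize stores per group).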
+%macro loop16 6
+ mova m0, [srcq+%1]
+ mova m4, [srcq+%2]
+ movu m1, [predq+%3]
+ movu m5, [predq+%4]
+ punpckhbw m2, m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ psubw m2, m3
+ psubw m0, m1
+ punpckhbw m1, m4, m7
+ punpckhbw m3, m5, m7
+ punpcklbw m4, m7
+ punpcklbw m5, m7
+ psubw m1, m3
+ psubw m4, m5
+ mova [diffq+mmsize*0+%5], m0
+ mova [diffq+mmsize*1+%5], m2
+ mova [diffq+mmsize*0+%6], m4
+ mova [diffq+mmsize*1+%6], m1
+%endmacro
+
+ mov pred_str, pred_stridemp
+.loop_128:
+ loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
+ loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
+ loop16 4*mmsize, 5*mmsize, 4*mmsize, 5*mmsize, 8*mmsize, 10*mmsize
+ loop16 6*mmsize, 7*mmsize, 6*mmsize, 7*mmsize, 12*mmsize, 14*mmsize
+ lea diffq, [diffq+diff_strideq*2]
+ add predq, pred_str
+ add srcq, src_strideq
+ sub rowsd, 1
+ jnz .loop_128
+ RET
+
+.case_64:
+ mov pred_str, pred_stridemp
+.loop_64:
+ loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
+ loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
+ lea diffq, [diffq+diff_strideq*2]
+ add predq, pred_str
+ add srcq, src_strideq
+ dec rowsd
+ jg .loop_64
+ RET
+
+.case_32:
+ mov pred_str, pred_stridemp
+.loop_32:
+ loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
+ lea diffq, [diffq+diff_strideq*2]
+ add predq, pred_str
+ add srcq, src_strideq
+ dec rowsd
+ jg .loop_32
+ RET
+
+.case_16:
+ mov pred_str, pred_stridemp
+.loop_16:
+ loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
+ lea diffq, [diffq+diff_strideq*4]
+ lea predq, [predq+pred_str*2]
+ lea srcq, [srcq+src_strideq*2]
+ sub rowsd, 2
+ jg .loop_16
+ RET
+
+%macro loop_h 0
+ movh m0, [srcq]
+ movh m2, [srcq+src_strideq]
+ movh m1, [predq]
+ movh m3, [predq+pred_str]
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ psubw m0, m1
+ psubw m2, m3
+ mova [diffq], m0
+ mova [diffq+diff_strideq*2], m2
+%endmacro
+
+.case_8:
+ mov pred_str, pred_stridemp
+.loop_8:
+ loop_h
+ lea diffq, [diffq+diff_strideq*4]
+ lea srcq, [srcq+src_strideq*2]
+ lea predq, [predq+pred_str*2]
+ sub rowsd, 2
+ jg .loop_8
+ RET
+
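+; The 4-wide case switches to MMX registers, so it has to end with emms to
+; restore the x87 state.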
+INIT_MMX
+.case_4:
+ mov pred_str, pred_stridemp
+.loop_4:
+ loop_h
+ lea diffq, [diffq+diff_strideq*4]
+ lea srcq, [srcq+src_strideq*2]
+ lea predq, [predq+pred_str*2]
+ sub rowsd, 2
+ jg .loop_4
+ emms
+ RET
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_avx2.c b/third_party/aom/aom_dsp/x86/sum_squares_avx2.c
new file mode 100644
index 0000000000..89b9b824bf
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sum_squares_avx2.c
@@ -0,0 +1,326 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <smmintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_dsp/x86/sum_squares_sse2.h"
+#include "config/aom_dsp_rtcd.h"
+
+static uint64_t aom_sum_squares_2d_i16_nxn_avx2(const int16_t *src, int stride,
+ int width, int height) {
+ uint64_t result;
+ __m256i v_acc_q = _mm256_setzero_si256();
+ const __m256i v_zext_mask_q = yy_set1_64_from_32i(~0);
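+  // Every four rows the 32-bit lane sums are split into low/high halves and
+  // widened into the 64-bit accumulator so that large blocks cannot overflow.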
+  for (int r = 0; r < height; r += 4) {
+ __m256i v_acc_d = _mm256_setzero_si256();
+    for (int c = 0; c < width; c += 16) {
+      const int16_t *tempsrc = src + c;
+ const __m256i v_val_0_w =
+ _mm256_loadu_si256((const __m256i *)(tempsrc + 0 * stride));
+ const __m256i v_val_1_w =
+ _mm256_loadu_si256((const __m256i *)(tempsrc + 1 * stride));
+ const __m256i v_val_2_w =
+ _mm256_loadu_si256((const __m256i *)(tempsrc + 2 * stride));
+ const __m256i v_val_3_w =
+ _mm256_loadu_si256((const __m256i *)(tempsrc + 3 * stride));
+
+ const __m256i v_sq_0_d = _mm256_madd_epi16(v_val_0_w, v_val_0_w);
+ const __m256i v_sq_1_d = _mm256_madd_epi16(v_val_1_w, v_val_1_w);
+ const __m256i v_sq_2_d = _mm256_madd_epi16(v_val_2_w, v_val_2_w);
+ const __m256i v_sq_3_d = _mm256_madd_epi16(v_val_3_w, v_val_3_w);
+
+ const __m256i v_sum_01_d = _mm256_add_epi32(v_sq_0_d, v_sq_1_d);
+ const __m256i v_sum_23_d = _mm256_add_epi32(v_sq_2_d, v_sq_3_d);
+ const __m256i v_sum_0123_d = _mm256_add_epi32(v_sum_01_d, v_sum_23_d);
+
+ v_acc_d = _mm256_add_epi32(v_acc_d, v_sum_0123_d);
+ }
+ v_acc_q =
+ _mm256_add_epi64(v_acc_q, _mm256_and_si256(v_acc_d, v_zext_mask_q));
+ v_acc_q = _mm256_add_epi64(v_acc_q, _mm256_srli_epi64(v_acc_d, 32));
+ src += 4 * stride;
+ }
+ __m128i lower_64_2_Value = _mm256_castsi256_si128(v_acc_q);
+ __m128i higher_64_2_Value = _mm256_extracti128_si256(v_acc_q, 1);
+ __m128i result_64_2_int = _mm_add_epi64(lower_64_2_Value, higher_64_2_Value);
+
+ result_64_2_int = _mm_add_epi64(
+ result_64_2_int, _mm_unpackhi_epi64(result_64_2_int, result_64_2_int));
+
+ xx_storel_64(&result, result_64_2_int);
+
+ return result;
+}
+
+uint64_t aom_sum_squares_2d_i16_avx2(const int16_t *src, int stride, int width,
+ int height) {
+ if (LIKELY(width == 4 && height == 4)) {
+ return aom_sum_squares_2d_i16_4x4_sse2(src, stride);
+ } else if (LIKELY(width == 4 && (height & 3) == 0)) {
+ return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height);
+ } else if (LIKELY(width == 8 && (height & 3) == 0)) {
+ return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height);
+ } else if (LIKELY(((width & 15) == 0) && ((height & 3) == 0))) {
+ return aom_sum_squares_2d_i16_nxn_avx2(src, stride, width, height);
+ } else {
+ return aom_sum_squares_2d_i16_c(src, stride, width, height);
+ }
+}
+
+static uint64_t aom_sum_sse_2d_i16_nxn_avx2(const int16_t *src, int stride,
+ int width, int height, int *sum) {
+ uint64_t result;
+ const __m256i zero_reg = _mm256_setzero_si256();
+ const __m256i one_reg = _mm256_set1_epi16(1);
+
+ __m256i v_sse_total = zero_reg;
+ __m256i v_sum_total = zero_reg;
+
+  for (int r = 0; r < height; r += 4) {
+ __m256i v_sse_row = zero_reg;
+    for (int c = 0; c < width; c += 16) {
+      const int16_t *tempsrc = src + c;
+ const __m256i v_val_0_w =
+ _mm256_loadu_si256((const __m256i *)(tempsrc + 0 * stride));
+ const __m256i v_val_1_w =
+ _mm256_loadu_si256((const __m256i *)(tempsrc + 1 * stride));
+ const __m256i v_val_2_w =
+ _mm256_loadu_si256((const __m256i *)(tempsrc + 2 * stride));
+ const __m256i v_val_3_w =
+ _mm256_loadu_si256((const __m256i *)(tempsrc + 3 * stride));
+
+ const __m256i v_sum_01 = _mm256_add_epi16(v_val_0_w, v_val_1_w);
+ const __m256i v_sum_23 = _mm256_add_epi16(v_val_2_w, v_val_3_w);
+ __m256i v_sum_0123 = _mm256_add_epi16(v_sum_01, v_sum_23);
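+      // madd against a vector of ones pair-sums the 16-bit sums into 32-bit
+      // lanes.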
+ v_sum_0123 = _mm256_madd_epi16(v_sum_0123, one_reg);
+ v_sum_total = _mm256_add_epi32(v_sum_total, v_sum_0123);
+
+ const __m256i v_sq_0_d = _mm256_madd_epi16(v_val_0_w, v_val_0_w);
+ const __m256i v_sq_1_d = _mm256_madd_epi16(v_val_1_w, v_val_1_w);
+ const __m256i v_sq_2_d = _mm256_madd_epi16(v_val_2_w, v_val_2_w);
+ const __m256i v_sq_3_d = _mm256_madd_epi16(v_val_3_w, v_val_3_w);
+ const __m256i v_sq_01_d = _mm256_add_epi32(v_sq_0_d, v_sq_1_d);
+ const __m256i v_sq_23_d = _mm256_add_epi32(v_sq_2_d, v_sq_3_d);
+ const __m256i v_sq_0123_d = _mm256_add_epi32(v_sq_01_d, v_sq_23_d);
+ v_sse_row = _mm256_add_epi32(v_sse_row, v_sq_0123_d);
+ }
+ const __m256i v_sse_row_low = _mm256_unpacklo_epi32(v_sse_row, zero_reg);
+ const __m256i v_sse_row_hi = _mm256_unpackhi_epi32(v_sse_row, zero_reg);
+ v_sse_row = _mm256_add_epi64(v_sse_row_low, v_sse_row_hi);
+ v_sse_total = _mm256_add_epi64(v_sse_total, v_sse_row);
+ src += 4 * stride;
+ }
+
+ const __m128i v_sum_total_low = _mm256_castsi256_si128(v_sum_total);
+ const __m128i v_sum_total_hi = _mm256_extracti128_si256(v_sum_total, 1);
+ __m128i sum_128bit = _mm_add_epi32(v_sum_total_hi, v_sum_total_low);
+ sum_128bit = _mm_add_epi32(sum_128bit, _mm_srli_si128(sum_128bit, 8));
+ sum_128bit = _mm_add_epi32(sum_128bit, _mm_srli_si128(sum_128bit, 4));
+ *sum += _mm_cvtsi128_si32(sum_128bit);
+
+ __m128i v_sse_total_lo = _mm256_castsi256_si128(v_sse_total);
+ __m128i v_sse_total_hi = _mm256_extracti128_si256(v_sse_total, 1);
+ __m128i sse_128bit = _mm_add_epi64(v_sse_total_lo, v_sse_total_hi);
+
+ sse_128bit =
+ _mm_add_epi64(sse_128bit, _mm_unpackhi_epi64(sse_128bit, sse_128bit));
+
+ xx_storel_64(&result, sse_128bit);
+
+ return result;
+}
+
+uint64_t aom_sum_sse_2d_i16_avx2(const int16_t *src, int src_stride, int width,
+ int height, int *sum) {
+ if (LIKELY(width == 4 && height == 4)) {
+ return aom_sum_sse_2d_i16_4x4_sse2(src, src_stride, sum);
+ } else if (LIKELY(width == 4 && (height & 3) == 0)) {
+ return aom_sum_sse_2d_i16_4xn_sse2(src, src_stride, height, sum);
+ } else if (LIKELY(width == 8 && (height & 3) == 0)) {
+ return aom_sum_sse_2d_i16_nxn_sse2(src, src_stride, width, height, sum);
+ } else if (LIKELY(((width & 15) == 0) && ((height & 3) == 0))) {
+ return aom_sum_sse_2d_i16_nxn_avx2(src, src_stride, width, height, sum);
+ } else {
+ return aom_sum_sse_2d_i16_c(src, src_stride, width, height, sum);
+ }
+}
+
+// Accumulate sum of 16-bit elements in the vector
+static AOM_INLINE int32_t mm256_accumulate_epi16(__m256i vec_a) {
+ __m128i vtmp1 = _mm256_extracti128_si256(vec_a, 1);
+ __m128i vtmp2 = _mm256_castsi256_si128(vec_a);
+ vtmp1 = _mm_add_epi16(vtmp1, vtmp2);
+ vtmp2 = _mm_srli_si128(vtmp1, 8);
+ vtmp1 = _mm_add_epi16(vtmp1, vtmp2);
+ vtmp2 = _mm_srli_si128(vtmp1, 4);
+ vtmp1 = _mm_add_epi16(vtmp1, vtmp2);
+ vtmp2 = _mm_srli_si128(vtmp1, 2);
+ vtmp1 = _mm_add_epi16(vtmp1, vtmp2);
+ return _mm_extract_epi16(vtmp1, 0);
+}
+
+// Accumulate sum of 32-bit elements in the vector
+static AOM_INLINE int32_t mm256_accumulate_epi32(__m256i vec_a) {
+ __m128i vtmp1 = _mm256_extracti128_si256(vec_a, 1);
+ __m128i vtmp2 = _mm256_castsi256_si128(vec_a);
+ vtmp1 = _mm_add_epi32(vtmp1, vtmp2);
+ vtmp2 = _mm_srli_si128(vtmp1, 8);
+ vtmp1 = _mm_add_epi32(vtmp1, vtmp2);
+ vtmp2 = _mm_srli_si128(vtmp1, 4);
+ vtmp1 = _mm_add_epi32(vtmp1, vtmp2);
+ return _mm_cvtsi128_si32(vtmp1);
+}
+
+uint64_t aom_var_2d_u8_avx2(uint8_t *src, int src_stride, int width,
+ int height) {
+ uint8_t *srcp;
+ uint64_t s = 0, ss = 0;
+ __m256i vzero = _mm256_setzero_si256();
+ __m256i v_acc_sum = vzero;
+ __m256i v_acc_sqs = vzero;
+ int i, j;
+
+ // Process 32 elements in a row
+ for (i = 0; i < width - 31; i += 32) {
+ srcp = src + i;
+    // Process 8 rows at a time
+ for (j = 0; j < height - 7; j += 8) {
+ __m256i vsrc[8];
+ for (int k = 0; k < 8; k++) {
+ vsrc[k] = _mm256_loadu_si256((__m256i *)srcp);
+ srcp += src_stride;
+ }
+ for (int k = 0; k < 8; k++) {
+ __m256i vsrc0 = _mm256_unpacklo_epi8(vsrc[k], vzero);
+ __m256i vsrc1 = _mm256_unpackhi_epi8(vsrc[k], vzero);
+ v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc0);
+ v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc1);
+
+ __m256i vsqs0 = _mm256_madd_epi16(vsrc0, vsrc0);
+ __m256i vsqs1 = _mm256_madd_epi16(vsrc1, vsrc1);
+ v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0);
+ v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs1);
+ }
+
+ // Update total sum and clear the vectors
+ s += mm256_accumulate_epi16(v_acc_sum);
+ ss += mm256_accumulate_epi32(v_acc_sqs);
+ v_acc_sum = vzero;
+ v_acc_sqs = vzero;
+ }
+
+ // Process remaining rows (height not a multiple of 8)
+ for (; j < height; j++) {
+ __m256i vsrc = _mm256_loadu_si256((__m256i *)srcp);
+ __m256i vsrc0 = _mm256_unpacklo_epi8(vsrc, vzero);
+ __m256i vsrc1 = _mm256_unpackhi_epi8(vsrc, vzero);
+ v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc0);
+ v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc1);
+
+ __m256i vsqs0 = _mm256_madd_epi16(vsrc0, vsrc0);
+ __m256i vsqs1 = _mm256_madd_epi16(vsrc1, vsrc1);
+ v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0);
+ v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs1);
+
+ srcp += src_stride;
+ }
+
+ // Update total sum and clear the vectors
+ s += mm256_accumulate_epi16(v_acc_sum);
+ ss += mm256_accumulate_epi32(v_acc_sqs);
+ v_acc_sum = vzero;
+ v_acc_sqs = vzero;
+ }
+
+ // Process the remaining area using C
+ srcp = src;
+ for (int k = 0; k < height; k++) {
+ for (int m = i; m < width; m++) {
+ uint8_t val = srcp[m];
+ s += val;
+ ss += val * val;
+ }
+ srcp += src_stride;
+ }
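+  // ss - s * s / (width * height) equals the sum of squared deviations from
+  // the mean (using integer division).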
+ return (ss - s * s / (width * height));
+}
+
+uint64_t aom_var_2d_u16_avx2(uint8_t *src, int src_stride, int width,
+ int height) {
+ uint16_t *srcp1 = CONVERT_TO_SHORTPTR(src), *srcp;
+ uint64_t s = 0, ss = 0;
+ __m256i vzero = _mm256_setzero_si256();
+ __m256i v_acc_sum = vzero;
+ __m256i v_acc_sqs = vzero;
+ int i, j;
+
+ // Process 16 elements in a row
+ for (i = 0; i < width - 15; i += 16) {
+ srcp = srcp1 + i;
+    // Process 8 rows at a time
+ for (j = 0; j < height - 8; j += 8) {
+ __m256i vsrc[8];
+ for (int k = 0; k < 8; k++) {
+ vsrc[k] = _mm256_loadu_si256((__m256i *)srcp);
+ srcp += src_stride;
+ }
+ for (int k = 0; k < 8; k++) {
+ __m256i vsrc0 = _mm256_unpacklo_epi16(vsrc[k], vzero);
+ __m256i vsrc1 = _mm256_unpackhi_epi16(vsrc[k], vzero);
+ v_acc_sum = _mm256_add_epi32(vsrc0, v_acc_sum);
+ v_acc_sum = _mm256_add_epi32(vsrc1, v_acc_sum);
+
+ __m256i vsqs0 = _mm256_madd_epi16(vsrc[k], vsrc[k]);
+ v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0);
+ }
+
+ // Update total sum and clear the vectors
+ s += mm256_accumulate_epi32(v_acc_sum);
+ ss += mm256_accumulate_epi32(v_acc_sqs);
+ v_acc_sum = vzero;
+ v_acc_sqs = vzero;
+ }
+
+ // Process remaining rows (height not a multiple of 8)
+ for (; j < height; j++) {
+ __m256i vsrc = _mm256_loadu_si256((__m256i *)srcp);
+ __m256i vsrc0 = _mm256_unpacklo_epi16(vsrc, vzero);
+ __m256i vsrc1 = _mm256_unpackhi_epi16(vsrc, vzero);
+ v_acc_sum = _mm256_add_epi32(vsrc0, v_acc_sum);
+ v_acc_sum = _mm256_add_epi32(vsrc1, v_acc_sum);
+
+ __m256i vsqs0 = _mm256_madd_epi16(vsrc, vsrc);
+ v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0);
+ srcp += src_stride;
+ }
+
+ // Update total sum and clear the vectors
+ s += mm256_accumulate_epi32(v_acc_sum);
+ ss += mm256_accumulate_epi32(v_acc_sqs);
+ v_acc_sum = vzero;
+ v_acc_sqs = vzero;
+ }
+
+ // Process the remaining area using C
+ srcp = srcp1;
+ for (int k = 0; k < height; k++) {
+ for (int m = i; m < width; m++) {
+ uint16_t val = srcp[m];
+ s += val;
+ ss += val * val;
+ }
+ srcp += src_stride;
+ }
+ return (ss - s * s / (width * height));
+}
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
new file mode 100644
index 0000000000..cf3ed98974
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
@@ -0,0 +1,478 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <stdio.h>
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/sum_squares_sse2.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE __m128i xx_loadh_64(__m128i a, const void *b) {
+ const __m128d ad = _mm_castsi128_pd(a);
+ return _mm_castpd_si128(_mm_loadh_pd(ad, (double *)b));
+}
+
+static INLINE uint64_t xx_cvtsi128_si64(__m128i a) {
+#if AOM_ARCH_X86_64
+ return (uint64_t)_mm_cvtsi128_si64(a);
+#else
+ {
+ uint64_t tmp;
+ _mm_storel_epi64((__m128i *)&tmp, a);
+ return tmp;
+ }
+#endif
+}
+
+static INLINE __m128i sum_squares_i16_4x4_sse2(const int16_t *src, int stride) {
+ const __m128i v_val_0_w = xx_loadl_64(src + 0 * stride);
+ const __m128i v_val_2_w = xx_loadl_64(src + 2 * stride);
+ const __m128i v_val_01_w = xx_loadh_64(v_val_0_w, src + 1 * stride);
+ const __m128i v_val_23_w = xx_loadh_64(v_val_2_w, src + 3 * stride);
+ const __m128i v_sq_01_d = _mm_madd_epi16(v_val_01_w, v_val_01_w);
+ const __m128i v_sq_23_d = _mm_madd_epi16(v_val_23_w, v_val_23_w);
+
+ return _mm_add_epi32(v_sq_01_d, v_sq_23_d);
+}
+
+uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride) {
+ const __m128i v_sum_0123_d = sum_squares_i16_4x4_sse2(src, stride);
+ __m128i v_sum_d =
+ _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32));
+ v_sum_d = _mm_add_epi32(v_sum_d, _mm_srli_si128(v_sum_d, 8));
+ return (uint64_t)_mm_cvtsi128_si32(v_sum_d);
+}
+
+uint64_t aom_sum_sse_2d_i16_4x4_sse2(const int16_t *src, int stride, int *sum) {
+ const __m128i one_reg = _mm_set1_epi16(1);
+ const __m128i v_val_0_w = xx_loadl_64(src + 0 * stride);
+ const __m128i v_val_2_w = xx_loadl_64(src + 2 * stride);
+ __m128i v_val_01_w = xx_loadh_64(v_val_0_w, src + 1 * stride);
+ __m128i v_val_23_w = xx_loadh_64(v_val_2_w, src + 3 * stride);
+
+ __m128i v_sum_0123_d = _mm_add_epi16(v_val_01_w, v_val_23_w);
+ v_sum_0123_d = _mm_madd_epi16(v_sum_0123_d, one_reg);
+ v_sum_0123_d = _mm_add_epi32(v_sum_0123_d, _mm_srli_si128(v_sum_0123_d, 8));
+ v_sum_0123_d = _mm_add_epi32(v_sum_0123_d, _mm_srli_si128(v_sum_0123_d, 4));
+ *sum = _mm_cvtsi128_si32(v_sum_0123_d);
+
+ const __m128i v_sq_01_d = _mm_madd_epi16(v_val_01_w, v_val_01_w);
+ const __m128i v_sq_23_d = _mm_madd_epi16(v_val_23_w, v_val_23_w);
+ __m128i v_sq_0123_d = _mm_add_epi32(v_sq_01_d, v_sq_23_d);
+ v_sq_0123_d = _mm_add_epi32(v_sq_0123_d, _mm_srli_si128(v_sq_0123_d, 8));
+ v_sq_0123_d = _mm_add_epi32(v_sq_0123_d, _mm_srli_si128(v_sq_0123_d, 4));
+ return (uint64_t)_mm_cvtsi128_si32(v_sq_0123_d);
+}
+
+uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
+ int height) {
+ int r = 0;
+ __m128i v_acc_q = _mm_setzero_si128();
+ do {
+ const __m128i v_acc_d = sum_squares_i16_4x4_sse2(src, stride);
+ v_acc_q = _mm_add_epi32(v_acc_q, v_acc_d);
+ src += stride << 2;
+ r += 4;
+ } while (r < height);
+ const __m128i v_zext_mask_q = xx_set1_64_from_32i(~0);
+ __m128i v_acc_64 = _mm_add_epi64(_mm_srli_epi64(v_acc_q, 32),
+ _mm_and_si128(v_acc_q, v_zext_mask_q));
+ v_acc_64 = _mm_add_epi64(v_acc_64, _mm_srli_si128(v_acc_64, 8));
+ return xx_cvtsi128_si64(v_acc_64);
+}
+
+uint64_t aom_sum_sse_2d_i16_4xn_sse2(const int16_t *src, int stride, int height,
+ int *sum) {
+ int r = 0;
+ uint64_t sse = 0;
+ do {
+ int curr_sum = 0;
+ sse += aom_sum_sse_2d_i16_4x4_sse2(src, stride, &curr_sum);
+ *sum += curr_sum;
+ src += stride << 2;
+ r += 4;
+ } while (r < height);
+ return sse;
+}
+
+#ifdef __GNUC__
+// This prevents GCC/Clang from inlining this function into
+// aom_sum_squares_2d_i16_sse2, which in turn saves some stack
+// maintenance instructions in the common case of 4x4.
+__attribute__((noinline))
+#endif
+uint64_t
+aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width,
+ int height) {
+ int r = 0;
+
+ const __m128i v_zext_mask_q = xx_set1_64_from_32i(~0);
+ __m128i v_acc_q = _mm_setzero_si128();
+
+ do {
+ __m128i v_acc_d = _mm_setzero_si128();
+ int c = 0;
+ do {
+ const int16_t *b = src + c;
+
+ const __m128i v_val_0_w = xx_load_128(b + 0 * stride);
+ const __m128i v_val_1_w = xx_load_128(b + 1 * stride);
+ const __m128i v_val_2_w = xx_load_128(b + 2 * stride);
+ const __m128i v_val_3_w = xx_load_128(b + 3 * stride);
+
+ const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
+ const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
+ const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
+ const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
+
+ const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
+ const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
+
+ const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
+
+ v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d);
+ c += 8;
+ } while (c < width);
+
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q));
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32));
+
+ src += 4 * stride;
+ r += 4;
+ } while (r < height);
+
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
+ return xx_cvtsi128_si64(v_acc_q);
+}
+
+#ifdef __GNUC__
+// This prevents GCC/Clang from inlining this function into
+// aom_sum_sse_2d_i16_sse2, which in turn saves some stack
+// maintenance instructions in the common case of 4x4.
+__attribute__((noinline))
+#endif
+uint64_t
+aom_sum_sse_2d_i16_nxn_sse2(const int16_t *src, int stride, int width,
+ int height, int *sum) {
+ int r = 0;
+ uint64_t result;
+ const __m128i zero_reg = _mm_setzero_si128();
+ const __m128i one_reg = _mm_set1_epi16(1);
+
+ __m128i v_sse_total = zero_reg;
+ __m128i v_sum_total = zero_reg;
+
+ do {
+ int c = 0;
+ __m128i v_sse_row = zero_reg;
+ do {
+ const int16_t *b = src + c;
+
+ __m128i v_val_0_w = xx_load_128(b + 0 * stride);
+ __m128i v_val_1_w = xx_load_128(b + 1 * stride);
+ __m128i v_val_2_w = xx_load_128(b + 2 * stride);
+ __m128i v_val_3_w = xx_load_128(b + 3 * stride);
+
+ const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
+ const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
+ const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
+ const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
+ const __m128i v_sq_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
+ const __m128i v_sq_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
+ const __m128i v_sq_0123_d = _mm_add_epi32(v_sq_01_d, v_sq_23_d);
+ v_sse_row = _mm_add_epi32(v_sse_row, v_sq_0123_d);
+
+ const __m128i v_sum_01 = _mm_add_epi16(v_val_0_w, v_val_1_w);
+ const __m128i v_sum_23 = _mm_add_epi16(v_val_2_w, v_val_3_w);
+ __m128i v_sum_0123_d = _mm_add_epi16(v_sum_01, v_sum_23);
+ v_sum_0123_d = _mm_madd_epi16(v_sum_0123_d, one_reg);
+ v_sum_total = _mm_add_epi32(v_sum_total, v_sum_0123_d);
+
+ c += 8;
+ } while (c < width);
+
+ const __m128i v_sse_row_low = _mm_unpacklo_epi32(v_sse_row, zero_reg);
+ const __m128i v_sse_row_hi = _mm_unpackhi_epi32(v_sse_row, zero_reg);
+ v_sse_row = _mm_add_epi64(v_sse_row_low, v_sse_row_hi);
+ v_sse_total = _mm_add_epi64(v_sse_total, v_sse_row);
+ src += 4 * stride;
+ r += 4;
+ } while (r < height);
+
+ v_sum_total = _mm_add_epi32(v_sum_total, _mm_srli_si128(v_sum_total, 8));
+ v_sum_total = _mm_add_epi32(v_sum_total, _mm_srli_si128(v_sum_total, 4));
+ *sum += _mm_cvtsi128_si32(v_sum_total);
+
+ v_sse_total = _mm_add_epi64(v_sse_total, _mm_srli_si128(v_sse_total, 8));
+ xx_storel_64(&result, v_sse_total);
+ return result;
+}
+
+uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width,
+ int height) {
+ // 4 elements per row only requires half an XMM register, so this
+ // must be a special case, but also note that over 75% of all calls
+ // are with size == 4, so it is also the common case.
+ if (LIKELY(width == 4 && height == 4)) {
+ return aom_sum_squares_2d_i16_4x4_sse2(src, stride);
+ } else if (LIKELY(width == 4 && (height & 3) == 0)) {
+ return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height);
+ } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) {
+ // Generic case
+ return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height);
+ } else {
+ return aom_sum_squares_2d_i16_c(src, stride, width, height);
+ }
+}
+
+uint64_t aom_sum_sse_2d_i16_sse2(const int16_t *src, int src_stride, int width,
+ int height, int *sum) {
+ if (LIKELY(width == 4 && height == 4)) {
+ return aom_sum_sse_2d_i16_4x4_sse2(src, src_stride, sum);
+ } else if (LIKELY(width == 4 && (height & 3) == 0)) {
+ return aom_sum_sse_2d_i16_4xn_sse2(src, src_stride, height, sum);
+ } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) {
+ // Generic case
+ return aom_sum_sse_2d_i16_nxn_sse2(src, src_stride, width, height, sum);
+ } else {
+ return aom_sum_sse_2d_i16_c(src, src_stride, width, height, sum);
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// 1D version
+//////////////////////////////////////////////////////////////////////////////
+
+static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) {
+ const __m128i v_zext_mask_q = xx_set1_64_from_32i(~0);
+ __m128i v_acc0_q = _mm_setzero_si128();
+ __m128i v_acc1_q = _mm_setzero_si128();
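+  // Keep the low and high 32-bit halves of each madd result in separate
+  // 64-bit accumulators so that long inputs cannot overflow.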
+
+ const int16_t *const end = src + n;
+
+ assert(n % 64 == 0);
+
+ while (src < end) {
+ const __m128i v_val_0_w = xx_load_128(src);
+ const __m128i v_val_1_w = xx_load_128(src + 8);
+ const __m128i v_val_2_w = xx_load_128(src + 16);
+ const __m128i v_val_3_w = xx_load_128(src + 24);
+ const __m128i v_val_4_w = xx_load_128(src + 32);
+ const __m128i v_val_5_w = xx_load_128(src + 40);
+ const __m128i v_val_6_w = xx_load_128(src + 48);
+ const __m128i v_val_7_w = xx_load_128(src + 56);
+
+ const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
+ const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
+ const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
+ const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
+ const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
+ const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
+ const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
+ const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);
+
+ const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
+ const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
+ const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
+ const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);
+
+ const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
+ const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);
+
+ const __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d, v_sum_4567_d);
+
+ v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_and_si128(v_sum_d, v_zext_mask_q));
+ v_acc1_q = _mm_add_epi64(v_acc1_q, _mm_srli_epi64(v_sum_d, 32));
+
+ src += 64;
+ }
+
+ v_acc0_q = _mm_add_epi64(v_acc0_q, v_acc1_q);
+ v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));
+ return xx_cvtsi128_si64(v_acc0_q);
+}
+
+uint64_t aom_sum_squares_i16_sse2(const int16_t *src, uint32_t n) {
+ if (n % 64 == 0) {
+ return aom_sum_squares_i16_64n_sse2(src, n);
+ } else if (n > 64) {
+ const uint32_t k = n & ~63u;
+ return aom_sum_squares_i16_64n_sse2(src, k) +
+ aom_sum_squares_i16_c(src + k, n - k);
+ } else {
+ return aom_sum_squares_i16_c(src, n);
+ }
+}
+
+// Accumulate sum of 16-bit elements in the vector
+static AOM_INLINE int32_t mm_accumulate_epi16(__m128i vec_a) {
+ __m128i vtmp = _mm_srli_si128(vec_a, 8);
+ vec_a = _mm_add_epi16(vec_a, vtmp);
+ vtmp = _mm_srli_si128(vec_a, 4);
+ vec_a = _mm_add_epi16(vec_a, vtmp);
+ vtmp = _mm_srli_si128(vec_a, 2);
+ vec_a = _mm_add_epi16(vec_a, vtmp);
+ return _mm_extract_epi16(vec_a, 0);
+}
+
+// Accumulate sum of 32-bit elements in the vector
+static AOM_INLINE int32_t mm_accumulate_epi32(__m128i vec_a) {
+ __m128i vtmp = _mm_srli_si128(vec_a, 8);
+ vec_a = _mm_add_epi32(vec_a, vtmp);
+ vtmp = _mm_srli_si128(vec_a, 4);
+ vec_a = _mm_add_epi32(vec_a, vtmp);
+ return _mm_cvtsi128_si32(vec_a);
+}
+
+uint64_t aom_var_2d_u8_sse2(uint8_t *src, int src_stride, int width,
+ int height) {
+ uint8_t *srcp;
+ uint64_t s = 0, ss = 0;
+ __m128i vzero = _mm_setzero_si128();
+ __m128i v_acc_sum = vzero;
+ __m128i v_acc_sqs = vzero;
+ int i, j;
+
+ // Process 16 elements in a row
+ for (i = 0; i < width - 15; i += 16) {
+ srcp = src + i;
+    // Process 8 rows at a time
+ for (j = 0; j < height - 7; j += 8) {
+ __m128i vsrc[8];
+ for (int k = 0; k < 8; k++) {
+ vsrc[k] = _mm_loadu_si128((__m128i *)srcp);
+ srcp += src_stride;
+ }
+ for (int k = 0; k < 8; k++) {
+ __m128i vsrc0 = _mm_unpacklo_epi8(vsrc[k], vzero);
+ __m128i vsrc1 = _mm_unpackhi_epi8(vsrc[k], vzero);
+ v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc0);
+ v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc1);
+
+ __m128i vsqs0 = _mm_madd_epi16(vsrc0, vsrc0);
+ __m128i vsqs1 = _mm_madd_epi16(vsrc1, vsrc1);
+ v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0);
+ v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs1);
+ }
+
+ // Update total sum and clear the vectors
+ s += mm_accumulate_epi16(v_acc_sum);
+ ss += mm_accumulate_epi32(v_acc_sqs);
+ v_acc_sum = vzero;
+ v_acc_sqs = vzero;
+ }
+
+ // Process remaining rows (height not a multiple of 8)
+ for (; j < height; j++) {
+ __m128i vsrc = _mm_loadu_si128((__m128i *)srcp);
+ __m128i vsrc0 = _mm_unpacklo_epi8(vsrc, vzero);
+ __m128i vsrc1 = _mm_unpackhi_epi8(vsrc, vzero);
+ v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc0);
+ v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc1);
+
+ __m128i vsqs0 = _mm_madd_epi16(vsrc0, vsrc0);
+ __m128i vsqs1 = _mm_madd_epi16(vsrc1, vsrc1);
+ v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0);
+ v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs1);
+
+ srcp += src_stride;
+ }
+
+ // Update total sum and clear the vectors
+ s += mm_accumulate_epi16(v_acc_sum);
+ ss += mm_accumulate_epi32(v_acc_sqs);
+ v_acc_sum = vzero;
+ v_acc_sqs = vzero;
+ }
+
+ // Process the remaining area using C
+ srcp = src;
+ for (int k = 0; k < height; k++) {
+ for (int m = i; m < width; m++) {
+ uint8_t val = srcp[m];
+ s += val;
+ ss += val * val;
+ }
+ srcp += src_stride;
+ }
+ return (ss - s * s / (width * height));
+}
+
+uint64_t aom_var_2d_u16_sse2(uint8_t *src, int src_stride, int width,
+ int height) {
+ uint16_t *srcp1 = CONVERT_TO_SHORTPTR(src), *srcp;
+ uint64_t s = 0, ss = 0;
+ __m128i vzero = _mm_setzero_si128();
+ __m128i v_acc_sum = vzero;
+ __m128i v_acc_sqs = vzero;
+ int i, j;
+
+ // Process 8 elements in a row
+ for (i = 0; i < width - 8; i += 8) {
+ srcp = srcp1 + i;
+    // Process 8 rows at a time
+ for (j = 0; j < height - 8; j += 8) {
+ __m128i vsrc[8];
+ for (int k = 0; k < 8; k++) {
+ vsrc[k] = _mm_loadu_si128((__m128i *)srcp);
+ srcp += src_stride;
+ }
+ for (int k = 0; k < 8; k++) {
+ __m128i vsrc0 = _mm_unpacklo_epi16(vsrc[k], vzero);
+ __m128i vsrc1 = _mm_unpackhi_epi16(vsrc[k], vzero);
+ v_acc_sum = _mm_add_epi32(vsrc0, v_acc_sum);
+ v_acc_sum = _mm_add_epi32(vsrc1, v_acc_sum);
+
+ __m128i vsqs0 = _mm_madd_epi16(vsrc[k], vsrc[k]);
+ v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0);
+ }
+
+ // Update total sum and clear the vectors
+ s += mm_accumulate_epi32(v_acc_sum);
+ ss += mm_accumulate_epi32(v_acc_sqs);
+ v_acc_sum = vzero;
+ v_acc_sqs = vzero;
+ }
+
+ // Process remaining rows (height not a multiple of 8)
+ for (; j < height; j++) {
+ __m128i vsrc = _mm_loadu_si128((__m128i *)srcp);
+ __m128i vsrc0 = _mm_unpacklo_epi16(vsrc, vzero);
+ __m128i vsrc1 = _mm_unpackhi_epi16(vsrc, vzero);
+ v_acc_sum = _mm_add_epi32(vsrc0, v_acc_sum);
+ v_acc_sum = _mm_add_epi32(vsrc1, v_acc_sum);
+
+ __m128i vsqs0 = _mm_madd_epi16(vsrc, vsrc);
+ v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0);
+ srcp += src_stride;
+ }
+
+ // Update total sum and clear the vectors
+ s += mm_accumulate_epi32(v_acc_sum);
+ ss += mm_accumulate_epi32(v_acc_sqs);
+ v_acc_sum = vzero;
+ v_acc_sqs = vzero;
+ }
+
+ // Process the remaining area using C
+ srcp = srcp1;
+ for (int k = 0; k < height; k++) {
+ for (int m = i; m < width; m++) {
+ uint16_t val = srcp[m];
+ s += val;
+ ss += val * val;
+ }
+ srcp += src_stride;
+ }
+ return (ss - s * s / (width * height));
+}
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.h b/third_party/aom/aom_dsp/x86/sum_squares_sse2.h
new file mode 100644
index 0000000000..5ed3f2c7bf
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sum_squares_sse2.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_SUM_SQUARES_SSE2_H_
+#define AOM_DSP_X86_SUM_SQUARES_SSE2_H_
+
+uint64_t aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride,
+ int width, int height);
+
+uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
+ int height);
+uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride);
+
+uint64_t aom_sum_sse_2d_i16_4x4_sse2(const int16_t *src, int stride, int *sum);
+uint64_t aom_sum_sse_2d_i16_4xn_sse2(const int16_t *src, int stride, int height,
+ int *sum);
+uint64_t aom_sum_sse_2d_i16_nxn_sse2(const int16_t *src, int stride, int width,
+ int height, int *sum);
+
+#endif // AOM_DSP_X86_SUM_SQUARES_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/synonyms.h b/third_party/aom/aom_dsp/x86/synonyms.h
new file mode 100644
index 0000000000..6744ec51d0
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/synonyms.h
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_SYNONYMS_H_
+#define AOM_AOM_DSP_X86_SYNONYMS_H_
+
+#include <immintrin.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+/**
+ * Various reusable shorthands for x86 SIMD intrinsics.
+ *
+ * Intrinsics prefixed with xx_ operate on or return 128-bit XMM registers.
+ * Intrinsics prefixed with yy_ operate on or return 256-bit YMM registers.
+ */
+
+// Loads and stores to do away with the tedium of casting the address
+// to the right type.
+static INLINE __m128i xx_loadl_32(const void *a) {
+ int val;
+ memcpy(&val, a, sizeof(val));
+ return _mm_cvtsi32_si128(val);
+}
+
+static INLINE __m128i xx_loadl_64(const void *a) {
+ return _mm_loadl_epi64((const __m128i *)a);
+}
+
+static INLINE __m128i xx_load_128(const void *a) {
+ return _mm_load_si128((const __m128i *)a);
+}
+
+static INLINE __m128i xx_loadu_128(const void *a) {
+ return _mm_loadu_si128((const __m128i *)a);
+}
+
+static INLINE void xx_storel_32(void *const a, const __m128i v) {
+ const int val = _mm_cvtsi128_si32(v);
+ memcpy(a, &val, sizeof(val));
+}
+
+static INLINE void xx_storel_64(void *const a, const __m128i v) {
+ _mm_storel_epi64((__m128i *)a, v);
+}
+
+static INLINE void xx_store_128(void *const a, const __m128i v) {
+ _mm_store_si128((__m128i *)a, v);
+}
+
+static INLINE void xx_storeu_128(void *const a, const __m128i v) {
+ _mm_storeu_si128((__m128i *)a, v);
+}
+
+// The _mm_set_epi64x() intrinsic is undefined for some Visual Studio
+// compilers. The following function is equivalent to _mm_set_epi64x()
+// acting on 32-bit integers.
+static INLINE __m128i xx_set_64_from_32i(int32_t e1, int32_t e0) {
+#if defined(_MSC_VER) && _MSC_VER < 1900
+ return _mm_set_epi32(0, e1, 0, e0);
+#else
+ return _mm_set_epi64x((uint32_t)e1, (uint32_t)e0);
+#endif
+}
+
+// The _mm_set1_epi64x() intrinsic is undefined for some Visual Studio
+// compilers. The following function is equivalent to _mm_set1_epi64x()
+// acting on a 32-bit integer.
+static INLINE __m128i xx_set1_64_from_32i(int32_t a) {
+#if defined(_MSC_VER) && _MSC_VER < 1900
+ return _mm_set_epi32(0, a, 0, a);
+#else
+ return _mm_set1_epi64x((uint32_t)a);
+#endif
+}
+
+// Fill an SSE register using an interleaved pair of values, i.e. set the
+// 8 channels to {a, b, a, b, a, b, a, b}, using the same channel ordering
+// as when a register is stored to / loaded from memory.
+//
+// This is useful for rearranging filter kernels for use with the _mm_madd_epi16
+// instruction.
+static INLINE __m128i xx_set2_epi16(int16_t a, int16_t b) {
+ return _mm_setr_epi16(a, b, a, b, a, b, a, b);
+}
+
+static INLINE __m128i xx_round_epu16(__m128i v_val_w) {
+ return _mm_avg_epu16(v_val_w, _mm_setzero_si128());
+}
+
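+// Equivalent to ROUND_POWER_OF_TWO(v_val_w, bits) for unsigned 16-bit values:
+// drop (bits - 1) bits, then do a rounding halve with the averaging
+// instruction.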
+static INLINE __m128i xx_roundn_epu16(__m128i v_val_w, int bits) {
+ const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1);
+ return _mm_avg_epu16(v_s_w, _mm_setzero_si128());
+}
+
+static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) {
+ const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+ return _mm_srli_epi32(v_tmp_d, bits);
+}
+
+static INLINE __m128i xx_roundn_epi16_unsigned(__m128i v_val_d, int bits) {
+ const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1);
+ const __m128i v_tmp_d = _mm_add_epi16(v_val_d, v_bias_d);
+ return _mm_srai_epi16(v_tmp_d, bits);
+}
+
+// This is equivalent to ROUND_POWER_OF_TWO(v_val_d, bits)
+static INLINE __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int bits) {
+ const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+ return _mm_srai_epi32(v_tmp_d, bits);
+}
+
+static INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int bits) {
+ const __m128i v_bias_d = _mm_set1_epi16((1 << bits) >> 1);
+ const __m128i v_sign_d = _mm_srai_epi16(v_val_d, 15);
+ const __m128i v_tmp_d =
+ _mm_add_epi16(_mm_add_epi16(v_val_d, v_bias_d), v_sign_d);
+ return _mm_srai_epi16(v_tmp_d, bits);
+}
+
+#endif // AOM_AOM_DSP_X86_SYNONYMS_H_
diff --git a/third_party/aom/aom_dsp/x86/synonyms_avx2.h b/third_party/aom/aom_dsp/x86/synonyms_avx2.h
new file mode 100644
index 0000000000..b729e5f410
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/synonyms_avx2.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
+#define AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+/**
+ * Various reusable shorthands for x86 SIMD intrinsics.
+ *
+ * Intrinsics prefixed with xx_ operate on or return 128-bit XMM registers.
+ * Intrinsics prefixed with yy_ operate on or return 256-bit YMM registers.
+ */
+
+// Loads and stores to do away with the tedium of casting the address
+// to the right type.
+static INLINE __m256i yy_load_256(const void *a) {
+ return _mm256_load_si256((const __m256i *)a);
+}
+
+static INLINE __m256i yy_loadu_256(const void *a) {
+ return _mm256_loadu_si256((const __m256i *)a);
+}
+
+static INLINE void yy_store_256(void *const a, const __m256i v) {
+ _mm256_store_si256((__m256i *)a, v);
+}
+
+static INLINE void yy_storeu_256(void *const a, const __m256i v) {
+ _mm256_storeu_si256((__m256i *)a, v);
+}
+
+// The _mm256_set1_epi64x() intrinsic is undefined for some Visual Studio
+// compilers. The following function is equivalent to _mm256_set1_epi64x()
+// acting on a 32-bit integer.
+static INLINE __m256i yy_set1_64_from_32i(int32_t a) {
+#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
+ return _mm256_set_epi32(0, a, 0, a, 0, a, 0, a);
+#else
+ return _mm256_set1_epi64x((uint32_t)a);
+#endif
+}
+
+// Some compilers don't have _mm256_set_m128i defined in immintrin.h. We
+// therefore define an equivalent function using a different intrinsic.
+// ([ hi ], [ lo ]) -> [ hi ][ lo ]
+static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) {
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+}
+
+static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) {
+ __m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
+ __m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
+ return yy_set_m128i(mhi, mlo);
+}
+
+static INLINE void yy_storeu2_128(void *hi, void *lo, const __m256i a) {
+ _mm_storeu_si128((__m128i *)hi, _mm256_extracti128_si256(a, 1));
+ _mm_storeu_si128((__m128i *)lo, _mm256_castsi256_si128(a));
+}
+
+static INLINE __m256i yy_roundn_epu16(__m256i v_val_w, int bits) {
+ const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1);
+ return _mm256_avg_epu16(v_s_w, _mm256_setzero_si256());
+}
+#endif // AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/transpose_sse2.h b/third_party/aom/aom_dsp/x86/transpose_sse2.h
new file mode 100644
index 0000000000..9dab750f44
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/transpose_sse2.h
@@ -0,0 +1,424 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
+#define AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
+
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_config.h"
+
+static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) {
+ // Unpack 8 bit elements. Goes from:
+ // in[0]: 00 01 02 03
+ // in[1]: 10 11 12 13
+ // in[2]: 20 21 22 23
+ // in[3]: 30 31 32 33
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+
+ // Unpack 16 bit elements resulting in:
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ return _mm_unpacklo_epi16(a0, a1);
+}
+
+static INLINE void transpose_8bit_8x8(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 8 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+ // in[4]: 40 41 42 43 44 45 46 47
+ // in[5]: 50 51 52 53 54 55 56 57
+ // in[6]: 60 61 62 63 64 65 66 67
+ // in[7]: 70 71 72 73 74 75 76 77
+ // to:
+ // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]);
+
+ // Unpack 16 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ // b1: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+ // b2: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ // b3: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+ const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
+ const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
+ const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
+ const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
+
+ // Unpack 32 bit elements resulting in:
+ // c0: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ // c1: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // c2: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ // c3: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ const __m128i c0 = _mm_unpacklo_epi32(b0, b2);
+ const __m128i c1 = _mm_unpackhi_epi32(b0, b2);
+ const __m128i c2 = _mm_unpacklo_epi32(b1, b3);
+ const __m128i c3 = _mm_unpackhi_epi32(b1, b3);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 02 12 22 32 42 52 62 72
+ // out[3]: 03 13 23 33 43 53 63 73
+ // out[4]: 04 14 24 34 44 54 64 74
+ // out[5]: 05 15 25 35 45 55 65 75
+ // out[6]: 06 16 26 36 46 56 66 76
+ // out[7]: 07 17 27 37 47 57 67 77
+ out[0] = _mm_unpacklo_epi64(c0, c0);
+ out[1] = _mm_unpackhi_epi64(c0, c0);
+ out[2] = _mm_unpacklo_epi64(c1, c1);
+ out[3] = _mm_unpackhi_epi64(c1, c1);
+ out[4] = _mm_unpacklo_epi64(c2, c2);
+ out[5] = _mm_unpackhi_epi64(c2, c2);
+ out[6] = _mm_unpacklo_epi64(c3, c3);
+ out[7] = _mm_unpackhi_epi64(c3, c3);
+}
+
+static INLINE void transpose_16bit_4x4(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 XX XX XX XX
+ // in[1]: 10 11 12 13 XX XX XX XX
+ // in[2]: 20 21 22 23 XX XX XX XX
+ // in[3]: 30 31 32 33 XX XX XX XX
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+
+ // Unpack 32 bit elements resulting in:
+ // out[0]: 00 10 20 30 01 11 21 31
+ // out[1]: 01 11 21 31 __ __ __ __
+ // out[2]: 02 12 22 32 03 13 23 33
+ // out[3]: 03 13 23 33 __ __ __ __
+ //
+ // Note: The high 64 bits of the output registers are shown for informational
+ // purposes only. Callers should only use the low 64 bits of the output
+ // registers. "__" indicates zeros.
+ out[0] = _mm_unpacklo_epi32(a0, a1);
+ out[1] = _mm_srli_si128(out[0], 8);
+ out[2] = _mm_unpackhi_epi32(a0, a1);
+ out[3] = _mm_srli_si128(out[2], 8);
+}
+
+static INLINE void transpose_16bit_4x8(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 XX XX XX XX
+ // in[1]: 10 11 12 13 XX XX XX XX
+ // in[2]: 20 21 22 23 XX XX XX XX
+ // in[3]: 30 31 32 33 XX XX XX XX
+ // in[4]: 40 41 42 43 XX XX XX XX
+ // in[5]: 50 51 52 53 XX XX XX XX
+ // in[6]: 60 61 62 63 XX XX XX XX
+ // in[7]: 70 71 72 73 XX XX XX XX
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ // a2: 40 50 41 51 42 52 43 53
+ // a3: 60 70 61 71 62 72 63 73
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
+
+ // Unpack 32 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31
+ // b1: 40 50 60 70 41 51 61 71
+ // b2: 02 12 22 32 03 13 23 33
+ // b3: 42 52 62 72 43 53 63 73
+ const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+ const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+ const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
+ const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 02 12 22 32 42 52 62 72
+ // out[3]: 03 13 23 33 43 53 63 73
+ out[0] = _mm_unpacklo_epi64(b0, b1);
+ out[1] = _mm_unpackhi_epi64(b0, b1);
+ out[2] = _mm_unpacklo_epi64(b2, b3);
+ out[3] = _mm_unpackhi_epi64(b2, b3);
+}
+
+static INLINE void transpose_16bit_8x4(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ // a4: 04 14 05 15 06 16 07 17
+ // a5: 24 34 25 35 26 36 27 37
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
+
+ // Unpack 32 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31
+ // b2: 04 14 24 34 05 15 25 35
+ // b4: 02 12 22 32 03 13 23 33
+ // b6: 06 16 26 36 07 17 27 37
+ const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+ const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
+ const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
+ const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 XX XX XX XX
+ // out[1]: 01 11 21 31 XX XX XX XX
+ // out[2]: 02 12 22 32 XX XX XX XX
+ // out[3]: 03 13 23 33 XX XX XX XX
+ // out[4]: 04 14 24 34 XX XX XX XX
+ // out[5]: 05 15 25 35 XX XX XX XX
+ // out[6]: 06 16 26 36 XX XX XX XX
+ // out[7]: 07 17 27 37 XX XX XX XX
+ const __m128i zeros = _mm_setzero_si128();
+ out[0] = _mm_unpacklo_epi64(b0, zeros);
+ out[1] = _mm_unpackhi_epi64(b0, zeros);
+ out[2] = _mm_unpacklo_epi64(b4, zeros);
+ out[3] = _mm_unpackhi_epi64(b4, zeros);
+ out[4] = _mm_unpacklo_epi64(b2, zeros);
+ out[5] = _mm_unpackhi_epi64(b2, zeros);
+ out[6] = _mm_unpacklo_epi64(b6, zeros);
+ out[7] = _mm_unpackhi_epi64(b6, zeros);
+}
+
+static INLINE void transpose_16bit_8x8(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+ // in[4]: 40 41 42 43 44 45 46 47
+ // in[5]: 50 51 52 53 54 55 56 57
+ // in[6]: 60 61 62 63 64 65 66 67
+ // in[7]: 70 71 72 73 74 75 76 77
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ // a2: 40 50 41 51 42 52 43 53
+ // a3: 60 70 61 71 62 72 63 73
+ // a4: 04 14 05 15 06 16 07 17
+ // a5: 24 34 25 35 26 36 27 37
+ // a6: 44 54 45 55 46 56 47 57
+ // a7: 64 74 65 75 66 76 67 77
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
+ const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
+ const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
+ const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);
+
+ // Unpack 32 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31
+ // b1: 40 50 60 70 41 51 61 71
+ // b2: 04 14 24 34 05 15 25 35
+ // b3: 44 54 64 74 45 55 65 75
+ // b4: 02 12 22 32 03 13 23 33
+ // b5: 42 52 62 72 43 53 63 73
+ // b6: 06 16 26 36 07 17 27 37
+ // b7: 46 56 66 76 47 57 67 77
+ const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+ const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+ const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
+ const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
+ const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
+ const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
+ const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
+ const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 02 12 22 32 42 52 62 72
+ // out[3]: 03 13 23 33 43 53 63 73
+ // out[4]: 04 14 24 34 44 54 64 74
+ // out[5]: 05 15 25 35 45 55 65 75
+ // out[6]: 06 16 26 36 46 56 66 76
+ // out[7]: 07 17 27 37 47 57 67 77
+ out[0] = _mm_unpacklo_epi64(b0, b1);
+ out[1] = _mm_unpackhi_epi64(b0, b1);
+ out[2] = _mm_unpacklo_epi64(b4, b5);
+ out[3] = _mm_unpackhi_epi64(b4, b5);
+ out[4] = _mm_unpacklo_epi64(b2, b3);
+ out[5] = _mm_unpackhi_epi64(b2, b3);
+ out[6] = _mm_unpacklo_epi64(b6, b7);
+ out[7] = _mm_unpackhi_epi64(b6, b7);
+}
+
+// Transpose in-place
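+// A 16x16 transpose built from four 8x8 transposes: the diagonal 8x8
+// sub-blocks are transposed in place, while the off-diagonal sub-blocks are
+// transposed and exchanged between `left` and `right` (tbuf temporarily holds
+// the transposed top-right block so neither operand is overwritten before it
+// is read).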
+static INLINE void transpose_16bit_16x16(__m128i *const left,
+ __m128i *const right) {
+ __m128i tbuf[8];
+ transpose_16bit_8x8(left, left);
+ transpose_16bit_8x8(right, tbuf);
+ transpose_16bit_8x8(left + 8, right);
+ transpose_16bit_8x8(right + 8, right + 8);
+
+ left[8] = tbuf[0];
+ left[9] = tbuf[1];
+ left[10] = tbuf[2];
+ left[11] = tbuf[3];
+ left[12] = tbuf[4];
+ left[13] = tbuf[5];
+ left[14] = tbuf[6];
+ left[15] = tbuf[7];
+}
+
+static INLINE void transpose_32bit_4x4(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 32 bit elements. Goes from:
+ // in[0]: 00 01 02 03
+ // in[1]: 10 11 12 13
+ // in[2]: 20 21 22 23
+ // in[3]: 30 31 32 33
+ // to:
+ // a0: 00 10 01 11
+ // a1: 20 30 21 31
+ // a2: 02 12 03 13
+ // a3: 22 32 23 33
+
+ const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
+ const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
+ const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30
+ // out[1]: 01 11 21 31
+ // out[2]: 02 12 22 32
+ // out[3]: 03 13 23 33
+ out[0] = _mm_unpacklo_epi64(a0, a1);
+ out[1] = _mm_unpackhi_epi64(a0, a1);
+ out[2] = _mm_unpacklo_epi64(a2, a3);
+ out[3] = _mm_unpackhi_epi64(a2, a3);
+}
+
+static INLINE void transpose_32bit_4x4x2(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 32 bit elements. Goes from:
+ // in[0]: 00 01 02 03
+ // in[1]: 10 11 12 13
+ // in[2]: 20 21 22 23
+ // in[3]: 30 31 32 33
+ // in[4]: 04 05 06 07
+ // in[5]: 14 15 16 17
+ // in[6]: 24 25 26 27
+ // in[7]: 34 35 36 37
+ // to:
+ // a0: 00 10 01 11
+ // a1: 20 30 21 31
+ // a2: 02 12 03 13
+ // a3: 22 32 23 33
+ // a4: 04 14 05 15
+ // a5: 24 34 25 35
+ // a6: 06 16 07 17
+ // a7: 26 36 27 37
+ const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
+ const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
+ const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
+ const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]);
+ const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]);
+ const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]);
+ const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30
+ // out[1]: 01 11 21 31
+ // out[2]: 02 12 22 32
+ // out[3]: 03 13 23 33
+ // out[4]: 04 14 24 34
+ // out[5]: 05 15 25 35
+ // out[6]: 06 16 26 36
+ // out[7]: 07 17 27 37
+ out[0] = _mm_unpacklo_epi64(a0, a1);
+ out[1] = _mm_unpackhi_epi64(a0, a1);
+ out[2] = _mm_unpacklo_epi64(a2, a3);
+ out[3] = _mm_unpackhi_epi64(a2, a3);
+ out[4] = _mm_unpacklo_epi64(a4, a5);
+ out[5] = _mm_unpackhi_epi64(a4, a5);
+ out[6] = _mm_unpacklo_epi64(a6, a7);
+ out[7] = _mm_unpackhi_epi64(a6, a7);
+}
+
+static INLINE void transpose_32bit_8x4(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 32 bit elements. Goes from:
+ // in[0]: 00 01 02 03
+ // in[1]: 04 05 06 07
+ // in[2]: 10 11 12 13
+ // in[3]: 14 15 16 17
+ // in[4]: 20 21 22 23
+ // in[5]: 24 25 26 27
+ // in[6]: 30 31 32 33
+ // in[7]: 34 35 36 37
+ // to:
+ // a0: 00 10 01 11
+ // a1: 20 30 21 31
+ // a2: 02 12 03 13
+ // a3: 22 32 23 33
+ // a4: 04 14 05 15
+ // a5: 24 34 25 35
+ // a6: 06 16 07 17
+ // a7: 26 36 27 37
+ const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]);
+ const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]);
+ const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]);
+ const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]);
+ const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]);
+ const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]);
+ const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]);
+ const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30
+ // out[1]: 01 11 21 31
+ // out[2]: 02 12 22 32
+ // out[3]: 03 13 23 33
+ // out[4]: 04 14 24 34
+ // out[5]: 05 15 25 35
+ // out[6]: 06 16 26 36
+ // out[7]: 07 17 27 37
+ out[0] = _mm_unpacklo_epi64(a0, a1);
+ out[1] = _mm_unpackhi_epi64(a0, a1);
+ out[2] = _mm_unpacklo_epi64(a2, a3);
+ out[3] = _mm_unpackhi_epi64(a2, a3);
+ out[4] = _mm_unpacklo_epi64(a4, a5);
+ out[5] = _mm_unpackhi_epi64(a4, a5);
+ out[6] = _mm_unpacklo_epi64(a6, a7);
+ out[7] = _mm_unpackhi_epi64(a6, a7);
+}
+
+#endif // AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
new file mode 100644
index 0000000000..4105250bc0
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_
+#define AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_
+
+#include <emmintrin.h>
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
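+// Broadcast the 16-bit pair (a, b) into every 32-bit lane, giving the
+// { a, b, a, b, ... } layout consumed by _mm256_madd_epi16.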
+static INLINE __m256i pair_set_w16_epi16(int16_t a, int16_t b) {
+ return _mm256_set1_epi32(
+ (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)));
+}
+
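+// Butterfly over 16 lanes of 16-bit values. With w0 = pair(c0, c1) and
+// w1 = pair(d0, d1), computes per lane
+//   in0 = (c0 * in0 + c1 * in1 + _r) >> cos_bit
+//   in1 = (d0 * in0 + d1 * in1 + _r) >> cos_bit
+// and packs the 32-bit intermediates back to 16 bits with saturation.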
+static INLINE void btf_16_w16_avx2(const __m256i w0, const __m256i w1,
+ __m256i *in0, __m256i *in1, const __m256i _r,
+ const int32_t cos_bit) {
+ __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1);
+ __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1);
+ __m256i u0 = _mm256_madd_epi16(t0, w0);
+ __m256i u1 = _mm256_madd_epi16(t1, w0);
+ __m256i v0 = _mm256_madd_epi16(t0, w1);
+ __m256i v1 = _mm256_madd_epi16(t1, w1);
+
+ __m256i a0 = _mm256_add_epi32(u0, _r);
+ __m256i a1 = _mm256_add_epi32(u1, _r);
+ __m256i b0 = _mm256_add_epi32(v0, _r);
+ __m256i b1 = _mm256_add_epi32(v1, _r);
+
+ __m256i c0 = _mm256_srai_epi32(a0, cos_bit);
+ __m256i c1 = _mm256_srai_epi32(a1, cos_bit);
+ __m256i d0 = _mm256_srai_epi32(b0, cos_bit);
+ __m256i d1 = _mm256_srai_epi32(b1, cos_bit);
+
+ *in0 = _mm256_packs_epi32(c0, c1);
+ *in1 = _mm256_packs_epi32(d0, d1);
+}
+
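+// Add/subtract butterfly with 16-bit saturation: in0 becomes in0 + in1 and
+// in1 becomes the original in0 - in1.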
+static INLINE void btf_16_adds_subs_avx2(__m256i *in0, __m256i *in1) {
+ const __m256i _in0 = *in0;
+ const __m256i _in1 = *in1;
+ *in0 = _mm256_adds_epi16(_in0, _in1);
+ *in1 = _mm256_subs_epi16(_in0, _in1);
+}
+
+static INLINE void btf_32_add_sub_avx2(__m256i *in0, __m256i *in1) {
+ const __m256i _in0 = *in0;
+ const __m256i _in1 = *in1;
+ *in0 = _mm256_add_epi32(_in0, _in1);
+ *in1 = _mm256_sub_epi32(_in0, _in1);
+}
+
+static INLINE void btf_16_adds_subs_out_avx2(__m256i *out0, __m256i *out1,
+ __m256i in0, __m256i in1) {
+ const __m256i _in0 = in0;
+ const __m256i _in1 = in1;
+ *out0 = _mm256_adds_epi16(_in0, _in1);
+ *out1 = _mm256_subs_epi16(_in0, _in1);
+}
+
+static INLINE void btf_32_add_sub_out_avx2(__m256i *out0, __m256i *out1,
+ __m256i in0, __m256i in1) {
+ const __m256i _in0 = in0;
+ const __m256i _in1 = in1;
+ *out0 = _mm256_add_epi32(_in0, _in1);
+ *out1 = _mm256_sub_epi32(_in0, _in1);
+}
+
+static INLINE __m256i load_16bit_to_16bit_avx2(const int16_t *a) {
+ return _mm256_load_si256((const __m256i *)a);
+}
+
+static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in,
+ int stride, __m256i *out,
+ int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = load_16bit_to_16bit_avx2(in + i * stride);
+ }
+}
+
+static INLINE void load_buffer_16bit_to_16bit_flip_avx2(const int16_t *in,
+ int stride,
+ __m256i *out,
+ int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[out_size - i - 1] = load_16bit_to_16bit_avx2(in + i * stride);
+ }
+}
+
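+// Narrow 16 consecutive 32-bit values to 16 bits with signed saturation; the
+// final permute undoes the lane interleaving of _mm256_packs_epi32 so the
+// output keeps the original element order.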
+static INLINE __m256i load_32bit_to_16bit_w16_avx2(const int32_t *a) {
+ const __m256i a_low = _mm256_lddqu_si256((const __m256i *)a);
+ const __m256i b = _mm256_packs_epi32(a_low, *(const __m256i *)(a + 8));
+ return _mm256_permute4x64_epi64(b, 0xD8);
+}
+
+static INLINE void load_buffer_32bit_to_16bit_w16_avx2(const int32_t *in,
+ int stride, __m256i *out,
+ int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = load_32bit_to_16bit_w16_avx2(in + i * stride);
+ }
+}
+
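+// Transpose the 8x8 block of 16-bit values held in each 128-bit lane of
+// in[0..7]; the two lanes are transposed independently.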
+static INLINE void transpose2_8x8_avx2(const __m256i *const in,
+ __m256i *const out) {
+ __m256i t[16], u[16];
+ // (1st, 2nd) ==> (lo, hi)
+ // (0, 1) ==> (0, 1)
+ // (2, 3) ==> (2, 3)
+ // (4, 5) ==> (4, 5)
+ // (6, 7) ==> (6, 7)
+ for (int i = 0; i < 4; i++) {
+ t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]);
+ t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]);
+ }
+
+ // (1st, 2nd) ==> (lo, hi)
+ // (0, 2) ==> (0, 2)
+ // (1, 3) ==> (1, 3)
+ // (4, 6) ==> (4, 6)
+ // (5, 7) ==> (5, 7)
+ for (int i = 0; i < 2; i++) {
+ u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]);
+ u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]);
+
+ u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]);
+ u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]);
+ }
+
+ // (1st, 2nd) ==> (lo, hi)
+ // (0, 4) ==> (0, 1)
+ // (1, 5) ==> (4, 5)
+ // (2, 6) ==> (2, 3)
+ // (3, 7) ==> (6, 7)
+ for (int i = 0; i < 2; i++) {
+ out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]);
+ out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]);
+
+ out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]);
+ out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]);
+ }
+}
+
+static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in,
+ __m256i *const out) {
+ __m256i t[16];
+
+#define LOADL(idx) \
+ t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
+ t[idx] = _mm256_inserti128_si256( \
+ t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1);
+
+#define LOADR(idx) \
+ t[8 + idx] = \
+ _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
+ t[8 + idx] = _mm256_inserti128_si256( \
+ t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1);
+
+ // load left 8x16
+ LOADL(0)
+ LOADL(1)
+ LOADL(2)
+ LOADL(3)
+ LOADL(4)
+ LOADL(5)
+ LOADL(6)
+ LOADL(7)
+
+ // load right 8x16
+ LOADR(0)
+ LOADR(1)
+ LOADR(2)
+ LOADR(3)
+ LOADR(4)
+ LOADR(5)
+ LOADR(6)
+ LOADR(7)
+
+ // get the top 16x8 result
+ transpose2_8x8_avx2(t, out);
+ // get the bottom 16x8 result
+ transpose2_8x8_avx2(&t[8], &out[8]);
+}
+
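+// Transpose a 16x8 block of 16-bit values. Every unpack operates within
+// 128-bit lanes, so the left and right 8x8 halves are transposed
+// independently (left half in the low lanes, right half in the high lanes).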
+static INLINE void transpose_16bit_16x8_avx2(const __m256i *const in,
+ __m256i *const out) {
+ const __m256i a0 = _mm256_unpacklo_epi16(in[0], in[1]);
+ const __m256i a1 = _mm256_unpacklo_epi16(in[2], in[3]);
+ const __m256i a2 = _mm256_unpacklo_epi16(in[4], in[5]);
+ const __m256i a3 = _mm256_unpacklo_epi16(in[6], in[7]);
+ const __m256i a4 = _mm256_unpackhi_epi16(in[0], in[1]);
+ const __m256i a5 = _mm256_unpackhi_epi16(in[2], in[3]);
+ const __m256i a6 = _mm256_unpackhi_epi16(in[4], in[5]);
+ const __m256i a7 = _mm256_unpackhi_epi16(in[6], in[7]);
+
+ const __m256i b0 = _mm256_unpacklo_epi32(a0, a1);
+ const __m256i b1 = _mm256_unpacklo_epi32(a2, a3);
+ const __m256i b2 = _mm256_unpacklo_epi32(a4, a5);
+ const __m256i b3 = _mm256_unpacklo_epi32(a6, a7);
+ const __m256i b4 = _mm256_unpackhi_epi32(a0, a1);
+ const __m256i b5 = _mm256_unpackhi_epi32(a2, a3);
+ const __m256i b6 = _mm256_unpackhi_epi32(a4, a5);
+ const __m256i b7 = _mm256_unpackhi_epi32(a6, a7);
+
+ out[0] = _mm256_unpacklo_epi64(b0, b1);
+ out[1] = _mm256_unpackhi_epi64(b0, b1);
+ out[2] = _mm256_unpacklo_epi64(b4, b5);
+ out[3] = _mm256_unpackhi_epi64(b4, b5);
+ out[4] = _mm256_unpacklo_epi64(b2, b3);
+ out[5] = _mm256_unpackhi_epi64(b2, b3);
+ out[6] = _mm256_unpacklo_epi64(b6, b7);
+ out[7] = _mm256_unpackhi_epi64(b6, b7);
+}
+
+static INLINE void flip_buf_avx2(__m256i *in, __m256i *out, int size) {
+ for (int i = 0; i < size; ++i) {
+ out[size - i - 1] = in[i];
+ }
+}
+
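+// Rounding shift on 16-bit lanes: a negative `bit` rounds and shifts right by
+// -bit, a positive `bit` shifts left by bit.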
+static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) {
+ if (bit < 0) {
+ bit = -bit;
+ __m256i round = _mm256_set1_epi16(1 << (bit - 1));
+ for (int i = 0; i < size; ++i) {
+ in[i] = _mm256_adds_epi16(in[i], round);
+ in[i] = _mm256_srai_epi16(in[i], bit);
+ }
+ } else if (bit > 0) {
+ for (int i = 0; i < size; ++i) {
+ in[i] = _mm256_slli_epi16(in[i], bit);
+ }
+ }
+}
+
+static INLINE __m256i round_shift_32_avx2(__m256i vec, int bit) {
+ __m256i tmp, round;
+ round = _mm256_set1_epi32(1 << (bit - 1));
+ tmp = _mm256_add_epi32(vec, round);
+ return _mm256_srai_epi32(tmp, bit);
+}
+
+static INLINE void round_shift_array_32_avx2(__m256i *input, __m256i *output,
+ const int size, const int bit) {
+ if (bit > 0) {
+ int i;
+ for (i = 0; i < size; i++) {
+ output[i] = round_shift_32_avx2(input[i], bit);
+ }
+ } else {
+ int i;
+ for (i = 0; i < size; i++) {
+ output[i] = _mm256_slli_epi32(input[i], -bit);
+ }
+ }
+}
+
+static INLINE void round_shift_rect_array_32_avx2(__m256i *input,
+ __m256i *output,
+ const int size, const int bit,
+ const int val) {
+ const __m256i sqrt2 = _mm256_set1_epi32(val);
+ if (bit > 0) {
+ int i;
+ for (i = 0; i < size; i++) {
+ const __m256i r0 = round_shift_32_avx2(input[i], bit);
+ const __m256i r1 = _mm256_mullo_epi32(sqrt2, r0);
+ output[i] = round_shift_32_avx2(r1, NewSqrt2Bits);
+ }
+ } else {
+ int i;
+ for (i = 0; i < size; i++) {
+ const __m256i r0 = _mm256_slli_epi32(input[i], -bit);
+ const __m256i r1 = _mm256_mullo_epi32(sqrt2, r0);
+ output[i] = round_shift_32_avx2(r1, NewSqrt2Bits);
+ }
+ }
+}
+
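+// As used here, `a` holds { value, 1 } 16-bit pairs; multiplying against
+// { scale, 1 << (NewSqrt2Bits - 1) } yields value * scale + rounding in each
+// 32-bit lane, which is then shifted down by NewSqrt2Bits.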
+static INLINE __m256i scale_round_avx2(const __m256i a, const int scale) {
+ const __m256i scale_rounding =
+ pair_set_w16_epi16(scale, 1 << (NewSqrt2Bits - 1));
+ const __m256i b = _mm256_madd_epi16(a, scale_rounding);
+ return _mm256_srai_epi32(b, NewSqrt2Bits);
+}
+
+static INLINE void store_rect_16bit_to_32bit_w8_avx2(const __m256i a,
+ int32_t *const b) {
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i a_lo = _mm256_unpacklo_epi16(a, one);
+ const __m256i a_hi = _mm256_unpackhi_epi16(a, one);
+ const __m256i b_lo = scale_round_avx2(a_lo, NewSqrt2);
+ const __m256i b_hi = scale_round_avx2(a_hi, NewSqrt2);
+ const __m256i temp = _mm256_permute2f128_si256(b_lo, b_hi, 0x31);
+ _mm_store_si128((__m128i *)b, _mm256_castsi256_si128(b_lo));
+ _mm_store_si128((__m128i *)(b + 4), _mm256_castsi256_si128(b_hi));
+ _mm256_store_si256((__m256i *)(b + 64), temp);
+}
+
+static INLINE void store_rect_buffer_16bit_to_32bit_w8_avx2(
+ const __m256i *const in, int32_t *const out, const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ store_rect_16bit_to_32bit_w8_avx2(in[i], out + i * stride);
+ }
+}
+
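+// Combine eight pairs of 128-bit registers into eight 256-bit registers, with
+// in1[i] in the low lane and in2[i] in the high lane.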
+static INLINE void pack_reg(const __m128i *in1, const __m128i *in2,
+ __m256i *out) {
+ out[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[0]), in2[0], 0x1);
+ out[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[1]), in2[1], 0x1);
+ out[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[2]), in2[2], 0x1);
+ out[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[3]), in2[3], 0x1);
+ out[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[4]), in2[4], 0x1);
+ out[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[5]), in2[5], 0x1);
+ out[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[6]), in2[6], 0x1);
+ out[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[7]), in2[7], 0x1);
+}
+
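+// Split eight 256-bit registers into sixteen 128-bit registers: the low lanes
+// go to out1[0..7] and the high lanes to out1[8..15].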
+static INLINE void extract_reg(const __m256i *in, __m128i *out1) {
+ out1[0] = _mm256_castsi256_si128(in[0]);
+ out1[1] = _mm256_castsi256_si128(in[1]);
+ out1[2] = _mm256_castsi256_si128(in[2]);
+ out1[3] = _mm256_castsi256_si128(in[3]);
+ out1[4] = _mm256_castsi256_si128(in[4]);
+ out1[5] = _mm256_castsi256_si128(in[5]);
+ out1[6] = _mm256_castsi256_si128(in[6]);
+ out1[7] = _mm256_castsi256_si128(in[7]);
+
+ out1[8] = _mm256_extracti128_si256(in[0], 0x01);
+ out1[9] = _mm256_extracti128_si256(in[1], 0x01);
+ out1[10] = _mm256_extracti128_si256(in[2], 0x01);
+ out1[11] = _mm256_extracti128_si256(in[3], 0x01);
+ out1[12] = _mm256_extracti128_si256(in[4], 0x01);
+ out1[13] = _mm256_extracti128_si256(in[5], 0x01);
+ out1[14] = _mm256_extracti128_si256(in[6], 0x01);
+ out1[15] = _mm256_extracti128_si256(in[7], 0x01);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
new file mode 100644
index 0000000000..9c99eb93bd
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_
+#define AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_
+
+#include <emmintrin.h>
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#define pair_set_epi16(a, b) \
+ _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)))
+
+// Reverse the 8 16 bit words in __m128i
+static INLINE __m128i mm_reverse_epi16(const __m128i x) {
+ const __m128i a = _mm_shufflelo_epi16(x, 0x1b);
+ const __m128i b = _mm_shufflehi_epi16(a, 0x1b);
+ return _mm_shuffle_epi32(b, 0x4e);
+}
+
+#define octa_set_epi16(a, b, c, d, e, f, g, h) \
+ _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \
+ (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h))
+
+#endif // AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/variance_avx2.c b/third_party/aom/aom_dsp/x86/variance_avx2.c
new file mode 100644
index 0000000000..046d6f10f8
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/variance_avx2.c
@@ -0,0 +1,961 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/masked_variance_intrin_ssse3.h"
+#include "aom_dsp/x86/synonyms.h"
+
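+// Fold a 256-bit register by adding its high 128-bit lane to its low lane
+// (16-bit and 32-bit element variants below).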
+static INLINE __m128i mm256_add_hi_lo_epi16(const __m256i val) {
+ return _mm_add_epi16(_mm256_castsi256_si128(val),
+ _mm256_extractf128_si256(val, 1));
+}
+
+static INLINE __m128i mm256_add_hi_lo_epi32(const __m256i val) {
+ return _mm_add_epi32(_mm256_castsi256_si128(val),
+ _mm256_extractf128_si256(val, 1));
+}
+
+static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref,
+ __m256i *const sse,
+ __m256i *const sum) {
+ const __m256i adj_sub = _mm256_set1_epi16((short)0xff01); // (1,-1)
+
+ // unpack into pairs of source and reference values
+ const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref);
+ const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref);
+
+ // subtract adjacent elements using src*1 + ref*-1
+ const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub);
+ const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub);
+ const __m256i madd0 = _mm256_madd_epi16(diff0, diff0);
+ const __m256i madd1 = _mm256_madd_epi16(diff1, diff1);
+
+ // add to the running totals
+ *sum = _mm256_add_epi16(*sum, _mm256_add_epi16(diff0, diff1));
+ *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1));
+}
+
+static INLINE int variance_final_from_32bit_sum_avx2(__m256i vsse, __m128i vsum,
+ unsigned int *const sse) {
+ // extract the low lane and add it to the high lane
+ const __m128i sse_reg_128 = mm256_add_hi_lo_epi32(vsse);
+
+ // unpack sse and sum registers and add
+ const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum);
+ const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum);
+ const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi);
+
+ // perform the final summation and extract the results
+ const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8));
+ *((int *)sse) = _mm_cvtsi128_si32(res);
+ return _mm_extract_epi32(res, 1);
+}
+
+// handle blocks with at most 512 pixels
+static INLINE int variance_final_512_avx2(__m256i vsse, __m256i vsum,
+ unsigned int *const sse) {
+ // extract the low lane and add it to the high lane
+ const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum);
+ const __m128i vsum_64 = _mm_add_epi16(vsum_128, _mm_srli_si128(vsum_128, 8));
+ const __m128i sum_int32 = _mm_cvtepi16_epi32(vsum_64);
+ return variance_final_from_32bit_sum_avx2(vsse, sum_int32, sse);
+}
+
+// handle 1024 pixels (32x32, 16x64, 64x16)
+static INLINE int variance_final_1024_avx2(__m256i vsse, __m256i vsum,
+ unsigned int *const sse) {
+ // extract the low lane and add it to the high lane
+ const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum);
+ const __m128i vsum_64 =
+ _mm_add_epi32(_mm_cvtepi16_epi32(vsum_128),
+ _mm_cvtepi16_epi32(_mm_srli_si128(vsum_128, 8)));
+ return variance_final_from_32bit_sum_avx2(vsse, vsum_64, sse);
+}
+
+static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) {
+ const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum));
+ const __m256i sum_hi =
+ _mm256_cvtepi16_epi32(_mm256_extractf128_si256(sum, 1));
+ return _mm256_add_epi32(sum_lo, sum_hi);
+}
+
+// handle 2048 pixels (32x64, 64x32)
+static INLINE int variance_final_2048_avx2(__m256i vsse, __m256i vsum,
+ unsigned int *const sse) {
+ vsum = sum_to_32bit_avx2(vsum);
+ const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum);
+ return variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse);
+}
+
+static INLINE void variance16_kernel_avx2(
+ const uint8_t *const src, const int src_stride, const uint8_t *const ref,
+ const int ref_stride, __m256i *const sse, __m256i *const sum) {
+ const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
+ const __m128i s1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
+ const __m128i r0 = _mm_loadu_si128((__m128i const *)(ref + 0 * ref_stride));
+ const __m128i r1 = _mm_loadu_si128((__m128i const *)(ref + 1 * ref_stride));
+ const __m256i s = _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1);
+ const __m256i r = _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1);
+ variance_kernel_avx2(s, r, sse, sum);
+}
+
+static INLINE void variance32_kernel_avx2(const uint8_t *const src,
+ const uint8_t *const ref,
+ __m256i *const sse,
+ __m256i *const sum) {
+ const __m256i s = _mm256_loadu_si256((__m256i const *)(src));
+ const __m256i r = _mm256_loadu_si256((__m256i const *)(ref));
+ variance_kernel_avx2(s, r, sse, sum);
+}
+
+static INLINE void variance16_avx2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m256i *const vsse,
+ __m256i *const vsum) {
+ *vsum = _mm256_setzero_si256();
+
+ for (int i = 0; i < h; i += 2) {
+ variance16_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum);
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ }
+}
+
+static INLINE void variance32_avx2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m256i *const vsse,
+ __m256i *const vsum) {
+ *vsum = _mm256_setzero_si256();
+
+ for (int i = 0; i < h; i++) {
+ variance32_kernel_avx2(src, ref, vsse, vsum);
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+static INLINE void variance64_avx2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m256i *const vsse,
+ __m256i *const vsum) {
+ *vsum = _mm256_setzero_si256();
+
+ for (int i = 0; i < h; i++) {
+ variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum);
+ variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum);
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+static INLINE void variance128_avx2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m256i *const vsse,
+ __m256i *const vsum) {
+ *vsum = _mm256_setzero_si256();
+
+ for (int i = 0; i < h; i++) {
+ variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum);
+ variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum);
+ variance32_kernel_avx2(src + 64, ref + 64, vsse, vsum);
+ variance32_kernel_avx2(src + 96, ref + 96, vsse, vsum);
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
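+// Variance kernels that accumulate the whole block in a single pass; `bits` is
+// log2(bw * bh) and `max_pixel` selects the variance_final_<max_pixel>_avx2
+// reduction used to finish the sums.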
+#define AOM_VAR_NO_LOOP_AVX2(bw, bh, bits, max_pixel) \
+ unsigned int aom_variance##bw##x##bh##_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ __m256i vsse = _mm256_setzero_si256(); \
+ __m256i vsum; \
+ variance##bw##_avx2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum); \
+ const int sum = variance_final_##max_pixel##_avx2(vsse, vsum, sse); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \
+ }
+
+AOM_VAR_NO_LOOP_AVX2(16, 8, 7, 512)
+AOM_VAR_NO_LOOP_AVX2(16, 16, 8, 512)
+AOM_VAR_NO_LOOP_AVX2(16, 32, 9, 512)
+
+AOM_VAR_NO_LOOP_AVX2(32, 16, 9, 512)
+AOM_VAR_NO_LOOP_AVX2(32, 32, 10, 1024)
+AOM_VAR_NO_LOOP_AVX2(32, 64, 11, 2048)
+
+AOM_VAR_NO_LOOP_AVX2(64, 32, 11, 2048)
+
+#if !CONFIG_REALTIME_ONLY
+AOM_VAR_NO_LOOP_AVX2(64, 16, 10, 1024)
+AOM_VAR_NO_LOOP_AVX2(32, 8, 8, 512)
+AOM_VAR_NO_LOOP_AVX2(16, 64, 10, 1024)
+AOM_VAR_NO_LOOP_AVX2(16, 4, 6, 512)
+#endif
+
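+// For the largest blocks, the 16-bit sums are widened to 32 bits after every
+// `uh` rows to avoid overflow before the final reduction.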
+#define AOM_VAR_LOOP_AVX2(bw, bh, bits, uh) \
+ unsigned int aom_variance##bw##x##bh##_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ __m256i vsse = _mm256_setzero_si256(); \
+ __m256i vsum = _mm256_setzero_si256(); \
+ for (int i = 0; i < (bh / uh); i++) { \
+ __m256i vsum16; \
+ variance##bw##_avx2(src, src_stride, ref, ref_stride, uh, &vsse, \
+ &vsum16); \
+ vsum = _mm256_add_epi32(vsum, sum_to_32bit_avx2(vsum16)); \
+ src += uh * src_stride; \
+ ref += uh * ref_stride; \
+ } \
+ const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum); \
+ const int sum = variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> bits); \
+ }
+
+AOM_VAR_LOOP_AVX2(64, 64, 12, 32) // 64x32 * ( 64/32)
+AOM_VAR_LOOP_AVX2(64, 128, 13, 32) // 64x32 * (128/32)
+AOM_VAR_LOOP_AVX2(128, 64, 13, 16) // 128x16 * ( 64/16)
+AOM_VAR_LOOP_AVX2(128, 128, 14, 16) // 128x16 * (128/16)
+
+unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ aom_variance16x16_avx2(src, src_stride, ref, ref_stride, sse);
+ return *sse;
+}
+
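+// Load two unaligned 16-byte rows into one 256-bit register: p1 goes to the
+// low lane and p0 to the high lane (8-bit and 16-bit pixel variants below).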
+static INLINE __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) {
+ const __m256i d =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1));
+ return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1);
+}
+
+static INLINE __m256i mm256_loadu2_16(const uint16_t *p0, const uint16_t *p1) {
+ const __m256i d =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1));
+ return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1);
+}
+
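+// Blend one 32-byte row: comp = (s0 * m + s1 * (64 - m) + 32) >> 6, using
+// maddubs for the weighted sum and mulhrs for the rounding shift.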
+static INLINE void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1,
+ const __m256i a,
+ uint8_t *comp_pred) {
+ const __m256i alpha_max = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const int16_t round_bits = 15 - AOM_BLEND_A64_ROUND_BITS;
+ const __m256i round_offset = _mm256_set1_epi16(1 << (round_bits));
+
+ const __m256i ma = _mm256_sub_epi8(alpha_max, a);
+
+ const __m256i ssAL = _mm256_unpacklo_epi8(s0, s1);
+ const __m256i aaAL = _mm256_unpacklo_epi8(a, ma);
+ const __m256i ssAH = _mm256_unpackhi_epi8(s0, s1);
+ const __m256i aaAH = _mm256_unpackhi_epi8(a, ma);
+
+ const __m256i blendAL = _mm256_maddubs_epi16(ssAL, aaAL);
+ const __m256i blendAH = _mm256_maddubs_epi16(ssAH, aaAH);
+ const __m256i roundAL = _mm256_mulhrs_epi16(blendAL, round_offset);
+ const __m256i roundAH = _mm256_mulhrs_epi16(blendAH, round_offset);
+
+ const __m256i roundA = _mm256_packus_epi16(roundAL, roundAH);
+ _mm256_storeu_si256((__m256i *)(comp_pred), roundA);
+}
+
+void aom_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride) {
+ int row = 0;
+ if (width == 8) {
+ do {
+ const __m256i pred_0123 = _mm256_loadu_si256((const __m256i *)(pred));
+ const __m128i ref_0 = _mm_loadl_epi64((const __m128i *)(ref));
+ const __m128i ref_1 =
+ _mm_loadl_epi64((const __m128i *)(ref + ref_stride));
+ const __m128i ref_2 =
+ _mm_loadl_epi64((const __m128i *)(ref + 2 * ref_stride));
+ const __m128i ref_3 =
+ _mm_loadl_epi64((const __m128i *)(ref + 3 * ref_stride));
+ const __m128i ref_01 = _mm_unpacklo_epi64(ref_0, ref_1);
+ const __m128i ref_23 = _mm_unpacklo_epi64(ref_2, ref_3);
+
+ const __m256i ref_0123 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(ref_01), ref_23, 1);
+ const __m256i average = _mm256_avg_epu8(pred_0123, ref_0123);
+ _mm256_storeu_si256((__m256i *)(comp_pred), average);
+
+ row += 4;
+ pred += 32;
+ comp_pred += 32;
+ ref += 4 * ref_stride;
+ } while (row < height);
+ } else if (width == 16) {
+ do {
+ const __m256i pred_0 = _mm256_loadu_si256((const __m256i *)(pred));
+ const __m256i pred_1 = _mm256_loadu_si256((const __m256i *)(pred + 32));
+ const __m256i tmp0 =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(ref)));
+ const __m256i ref_0 = _mm256_inserti128_si256(
+ tmp0, _mm_loadu_si128((const __m128i *)(ref + ref_stride)), 1);
+ const __m256i tmp1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(ref + 2 * ref_stride)));
+ const __m256i ref_1 = _mm256_inserti128_si256(
+ tmp1, _mm_loadu_si128((const __m128i *)(ref + 3 * ref_stride)), 1);
+ const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0);
+ const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1);
+ _mm256_storeu_si256((__m256i *)(comp_pred), average_0);
+ _mm256_storeu_si256((__m256i *)(comp_pred + 32), average_1);
+
+ row += 4;
+ pred += 64;
+ comp_pred += 64;
+ ref += 4 * ref_stride;
+ } while (row < height);
+ } else if (width == 32) {
+ do {
+ const __m256i pred_0 = _mm256_loadu_si256((const __m256i *)(pred));
+ const __m256i pred_1 = _mm256_loadu_si256((const __m256i *)(pred + 32));
+ const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)(ref));
+ const __m256i ref_1 =
+ _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
+ const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0);
+ const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1);
+ _mm256_storeu_si256((__m256i *)(comp_pred), average_0);
+ _mm256_storeu_si256((__m256i *)(comp_pred + 32), average_1);
+
+ row += 2;
+ pred += 64;
+ comp_pred += 64;
+ ref += 2 * ref_stride;
+ } while (row < height);
+ } else if (width % 64 == 0) {
+ do {
+ for (int x = 0; x < width; x += 64) {
+ const __m256i pred_0 = _mm256_loadu_si256((const __m256i *)(pred + x));
+ const __m256i pred_1 =
+ _mm256_loadu_si256((const __m256i *)(pred + x + 32));
+ const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)(ref + x));
+ const __m256i ref_1 =
+ _mm256_loadu_si256((const __m256i *)(ref + x + 32));
+ const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0);
+ const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1);
+ _mm256_storeu_si256((__m256i *)(comp_pred + x), average_0);
+ _mm256_storeu_si256((__m256i *)(comp_pred + x + 32), average_1);
+ }
+ row++;
+ pred += width;
+ comp_pred += width;
+ ref += ref_stride;
+ } while (row < height);
+ } else {
+ aom_comp_avg_pred_c(comp_pred, pred, width, height, ref, ref_stride);
+ }
+}
+
+void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride,
+ const uint8_t *mask, int mask_stride,
+ int invert_mask) {
+ int i = 0;
+ const uint8_t *src0 = invert_mask ? pred : ref;
+ const uint8_t *src1 = invert_mask ? ref : pred;
+ const int stride0 = invert_mask ? width : ref_stride;
+ const int stride1 = invert_mask ? ref_stride : width;
+ if (width == 8) {
+ comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1,
+ mask, mask_stride);
+ } else if (width == 16) {
+ do {
+ const __m256i sA0 = mm256_loadu2(src0 + stride0, src0);
+ const __m256i sA1 = mm256_loadu2(src1 + stride1, src1);
+ const __m256i aA = mm256_loadu2(mask + mask_stride, mask);
+ src0 += (stride0 << 1);
+ src1 += (stride1 << 1);
+ mask += (mask_stride << 1);
+ const __m256i sB0 = mm256_loadu2(src0 + stride0, src0);
+ const __m256i sB1 = mm256_loadu2(src1 + stride1, src1);
+ const __m256i aB = mm256_loadu2(mask + mask_stride, mask);
+ src0 += (stride0 << 1);
+ src1 += (stride1 << 1);
+ mask += (mask_stride << 1);
+ // comp_pred's stride == width == 16
+ comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred);
+ comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32);
+ comp_pred += (16 << 2);
+ i += 4;
+ } while (i < height);
+ } else {
+ do {
+ for (int x = 0; x < width; x += 32) {
+ const __m256i sA0 = _mm256_lddqu_si256((const __m256i *)(src0 + x));
+ const __m256i sA1 = _mm256_lddqu_si256((const __m256i *)(src1 + x));
+ const __m256i aA = _mm256_lddqu_si256((const __m256i *)(mask + x));
+
+ comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred);
+ comp_pred += 32;
+ }
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ i++;
+ } while (i < height);
+ }
+}
+
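+// Blend one row of 16-bit pixels: (s0 * m + s1 * (64 - m) + 32) >> 6 per
+// element, packed back to 16 bits.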
+static INLINE __m256i highbd_comp_mask_pred_line_avx2(const __m256i s0,
+ const __m256i s1,
+ const __m256i a) {
+ const __m256i alpha_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m256i round_const =
+ _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m256i a_inv = _mm256_sub_epi16(alpha_max, a);
+
+ const __m256i s_lo = _mm256_unpacklo_epi16(s0, s1);
+ const __m256i a_lo = _mm256_unpacklo_epi16(a, a_inv);
+ const __m256i pred_lo = _mm256_madd_epi16(s_lo, a_lo);
+ const __m256i pred_l = _mm256_srai_epi32(
+ _mm256_add_epi32(pred_lo, round_const), AOM_BLEND_A64_ROUND_BITS);
+
+ const __m256i s_hi = _mm256_unpackhi_epi16(s0, s1);
+ const __m256i a_hi = _mm256_unpackhi_epi16(a, a_inv);
+ const __m256i pred_hi = _mm256_madd_epi16(s_hi, a_hi);
+ const __m256i pred_h = _mm256_srai_epi32(
+ _mm256_add_epi32(pred_hi, round_const), AOM_BLEND_A64_ROUND_BITS);
+
+ const __m256i comp = _mm256_packs_epi32(pred_l, pred_h);
+
+ return comp;
+}
+
+void aom_highbd_comp_mask_pred_avx2(uint8_t *comp_pred8, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride, const uint8_t *mask,
+ int mask_stride, int invert_mask) {
+ int i = 0;
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+ const uint16_t *src0 = invert_mask ? pred : ref;
+ const uint16_t *src1 = invert_mask ? ref : pred;
+ const int stride0 = invert_mask ? width : ref_stride;
+ const int stride1 = invert_mask ? ref_stride : width;
+ const __m256i zero = _mm256_setzero_si256();
+
+ if (width == 8) {
+ do {
+ const __m256i s0 = mm256_loadu2_16(src0 + stride0, src0);
+ const __m256i s1 = mm256_loadu2_16(src1 + stride1, src1);
+
+ const __m128i m_l = _mm_loadl_epi64((const __m128i *)mask);
+ const __m128i m_h = _mm_loadl_epi64((const __m128i *)(mask + 8));
+
+ __m256i m = _mm256_castsi128_si256(m_l);
+ m = _mm256_insertf128_si256(m, m_h, 1);
+ const __m256i m_16 = _mm256_unpacklo_epi8(m, zero);
+
+ const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16);
+
+ _mm_storeu_si128((__m128i *)(comp_pred), _mm256_castsi256_si128(comp));
+
+ _mm_storeu_si128((__m128i *)(comp_pred + width),
+ _mm256_extractf128_si256(comp, 1));
+
+ src0 += (stride0 << 1);
+ src1 += (stride1 << 1);
+ mask += (mask_stride << 1);
+ comp_pred += (width << 1);
+ i += 2;
+ } while (i < height);
+ } else if (width == 16) {
+ do {
+ const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0));
+ const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1));
+ const __m256i m_16 =
+ _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask));
+
+ const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16);
+
+ _mm256_storeu_si256((__m256i *)comp_pred, comp);
+
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ comp_pred += width;
+ i += 1;
+ } while (i < height);
+ } else {
+ do {
+ for (int x = 0; x < width; x += 32) {
+ const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0 + x));
+ const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src0 + x + 16));
+ const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1 + x));
+ const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src1 + x + 16));
+
+ const __m256i m01_16 =
+ _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(mask + x)));
+ const __m256i m23_16 = _mm256_cvtepu8_epi16(
+ _mm_loadu_si128((const __m128i *)(mask + x + 16)));
+
+ const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m01_16);
+ const __m256i comp1 = highbd_comp_mask_pred_line_avx2(s2, s3, m23_16);
+
+ _mm256_storeu_si256((__m256i *)comp_pred, comp);
+ _mm256_storeu_si256((__m256i *)(comp_pred + 16), comp1);
+
+ comp_pred += 32;
+ }
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ i += 1;
+ } while (i < height);
+ }
+}
+
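+// Sum of squared differences between a 4-wide block of 8-bit dst pixels and
+// 16-bit src pixels over h rows (four rows per iteration).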
+uint64_t aom_mse_4xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
+ int sstride, int h) {
+ uint64_t sum = 0;
+ __m128i dst0_4x8, dst1_4x8, dst2_4x8, dst3_4x8, dst_16x8;
+ __m128i src0_4x16, src1_4x16, src2_4x16, src3_4x16;
+ __m256i src0_8x16, src1_8x16, dst_16x16, src_16x16;
+ __m256i res0_4x64, res1_4x64;
+ __m256i sub_result;
+ const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+ __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+ for (int i = 0; i < h; i += 4) {
+ dst0_4x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 0) * dstride]));
+ dst1_4x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 1) * dstride]));
+ dst2_4x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 2) * dstride]));
+ dst3_4x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 3) * dstride]));
+ dst_16x8 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(dst0_4x8, dst1_4x8),
+ _mm_unpacklo_epi32(dst2_4x8, dst3_4x8));
+ dst_16x16 = _mm256_cvtepu8_epi16(dst_16x8);
+
+ src0_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride]));
+ src1_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride]));
+ src2_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 2) * sstride]));
+ src3_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 3) * sstride]));
+ src0_8x16 =
+ _mm256_castsi128_si256(_mm_unpacklo_epi64(src0_4x16, src1_4x16));
+ src1_8x16 =
+ _mm256_castsi128_si256(_mm_unpacklo_epi64(src2_4x16, src3_4x16));
+ src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20);
+
+ // r15 r14 r13------------r1 r0 - 16 bit
+ sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16));
+
+ // s7 s6 s5 s4 s3 s2 s1 s0 - 32bit
+ src_16x16 = _mm256_madd_epi16(sub_result, sub_result);
+
+ // accumulation of result
+ square_result = _mm256_add_epi32(square_result, src_16x16);
+ }
+
+ // s5 s4 s1 s0 - 64bit
+ res0_4x64 = _mm256_unpacklo_epi32(square_result, zeros);
+ // s7 s6 s3 s2 - 64bit
+ res1_4x64 = _mm256_unpackhi_epi32(square_result, zeros);
+ // r3 r2 r1 r0 - 64bit
+ res0_4x64 = _mm256_add_epi64(res0_4x64, res1_4x64);
+ // r1+r3 r2+r0 - 64bit
+ const __m128i sum_1x64 =
+ _mm_add_epi64(_mm256_castsi256_si128(res0_4x64),
+ _mm256_extracti128_si256(res0_4x64, 1));
+ xx_storel_64(&sum, _mm_add_epi64(sum_1x64, _mm_srli_si128(sum_1x64, 8)));
+ return sum;
+}
+
+// Compute the mse of four consecutive 4x4 blocks.
+// In the src buffer, each 4x4 block of a 32x32 filter block is stored
+// sequentially, hence src_blk_stride is the same as the block width. The dst
+// buffer, on the other hand, is a frame buffer, so dstride is a frame-level
+// stride.
+uint64_t aom_mse_4xh_quad_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
+ int src_blk_stride, int h) {
+ uint64_t sum = 0;
+ __m128i dst0_16x8, dst1_16x8, dst2_16x8, dst3_16x8;
+ __m256i dst0_16x16, dst1_16x16, dst2_16x16, dst3_16x16;
+ __m256i res0_4x64, res1_4x64;
+ __m256i sub_result_0, sub_result_1, sub_result_2, sub_result_3;
+ const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+ __m256i square_result = zeros;
+ uint16_t *src_temp = src;
+
+ for (int i = 0; i < h; i += 4) {
+ dst0_16x8 = _mm_loadu_si128((__m128i *)(&dst[(i + 0) * dstride]));
+ dst1_16x8 = _mm_loadu_si128((__m128i *)(&dst[(i + 1) * dstride]));
+ dst2_16x8 = _mm_loadu_si128((__m128i *)(&dst[(i + 2) * dstride]));
+ dst3_16x8 = _mm_loadu_si128((__m128i *)(&dst[(i + 3) * dstride]));
+
+ // row0 of 1st,2nd, 3rd and 4th 4x4 blocks- d00 d10 d20 d30
+ dst0_16x16 = _mm256_cvtepu8_epi16(dst0_16x8);
+ // row1 of 1st,2nd, 3rd and 4th 4x4 blocks - d01 d11 d21 d31
+ dst1_16x16 = _mm256_cvtepu8_epi16(dst1_16x8);
+ // row2 of 1st,2nd, 3rd and 4th 4x4 blocks - d02 d12 d22 d32
+ dst2_16x16 = _mm256_cvtepu8_epi16(dst2_16x8);
+ // row3 of 1st,2nd, 3rd and 4th 4x4 blocks - d03 d13 d23 d33
+ dst3_16x16 = _mm256_cvtepu8_epi16(dst3_16x8);
+
+ // All rows of 1st 4x4 block - r00 r01 r02 r03
+ __m256i src0_16x16 = _mm256_loadu_si256((__m256i const *)(&src_temp[0]));
+ // All rows of 2nd 4x4 block - r10 r11 r12 r13
+ __m256i src1_16x16 =
+ _mm256_loadu_si256((__m256i const *)(&src_temp[src_blk_stride]));
+ // All rows of 3rd 4x4 block - r20 r21 r22 r23
+ __m256i src2_16x16 =
+ _mm256_loadu_si256((__m256i const *)(&src_temp[2 * src_blk_stride]));
+ // All rows of 4th 4x4 block - r30 r31 r32 r33
+ __m256i src3_16x16 =
+ _mm256_loadu_si256((__m256i const *)(&src_temp[3 * src_blk_stride]));
+
+ // r00 r10 r02 r12
+ __m256i tmp0_16x16 = _mm256_unpacklo_epi64(src0_16x16, src1_16x16);
+ // r01 r11 r03 r13
+ __m256i tmp1_16x16 = _mm256_unpackhi_epi64(src0_16x16, src1_16x16);
+ // r20 r30 r22 r32
+ __m256i tmp2_16x16 = _mm256_unpacklo_epi64(src2_16x16, src3_16x16);
+ // r21 r31 r23 r33
+ __m256i tmp3_16x16 = _mm256_unpackhi_epi64(src2_16x16, src3_16x16);
+
+ // r00 r10 r20 r30
+ src0_16x16 = _mm256_permute2f128_si256(tmp0_16x16, tmp2_16x16, 0x20);
+ // r01 r11 r21 r31
+ src1_16x16 = _mm256_permute2f128_si256(tmp1_16x16, tmp3_16x16, 0x20);
+ // r02 r12 r22 r32
+ src2_16x16 = _mm256_permute2f128_si256(tmp0_16x16, tmp2_16x16, 0x31);
+ // r03 r13 r23 r33
+ src3_16x16 = _mm256_permute2f128_si256(tmp1_16x16, tmp3_16x16, 0x31);
+
+ // r15 r14 r13------------r1 r0 - 16 bit
+ sub_result_0 = _mm256_abs_epi16(_mm256_sub_epi16(src0_16x16, dst0_16x16));
+ sub_result_1 = _mm256_abs_epi16(_mm256_sub_epi16(src1_16x16, dst1_16x16));
+ sub_result_2 = _mm256_abs_epi16(_mm256_sub_epi16(src2_16x16, dst2_16x16));
+ sub_result_3 = _mm256_abs_epi16(_mm256_sub_epi16(src3_16x16, dst3_16x16));
+
+ // s7 s6 s5 s4 s3 s2 s1 s0 - 32bit
+ src0_16x16 = _mm256_madd_epi16(sub_result_0, sub_result_0);
+ src1_16x16 = _mm256_madd_epi16(sub_result_1, sub_result_1);
+ src2_16x16 = _mm256_madd_epi16(sub_result_2, sub_result_2);
+ src3_16x16 = _mm256_madd_epi16(sub_result_3, sub_result_3);
+
+ // accumulation of result
+ src0_16x16 = _mm256_add_epi32(src0_16x16, src1_16x16);
+ src2_16x16 = _mm256_add_epi32(src2_16x16, src3_16x16);
+ const __m256i square_result_0 = _mm256_add_epi32(src0_16x16, src2_16x16);
+ square_result = _mm256_add_epi32(square_result, square_result_0);
+ src_temp += 16;
+ }
+
+ // s5 s4 s1 s0 - 64bit
+ res0_4x64 = _mm256_unpacklo_epi32(square_result, zeros);
+ // s7 s6 s3 s2 - 64bit
+ res1_4x64 = _mm256_unpackhi_epi32(square_result, zeros);
+ // r3 r2 r1 r0 - 64bit
+ res0_4x64 = _mm256_add_epi64(res0_4x64, res1_4x64);
+ // r1+r3 r2+r0 - 64bit
+ const __m128i sum_1x64 =
+ _mm_add_epi64(_mm256_castsi256_si128(res0_4x64),
+ _mm256_extracti128_si256(res0_4x64, 1));
+ xx_storel_64(&sum, _mm_add_epi64(sum_1x64, _mm_srli_si128(sum_1x64, 8)));
+ return sum;
+}
+
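+// Sum of squared differences between an 8-wide block of 8-bit dst pixels and
+// 16-bit src pixels over h rows (two rows per iteration).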
+uint64_t aom_mse_8xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
+ int sstride, int h) {
+ uint64_t sum = 0;
+ __m128i dst0_8x8, dst1_8x8, dst3_16x8;
+ __m256i src0_8x16, src1_8x16, src_16x16, dst_16x16;
+ __m256i res0_4x64, res1_4x64;
+ __m256i sub_result;
+ const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+ __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+
+ for (int i = 0; i < h; i += 2) {
+ dst0_8x8 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 0) * dstride]));
+ dst1_8x8 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 1) * dstride]));
+ dst3_16x8 = _mm_unpacklo_epi64(dst0_8x8, dst1_8x8);
+ dst_16x16 = _mm256_cvtepu8_epi16(dst3_16x8);
+
+ src0_8x16 =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&src[i * sstride]));
+ src1_8x16 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)&src[(i + 1) * sstride]));
+ src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20);
+
+ // r15 r14 r13 - - - r1 r0 - 16 bit
+ sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16));
+
+ // s7 s6 s5 s4 s3 s2 s1 s0 - 32bit
+ src_16x16 = _mm256_madd_epi16(sub_result, sub_result);
+
+ // accumulation of result
+ square_result = _mm256_add_epi32(square_result, src_16x16);
+ }
+
+ // s5 s4 s1 s0 - 64bit
+ res0_4x64 = _mm256_unpacklo_epi32(square_result, zeros);
+ // s7 s6 s3 s2 - 64bit
+ res1_4x64 = _mm256_unpackhi_epi32(square_result, zeros);
+ // r3 r2 r1 r0 - 64bit
+ res0_4x64 = _mm256_add_epi64(res0_4x64, res1_4x64);
+ // r1+r3 r2+r0 - 64bit
+ const __m128i sum_1x64 =
+ _mm_add_epi64(_mm256_castsi256_si128(res0_4x64),
+ _mm256_extracti128_si256(res0_4x64, 1));
+ xx_storel_64(&sum, _mm_add_epi64(sum_1x64, _mm_srli_si128(sum_1x64, 8)));
+ return sum;
+}
+
+// Compute the mse of two consecutive 8x8 blocks.
+// In the src buffer, each 8x8 block of a 64x64 filter block is stored
+// sequentially, hence src_blk_stride is the same as the block width. The dst
+// buffer, on the other hand, is a frame buffer, so dstride is a frame-level
+// stride.
+uint64_t aom_mse_8xh_dual_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
+ int src_blk_stride, int h) {
+ uint64_t sum = 0;
+ __m128i dst0_16x8, dst1_16x8;
+ __m256i dst0_16x16, dst1_16x16;
+ __m256i res0_4x64, res1_4x64;
+ __m256i sub_result_0, sub_result_1;
+ const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+ __m256i square_result = zeros;
+ uint16_t *src_temp = src;
+
+ for (int i = 0; i < h; i += 2) {
+ dst0_16x8 = _mm_loadu_si128((__m128i *)(&dst[(i + 0) * dstride]));
+ dst1_16x8 = _mm_loadu_si128((__m128i *)(&dst[(i + 1) * dstride]));
+
+ // row0 of 1st and 2nd 8x8 block - d00 d10
+ dst0_16x16 = _mm256_cvtepu8_epi16(dst0_16x8);
+ // row1 of 1st and 2nd 8x8 block - d01 d11
+ dst1_16x16 = _mm256_cvtepu8_epi16(dst1_16x8);
+
+ // 2 rows of 1st 8x8 block - r00 r01
+ __m256i src0_16x16 = _mm256_loadu_si256((__m256i const *)(&src_temp[0]));
+ // 2 rows of 2nd 8x8 block - r10 r11
+ __m256i src1_16x16 =
+ _mm256_loadu_si256((__m256i const *)(&src_temp[src_blk_stride]));
+ // r00 r10 - 128bit
+ __m256i tmp0_16x16 =
+ _mm256_permute2f128_si256(src0_16x16, src1_16x16, 0x20);
+ // r01 r11 - 128bit
+ __m256i tmp1_16x16 =
+ _mm256_permute2f128_si256(src0_16x16, src1_16x16, 0x31);
+
+ // r15 r14 r13------------r1 r0 - 16 bit
+ sub_result_0 = _mm256_abs_epi16(_mm256_sub_epi16(tmp0_16x16, dst0_16x16));
+ sub_result_1 = _mm256_abs_epi16(_mm256_sub_epi16(tmp1_16x16, dst1_16x16));
+
+ // s7 s6 s5 s4 s3 s2 s1 s0 - 32bit each
+ src0_16x16 = _mm256_madd_epi16(sub_result_0, sub_result_0);
+ src1_16x16 = _mm256_madd_epi16(sub_result_1, sub_result_1);
+
+ // accumulation of result
+ src0_16x16 = _mm256_add_epi32(src0_16x16, src1_16x16);
+ square_result = _mm256_add_epi32(square_result, src0_16x16);
+ src_temp += 16;
+ }
+
+ // s5 s4 s1 s0 - 64bit
+ res0_4x64 = _mm256_unpacklo_epi32(square_result, zeros);
+ // s7 s6 s3 s2 - 64bit
+ res1_4x64 = _mm256_unpackhi_epi32(square_result, zeros);
+ // r3 r2 r1 r0 - 64bit
+ res0_4x64 = _mm256_add_epi64(res0_4x64, res1_4x64);
+ // r1+r3 r2+r0 - 64bit
+ const __m128i sum_1x64 =
+ _mm_add_epi64(_mm256_castsi256_si128(res0_4x64),
+ _mm256_extracti128_si256(res0_4x64, 1));
+ xx_storel_64(&sum, _mm_add_epi64(sum_1x64, _mm_srli_si128(sum_1x64, 8)));
+ return sum;
+}
+
+uint64_t aom_mse_wxh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
+ int sstride, int w, int h) {
+ assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
+ "w=8/4 and h=8/4 must be satisfied");
+ switch (w) {
+ case 4: return aom_mse_4xh_16bit_avx2(dst, dstride, src, sstride, h);
+ case 8: return aom_mse_8xh_16bit_avx2(dst, dstride, src, sstride, h);
+ default: assert(0 && "unsupported width"); return -1;
+ }
+}
+
+// Computes the mse of two consecutive 8x8 blocks or four consecutive 4x4
+// blocks; the luma plane uses 8x8 blocks and chroma uses 4x4 blocks. In the
+// src buffer, each block of a filter block is stored sequentially, hence
+// src_blk_stride is the same as the block width. The dst buffer, on the other
+// hand, is a frame buffer, so dstride is a frame-level stride.
+uint64_t aom_mse_16xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
+ int w, int h) {
+ assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
+ "w=8/4 and h=8/4 must be satisfied");
+ switch (w) {
+ case 4: return aom_mse_4xh_quad_16bit_avx2(dst, dstride, src, w * h, h);
+ case 8: return aom_mse_8xh_dual_16bit_avx2(dst, dstride, src, w * h, h);
+ default: assert(0 && "unsupported width"); return -1;
+ }
+}
+
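+// Accumulate sum and SSE over one 32-pixel row: maddubs against the (1, -1)
+// byte pattern forms src - ref differences, which feed both the running sum
+// and, after madd squaring, the running SSE.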
+static INLINE void calc_sum_sse_wd32_avx2(const uint8_t *src,
+ const uint8_t *ref,
+ __m256i set_one_minusone,
+ __m256i sse_8x16[2],
+ __m256i sum_8x16[2]) {
+ const __m256i s00_256 = _mm256_loadu_si256((__m256i const *)(src));
+ const __m256i r00_256 = _mm256_loadu_si256((__m256i const *)(ref));
+
+ const __m256i u_low_256 = _mm256_unpacklo_epi8(s00_256, r00_256);
+ const __m256i u_high_256 = _mm256_unpackhi_epi8(s00_256, r00_256);
+
+ const __m256i diff0 = _mm256_maddubs_epi16(u_low_256, set_one_minusone);
+ const __m256i diff1 = _mm256_maddubs_epi16(u_high_256, set_one_minusone);
+
+ sse_8x16[0] = _mm256_add_epi32(sse_8x16[0], _mm256_madd_epi16(diff0, diff0));
+ sse_8x16[1] = _mm256_add_epi32(sse_8x16[1], _mm256_madd_epi16(diff1, diff1));
+ sum_8x16[0] = _mm256_add_epi16(sum_8x16[0], diff0);
+ sum_8x16[1] = _mm256_add_epi16(sum_8x16[1], diff1);
+}
+
+static INLINE __m256i calc_sum_sse_order(__m256i *sse_hx16, __m256i *sum_hx16,
+ unsigned int *tot_sse, int *tot_sum) {
+ // s00 s01 s10 s11 s20 s21 s30 s31
+ const __m256i sse_results = _mm256_hadd_epi32(sse_hx16[0], sse_hx16[1]);
+ // d00 d01 d02 d03 | d10 d11 d12 d13 | d20 d21 d22 d23 | d30 d31 d32 d33
+ const __m256i sum_result_r0 = _mm256_hadd_epi16(sum_hx16[0], sum_hx16[1]);
+ // d00 d01 d10 d11 | d00 d01 d10 d11 | d20 d21 d30 d31 | d20 d21 d30 d31
+ const __m256i sum_result_1 = _mm256_hadd_epi16(sum_result_r0, sum_result_r0);
+ // d00 d01 d10 d11 d20 d21 d30 d31 | X
+ const __m256i sum_result_3 = _mm256_permute4x64_epi64(sum_result_1, 0x08);
+ // d00 d01 d10 d11 d20 d21 d30 d31
+ const __m256i sum_results =
+ _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum_result_3));
+
+ // Add sum & sse registers appropriately to get total sum & sse separately.
+ // s0 s1 d0 d1 s2 s3 d2 d3
+ const __m256i sum_sse_add = _mm256_hadd_epi32(sse_results, sum_results);
+ // s0 s1 s2 s3 d0 d1 d2 d3
+ const __m256i sum_sse_order_add = _mm256_permute4x64_epi64(sum_sse_add, 0xd8);
+ // s0+s1 s2+s3 s0+s1 s2+s3 d0+d1 d2+d3 d0+d1 d2+d3
+ const __m256i sum_sse_order_add_1 =
+ _mm256_hadd_epi32(sum_sse_order_add, sum_sse_order_add);
+ // s0 x x x | d0 x x x
+ const __m256i sum_sse_order_add_final =
+ _mm256_hadd_epi32(sum_sse_order_add_1, sum_sse_order_add_1);
+ // s0
+ const uint32_t first_value =
+ (uint32_t)_mm256_extract_epi32(sum_sse_order_add_final, 0);
+ *tot_sse += first_value;
+ // d0
+ const int second_value = _mm256_extract_epi32(sum_sse_order_add_final, 4);
+ *tot_sum += second_value;
+ return sum_sse_order_add;
+}
+
+static INLINE void get_var_sse_sum_8x8_quad_avx2(
+ const uint8_t *src, int src_stride, const uint8_t *ref,
+ const int ref_stride, const int h, uint32_t *sse8x8, int *sum8x8,
+ unsigned int *tot_sse, int *tot_sum, uint32_t *var8x8) {
+ assert(h <= 128); // May overflow for larger height.
+ __m256i sse_8x16[2], sum_8x16[2];
+ sum_8x16[0] = _mm256_setzero_si256();
+ sse_8x16[0] = _mm256_setzero_si256();
+ sum_8x16[1] = sum_8x16[0];
+ sse_8x16[1] = sse_8x16[0];
+ const __m256i set_one_minusone = _mm256_set1_epi16((short)0xff01);
+
+ for (int i = 0; i < h; i++) {
+ // Process one row of the 8x32 block.
+ calc_sum_sse_wd32_avx2(src, ref, set_one_minusone, sse_8x16, sum_8x16);
+ src += src_stride;
+ ref += ref_stride;
+ }
+
+ const __m256i sum_sse_order_add =
+ calc_sum_sse_order(sse_8x16, sum_8x16, tot_sse, tot_sum);
+
+ // s0 s1 s2 s3
+ _mm_storeu_si128((__m128i *)sse8x8,
+ _mm256_castsi256_si128(sum_sse_order_add));
+ // d0 d1 d2 d3
+ const __m128i sum_temp8x8 = _mm256_extractf128_si256(sum_sse_order_add, 1);
+ _mm_storeu_si128((__m128i *)sum8x8, sum_temp8x8);
+
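+  // Each 8x8 block covers 64 pixels, so the block variance computed below is
+  // sse - sum * sum / 64, i.e. sse - ((sum * sum) >> 6).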
+ // (d0xd0 >> 6)=f0 (d1xd1 >> 6)=f1 (d2xd2 >> 6)=f2 (d3xd3 >> 6)=f3
+ const __m128i mull_results =
+ _mm_srli_epi32(_mm_mullo_epi32(sum_temp8x8, sum_temp8x8), 6);
+ // s0-f0=v0 s1-f1=v1 s2-f2=v2 s3-f3=v3
+ const __m128i variance_8x8 =
+ _mm_sub_epi32(_mm256_castsi256_si128(sum_sse_order_add), mull_results);
+ // v0 v1 v2 v3
+ _mm_storeu_si128((__m128i *)var8x8, variance_8x8);
+}
+
+static INLINE void get_var_sse_sum_16x16_dual_avx2(
+ const uint8_t *src, int src_stride, const uint8_t *ref,
+ const int ref_stride, const int h, uint32_t *sse16x16,
+ unsigned int *tot_sse, int *tot_sum, uint32_t *var16x16) {
+ assert(h <= 128); // May overflow for larger height.
+ __m256i sse_16x16[2], sum_16x16[2];
+ sum_16x16[0] = _mm256_setzero_si256();
+ sse_16x16[0] = _mm256_setzero_si256();
+ sum_16x16[1] = sum_16x16[0];
+ sse_16x16[1] = sse_16x16[0];
+ const __m256i set_one_minusone = _mm256_set1_epi16((short)0xff01);
+
+ for (int i = 0; i < h; i++) {
+    // Process one row (32 pixels) of the 16x32 region.
+ calc_sum_sse_wd32_avx2(src, ref, set_one_minusone, sse_16x16, sum_16x16);
+ src += src_stride;
+ ref += ref_stride;
+ }
+
+ const __m256i sum_sse_order_add =
+ calc_sum_sse_order(sse_16x16, sum_16x16, tot_sse, tot_sum);
+
+ const __m256i sum_sse_order_add_1 =
+ _mm256_hadd_epi32(sum_sse_order_add, sum_sse_order_add);
+
+ // s0+s1 s2+s3 x x
+ _mm_storel_epi64((__m128i *)sse16x16,
+ _mm256_castsi256_si128(sum_sse_order_add_1));
+
+ // d0+d1 d2+d3 x x
+ const __m128i sum_temp16x16 =
+ _mm256_extractf128_si256(sum_sse_order_add_1, 1);
+
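+  // Each 16x16 block covers 256 pixels, so the block variance computed below
+  // is sse - sum * sum / 256, i.e. sse - ((sum * sum) >> 8).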
+  // ((d0+d1)x(d0+d1) >> 8)=f0 ((d2+d3)x(d2+d3) >> 8)=f1
+ const __m128i mull_results =
+ _mm_srli_epi32(_mm_mullo_epi32(sum_temp16x16, sum_temp16x16), 8);
+
+  // (s0+s1)-f0=v0 (s2+s3)-f1=v1
+ const __m128i variance_16x16 =
+ _mm_sub_epi32(_mm256_castsi256_si128(sum_sse_order_add_1), mull_results);
+
+  // v0 v1
+ _mm_storel_epi64((__m128i *)var16x16, variance_16x16);
+}
+
+void aom_get_var_sse_sum_8x8_quad_avx2(const uint8_t *src_ptr,
+ int source_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ uint32_t *sse8x8, int *sum8x8,
+ unsigned int *tot_sse, int *tot_sum,
+ uint32_t *var8x8) {
+ get_var_sse_sum_8x8_quad_avx2(src_ptr, source_stride, ref_ptr, ref_stride, 8,
+ sse8x8, sum8x8, tot_sse, tot_sum, var8x8);
+}
+
+void aom_get_var_sse_sum_16x16_dual_avx2(const uint8_t *src_ptr,
+ int source_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ uint32_t *sse16x16,
+ unsigned int *tot_sse, int *tot_sum,
+ uint32_t *var16x16) {
+ get_var_sse_sum_16x16_dual_avx2(src_ptr, source_stride, ref_ptr, ref_stride,
+ 16, sse16x16, tot_sse, tot_sum, var16x16);
+}
diff --git a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c
new file mode 100644
index 0000000000..9e9e70ea01
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c
@@ -0,0 +1,924 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> // AVX2
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+
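+// Each 32-byte row of this table holds the bilinear filter taps
+// {16 - 2 * k, 2 * k} repeated, one row per eighth-pel offset k; callers index
+// the table with (offset << 5).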
+/* clang-format off */
+DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
+ 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
+ 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
+ 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
+ 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
+ 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+ 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+ 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+ 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
+ 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
+ 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
+ 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
+ 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+ 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+};
+/* clang-format on */
+
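+// FILTER_SRC computes (p0 * f0 + p1 * f1 + 8) >> 4 for each interleaved pixel
+// pair, i.e. a bilinear blend rounded back to the 8-bit pixel range.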
+#define FILTER_SRC(filter) \
+ /* filter the source */ \
+ exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
+ exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
+ \
+ /* add 8 to source */ \
+ exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
+ exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
+ \
+ /* divide source by 16 */ \
+ exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
+ exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+#define MERGE_WITH_SRC(src_reg, reg) \
+ exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
+ exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
+
+#define LOAD_SRC_DST \
+ /* load source and destination */ \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ dst_reg = _mm256_loadu_si256((__m256i const *)(dst));
+
+#define AVG_NEXT_SRC(src_reg, size_stride) \
+ src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
+ /* average between current and next stride source */ \
+ src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+
+#define MERGE_NEXT_SRC(src_reg, size_stride) \
+ src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
+ MERGE_WITH_SRC(src_reg, src_next_reg)
+
+#define CALC_SUM_SSE_INSIDE_LOOP \
+ /* expand each byte to 2 bytes */ \
+ exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
+ exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
+ /* source - dest */ \
+ exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
+ exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
+  /* calculate sum */                                       \
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
+ exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
+ exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
+ /* calculate sse */ \
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+// Final reduction to the scalar sum and sse: sign-extend the 16-bit sums to
+// 32 bits, then fold both 256-bit accumulators down and write *sse and sum.
+#define CALC_SUM_AND_SSE \
+ res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
+ sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
+ sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
+ sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
+ sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+ sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
+ \
+ sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
+ sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
+ \
+ sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+ sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+ *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
+ _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
+ sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
+ sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+ sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
+ _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
+
+// Macros used by the width-16 sub-pixel variance functions.
+#define LOAD_SRC_DST_INSERT(src_stride, dst_stride) \
+  /* load source and destination of 2 rows and insert */                  \
+ src_reg = _mm256_inserti128_si256( \
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))), \
+ _mm_loadu_si128((__m128i *)(src + src_stride)), 1); \
+ dst_reg = _mm256_inserti128_si256( \
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))), \
+ _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1);
+
+#define AVG_NEXT_SRC_INSERT(src_reg, size_stride) \
+ src_next_reg = _mm256_inserti128_si256( \
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \
+ _mm_loadu_si128((__m128i *)(src + (size_stride << 1))), 1); \
+ /* average between current and next stride source */ \
+ src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+
+#define MERGE_NEXT_SRC_INSERT(src_reg, size_stride) \
+ src_next_reg = _mm256_inserti128_si256( \
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \
+ _mm_loadu_si128((__m128i *)(src + (src_stride + size_stride))), 1); \
+ MERGE_WITH_SRC(src_reg, src_next_reg)
+
+#define LOAD_SRC_NEXT_BYTE_INSERT \
+ /* load source and another source from next row */ \
+ src_reg = _mm256_inserti128_si256( \
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))), \
+ _mm_loadu_si128((__m128i *)(src + src_stride)), 1); \
+ /* load source and next row source from 1 byte onwards */ \
+ src_next_reg = _mm256_inserti128_si256( \
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + 1))), \
+ _mm_loadu_si128((__m128i *)(src + src_stride + 1)), 1);
+
+#define LOAD_DST_INSERT \
+ dst_reg = _mm256_inserti128_si256( \
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))), \
+ _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1);
+
+#define LOAD_SRC_MERGE_128BIT(filter) \
+ __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); \
+ __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); \
+ __m128i src_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1); \
+ __m128i src_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1); \
+ __m128i filter_128bit = _mm256_castsi256_si128(filter); \
+ __m128i pw8_128bit = _mm256_castsi256_si128(pw8);
+
+#define FILTER_SRC_128BIT(filter) \
+ /* filter the source */ \
+ src_lo = _mm_maddubs_epi16(src_lo, filter); \
+ src_hi = _mm_maddubs_epi16(src_hi, filter); \
+ \
+ /* add 8 to source */ \
+ src_lo = _mm_add_epi16(src_lo, pw8_128bit); \
+ src_hi = _mm_add_epi16(src_hi, pw8_128bit); \
+ \
+ /* divide source by 16 */ \
+ src_lo = _mm_srai_epi16(src_lo, 4); \
+ src_hi = _mm_srai_epi16(src_hi, 4);
+
+// TODO(chiyotsai@google.com): These variance functions are macro-fied so we
+// don't have to manually optimize the individual for-loops. We could save some
+// binary size by optimizing the loops more carefully without duplicating the
+// code with a macro.
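+
+// The x_offset/y_offset arguments are eighth-pel offsets in [0, 7]: 0 needs no
+// filtering, 4 is the halfway point handled with a simple average of adjacent
+// samples, and any other value selects a row of the bilinear filter table
+// above.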
+#define MAKE_SUB_PIXEL_VAR_32XH(height, log2height) \
+ static AOM_INLINE int aom_sub_pixel_variance32x##height##_imp_avx2( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse) { \
+ __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; \
+ __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; \
+ __m256i zero_reg; \
+ int i, sum; \
+ sum_reg = _mm256_setzero_si256(); \
+ sse_reg = _mm256_setzero_si256(); \
+ zero_reg = _mm256_setzero_si256(); \
+ \
+ /* x_offset = 0 and y_offset = 0 */ \
+ if (x_offset == 0) { \
+ if (y_offset == 0) { \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+          /* expand each byte to 2 bytes */                                  \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = 0 and y_offset = 4 */ \
+ } else if (y_offset == 4) { \
+ __m256i src_next_reg; \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ AVG_NEXT_SRC(src_reg, src_stride) \
+          /* expand each byte to 2 bytes */                                  \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = 0 and y_offset = bilin interpolation */ \
+ } else { \
+ __m256i filter, pw8, src_next_reg; \
+ \
+ y_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ MERGE_NEXT_SRC(src_reg, src_stride) \
+ FILTER_SRC(filter) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ } \
+ /* x_offset = 4 and y_offset = 0 */ \
+ } else if (x_offset == 4) { \
+ if (y_offset == 0) { \
+ __m256i src_next_reg; \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ AVG_NEXT_SRC(src_reg, 1) \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = 4 and y_offset = 4 */ \
+ } else if (y_offset == 4) { \
+ __m256i src_next_reg, src_avg; \
+        /* load the source and a second source */                            \
+        /* starting one byte ahead */                                        \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ AVG_NEXT_SRC(src_reg, 1) \
+ for (i = 0; i < height; i++) { \
+ src_avg = src_reg; \
+ src += src_stride; \
+ LOAD_SRC_DST \
+ AVG_NEXT_SRC(src_reg, 1) \
+          /* average the previous and current source averages */             \
+ src_avg = _mm256_avg_epu8(src_avg, src_reg); \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_avg, zero_reg) \
+ /* save current source average */ \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride; \
+ } \
+ /* x_offset = 4 and y_offset = bilin interpolation */ \
+ } else { \
+ __m256i filter, pw8, src_next_reg, src_avg; \
+ y_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+        /* load the source and a second source */                            \
+        /* starting one byte ahead */                                        \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ AVG_NEXT_SRC(src_reg, 1) \
+ for (i = 0; i < height; i++) { \
+ /* save current source average */ \
+ src_avg = src_reg; \
+ src += src_stride; \
+ LOAD_SRC_DST \
+ AVG_NEXT_SRC(src_reg, 1) \
+ MERGE_WITH_SRC(src_avg, src_reg) \
+ FILTER_SRC(filter) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride; \
+ } \
+ } \
+ /* x_offset = bilin interpolation and y_offset = 0 */ \
+ } else { \
+ if (y_offset == 0) { \
+ __m256i filter, pw8, src_next_reg; \
+ x_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ FILTER_SRC(filter) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = bilin interpolation and y_offset = 4 */ \
+ } else if (y_offset == 4) { \
+ __m256i filter, pw8, src_next_reg, src_pack; \
+ x_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ FILTER_SRC(filter) \
+        /* pack the filtered 16-bit values back to 8 bits in each lane */    \
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ for (i = 0; i < height; i++) { \
+ src += src_stride; \
+ LOAD_SRC_DST \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ FILTER_SRC(filter) \
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+          /* average the previous pack with the current one */               \
+ src_pack = _mm256_avg_epu8(src_pack, src_reg); \
+ MERGE_WITH_SRC(src_pack, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src_pack = src_reg; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = bilin interpolation and y_offset = bilin interpolation \
+ */ \
+ } else { \
+ __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; \
+ x_offset <<= 5; \
+ xfilter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ y_offset <<= 5; \
+ yfilter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+        /* load the source and a second source */                            \
+        /* starting one byte ahead */                                        \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ \
+ FILTER_SRC(xfilter) \
+        /* pack the filtered 16-bit values back to 8 bits in each lane */    \
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ for (i = 0; i < height; i++) { \
+ src += src_stride; \
+ LOAD_SRC_DST \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ FILTER_SRC(xfilter) \
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+          /* merge the previous pack with the current pack */                \
+ MERGE_WITH_SRC(src_pack, src_reg) \
+ /* filter the source */ \
+ FILTER_SRC(yfilter) \
+ src_pack = src_reg; \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride; \
+ } \
+ } \
+ } \
+ CALC_SUM_AND_SSE \
+ _mm256_zeroupper(); \
+ return sum; \
+ } \
+ unsigned int aom_sub_pixel_variance32x##height##_avx2( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse) { \
+ const int sum = aom_sub_pixel_variance32x##height##_imp_avx2( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, sse); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> (5 + log2height)); \
+ }
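+
+// For a 32 x height block the variance is sse - sum^2 / (32 * height); the
+// pixel count is a power of two, so the division is the shift by
+// (5 + log2height) above.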
+
+MAKE_SUB_PIXEL_VAR_32XH(64, 6)
+MAKE_SUB_PIXEL_VAR_32XH(32, 5)
+MAKE_SUB_PIXEL_VAR_32XH(16, 4)
+
+#define AOM_SUB_PIXEL_VAR_AVX2(w, h, wf, hf, wlog2, hlog2) \
+ unsigned int aom_sub_pixel_variance##w##x##h##_avx2( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \
+ unsigned int sse = 0; \
+ int se = 0; \
+ for (int i = 0; i < (w / wf); ++i) { \
+ const uint8_t *src_ptr = src; \
+ const uint8_t *dst_ptr = dst; \
+ for (int j = 0; j < (h / hf); ++j) { \
+ unsigned int sse2; \
+ const int se2 = aom_sub_pixel_variance##wf##x##hf##_imp_avx2( \
+ src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
+ &sse2); \
+ dst_ptr += hf * dst_stride; \
+ src_ptr += hf * src_stride; \
+ se += se2; \
+ sse += sse2; \
+ } \
+ src += wf; \
+ dst += wf; \
+ } \
+ *sse_ptr = sse; \
+ return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \
+ }
+
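+// Each wrapper below tiles its w x h block into wf x hf pieces handled by the
+// helper above, accumulates the per-tile sum and sse, and applies the variance
+// formula once at the end.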
+// Note: hf = AOMMIN(h, 64) to avoid overflow in helper by capping height.
+AOM_SUB_PIXEL_VAR_AVX2(128, 128, 32, 64, 7, 7)
+AOM_SUB_PIXEL_VAR_AVX2(128, 64, 32, 64, 7, 6)
+AOM_SUB_PIXEL_VAR_AVX2(64, 128, 32, 64, 6, 7)
+AOM_SUB_PIXEL_VAR_AVX2(64, 64, 32, 64, 6, 6)
+AOM_SUB_PIXEL_VAR_AVX2(64, 32, 32, 32, 6, 5)
+
+#define MAKE_SUB_PIXEL_VAR_16XH(height, log2height) \
+ unsigned int aom_sub_pixel_variance16x##height##_avx2( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse) { \
+ __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; \
+ __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; \
+ __m256i zero_reg; \
+ int i, sum; \
+ sum_reg = _mm256_setzero_si256(); \
+ sse_reg = _mm256_setzero_si256(); \
+ zero_reg = _mm256_setzero_si256(); \
+ \
+ /* x_offset = 0 and y_offset = 0 */ \
+ if (x_offset == 0) { \
+ if (y_offset == 0) { \
+ for (i = 0; i < height; i += 2) { \
+ LOAD_SRC_DST_INSERT(src_stride, dst_stride) \
+          /* expand each byte to 2 bytes */                                  \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += (src_stride << 1); \
+ dst += (dst_stride << 1); \
+ } \
+ /* x_offset = 0 and y_offset = 4 */ \
+ } else if (y_offset == 4) { \
+ __m256i src_next_reg; \
+ for (i = 0; i < height; i += 2) { \
+ LOAD_SRC_DST_INSERT(src_stride, dst_stride) \
+ AVG_NEXT_SRC_INSERT(src_reg, src_stride) \
+          /* expand each byte to 2 bytes */                                  \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += (src_stride << 1); \
+ dst += (dst_stride << 1); \
+ } \
+ /* x_offset = 0 and y_offset = bilin interpolation */ \
+ } else { \
+ __m256i filter, pw8, src_next_reg; \
+ y_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ for (i = 0; i < height; i += 2) { \
+ LOAD_SRC_DST_INSERT(src_stride, dst_stride) \
+ MERGE_NEXT_SRC_INSERT(src_reg, src_stride) \
+ FILTER_SRC(filter) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += (src_stride << 1); \
+ dst += (dst_stride << 1); \
+ } \
+ } \
+ /* x_offset = 4 and y_offset = 0 */ \
+ } else if (x_offset == 4) { \
+ if (y_offset == 0) { \
+ __m256i src_next_reg; \
+ for (i = 0; i < height; i += 2) { \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ LOAD_DST_INSERT \
+ /* average between current and next stride source */ \
+ src_reg = _mm256_avg_epu8(src_reg, src_next_reg); \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += (src_stride << 1); \
+ dst += (dst_stride << 1); \
+ } \
+ /* x_offset = 4 and y_offset = 4 */ \
+ } else if (y_offset == 4) { \
+ __m256i src_next_reg, src_avg, src_temp; \
+ /* load and insert source and next row source */ \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ src_avg = _mm256_avg_epu8(src_reg, src_next_reg); \
+ src += src_stride << 1; \
+ for (i = 0; i < height - 2; i += 2) { \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg); \
+ src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21); \
+ src_temp = _mm256_avg_epu8(src_avg, src_temp); \
+ LOAD_DST_INSERT \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_temp, zero_reg) \
+ /* save current source average */ \
+ src_avg = src_next_reg; \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride << 1; \
+ src += src_stride << 1; \
+ } \
+ /* last 2 rows processing happens here */ \
+ __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); \
+ __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); \
+ src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1); \
+ src_next_reg = _mm256_permute2x128_si256( \
+ src_avg, _mm256_castsi128_si256(src_reg_0), 0x21); \
+ LOAD_DST_INSERT \
+ src_avg = _mm256_avg_epu8(src_avg, src_next_reg); \
+ MERGE_WITH_SRC(src_avg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ } else { \
+ /* x_offset = 4 and y_offset = bilin interpolation */ \
+ __m256i filter, pw8, src_next_reg, src_avg, src_temp; \
+ y_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ /* load and insert source and next row source */ \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ src_avg = _mm256_avg_epu8(src_reg, src_next_reg); \
+ src += src_stride << 1; \
+ for (i = 0; i < height - 2; i += 2) { \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg); \
+ src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21); \
+ LOAD_DST_INSERT \
+ MERGE_WITH_SRC(src_avg, src_temp) \
+ /* save current source average */ \
+ src_avg = src_next_reg; \
+ FILTER_SRC(filter) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride << 1; \
+ src += src_stride << 1; \
+ } \
+ /* last 2 rows processing happens here */ \
+ __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); \
+ __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); \
+ src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1); \
+ src_next_reg = _mm256_permute2x128_si256( \
+ src_avg, _mm256_castsi128_si256(src_reg_0), 0x21); \
+ LOAD_DST_INSERT \
+ MERGE_WITH_SRC(src_avg, src_next_reg) \
+ FILTER_SRC(filter) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ } \
+ /* x_offset = bilin interpolation and y_offset = 0 */ \
+ } else { \
+ if (y_offset == 0) { \
+ __m256i filter, pw8, src_next_reg; \
+ x_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ for (i = 0; i < height; i += 2) { \
+ LOAD_SRC_DST_INSERT(src_stride, dst_stride) \
+ MERGE_NEXT_SRC_INSERT(src_reg, 1) \
+ FILTER_SRC(filter) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += (src_stride << 1); \
+ dst += (dst_stride << 1); \
+ } \
+ /* x_offset = bilin interpolation and y_offset = 4 */ \
+ } else if (y_offset == 4) { \
+ __m256i filter, pw8, src_next_reg, src_pack; \
+ x_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ /* load and insert source and next row source */ \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ MERGE_WITH_SRC(src_reg, src_next_reg) \
+ FILTER_SRC(filter) \
+        /* pack the filtered 16-bit values back to 8 bits in each lane */    \
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ src += src_stride << 1; \
+ for (i = 0; i < height - 2; i += 2) { \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ LOAD_DST_INSERT \
+ MERGE_WITH_SRC(src_reg, src_next_reg) \
+ FILTER_SRC(filter) \
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21); \
+          /* average the previous pack with the current one */               \
+ src_pack = _mm256_avg_epu8(src_pack, src_next_reg); \
+ MERGE_WITH_SRC(src_pack, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src_pack = src_reg; \
+ src += src_stride << 1; \
+ dst += dst_stride << 1; \
+ } \
+ /* last 2 rows processing happens here */ \
+ LOAD_SRC_MERGE_128BIT(filter) \
+ LOAD_DST_INSERT \
+ FILTER_SRC_128BIT(filter_128bit) \
+ src_reg_0 = _mm_packus_epi16(src_lo, src_hi); \
+ src_next_reg = _mm256_permute2x128_si256( \
+ src_pack, _mm256_castsi128_si256(src_reg_0), 0x21); \
+        /* average the previous pack with the current one */                 \
+ src_pack = _mm256_avg_epu8(src_pack, src_next_reg); \
+ MERGE_WITH_SRC(src_pack, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ } else { \
+ /* x_offset = bilin interpolation and y_offset = bilin interpolation \
+ */ \
+ __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; \
+ x_offset <<= 5; \
+ xfilter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ y_offset <<= 5; \
+ yfilter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ /* load and insert source and next row source */ \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ MERGE_WITH_SRC(src_reg, src_next_reg) \
+ FILTER_SRC(xfilter) \
+        /* pack the filtered 16-bit values back to 8 bits in each lane */    \
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ src += src_stride << 1; \
+ for (i = 0; i < height - 2; i += 2) { \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ LOAD_DST_INSERT \
+ MERGE_WITH_SRC(src_reg, src_next_reg) \
+ FILTER_SRC(xfilter) \
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21); \
+          /* merge the previous pack with the current for the y filter */    \
+ MERGE_WITH_SRC(src_pack, src_next_reg) \
+ /* filter the source */ \
+ FILTER_SRC(yfilter) \
+ src_pack = src_reg; \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride << 1; \
+ dst += dst_stride << 1; \
+ } \
+ /* last 2 rows processing happens here */ \
+ LOAD_SRC_MERGE_128BIT(xfilter) \
+ LOAD_DST_INSERT \
+ FILTER_SRC_128BIT(filter_128bit) \
+ src_reg_0 = _mm_packus_epi16(src_lo, src_hi); \
+ src_next_reg = _mm256_permute2x128_si256( \
+ src_pack, _mm256_castsi128_si256(src_reg_0), 0x21); \
+ MERGE_WITH_SRC(src_pack, src_next_reg) \
+ FILTER_SRC(yfilter) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ } \
+ } \
+ CALC_SUM_AND_SSE \
+ _mm256_zeroupper(); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> (4 + log2height)); \
+ }
+
+MAKE_SUB_PIXEL_VAR_16XH(32, 5)
+MAKE_SUB_PIXEL_VAR_16XH(16, 4)
+MAKE_SUB_PIXEL_VAR_16XH(8, 3)
+#if !CONFIG_REALTIME_ONLY
+MAKE_SUB_PIXEL_VAR_16XH(64, 6)
+MAKE_SUB_PIXEL_VAR_16XH(4, 2)
+#endif
+
+#define MAKE_SUB_PIXEL_AVG_VAR_32XH(height, log2height) \
+ int aom_sub_pixel_avg_variance32x##height##_imp_avx2( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, \
+ unsigned int *sse) { \
+ __m256i sec_reg; \
+ __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; \
+ __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; \
+ __m256i zero_reg; \
+ int i, sum; \
+ sum_reg = _mm256_setzero_si256(); \
+ sse_reg = _mm256_setzero_si256(); \
+ zero_reg = _mm256_setzero_si256(); \
+ \
+ /* x_offset = 0 and y_offset = 0 */ \
+ if (x_offset == 0) { \
+ if (y_offset == 0) { \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg); \
+ sec += sec_stride; \
+          /* expand each byte to 2 bytes */                                  \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ } else if (y_offset == 4) { \
+ __m256i src_next_reg; \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ AVG_NEXT_SRC(src_reg, src_stride) \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg); \
+ sec += sec_stride; \
+          /* expand each byte to 2 bytes */                                  \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = 0 and y_offset = bilin interpolation */ \
+ } else { \
+ __m256i filter, pw8, src_next_reg; \
+ \
+ y_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ MERGE_NEXT_SRC(src_reg, src_stride) \
+ FILTER_SRC(filter) \
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg); \
+ sec += sec_stride; \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ } \
+ /* x_offset = 4 and y_offset = 0 */ \
+ } else if (x_offset == 4) { \
+ if (y_offset == 0) { \
+ __m256i src_next_reg; \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ AVG_NEXT_SRC(src_reg, 1) \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg); \
+ sec += sec_stride; \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = 4 and y_offset = 4 */ \
+ } else if (y_offset == 4) { \
+ __m256i src_next_reg, src_avg; \
+        /* load the source and a second source */                            \
+        /* starting one byte ahead */                                        \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ AVG_NEXT_SRC(src_reg, 1) \
+ for (i = 0; i < height; i++) { \
+ /* save current source average */ \
+ src_avg = src_reg; \
+ src += src_stride; \
+ LOAD_SRC_DST \
+ AVG_NEXT_SRC(src_reg, 1) \
+          /* average the previous and current source averages */             \
+ src_avg = _mm256_avg_epu8(src_avg, src_reg); \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_avg = _mm256_avg_epu8(src_avg, sec_reg); \
+ sec += sec_stride; \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_avg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride; \
+ } \
+ /* x_offset = 4 and y_offset = bilin interpolation */ \
+ } else { \
+ __m256i filter, pw8, src_next_reg, src_avg; \
+ y_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+        /* load the source and a second source */                            \
+        /* starting one byte ahead */                                        \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ AVG_NEXT_SRC(src_reg, 1) \
+ for (i = 0; i < height; i++) { \
+ /* save current source average */ \
+ src_avg = src_reg; \
+ src += src_stride; \
+ LOAD_SRC_DST \
+ AVG_NEXT_SRC(src_reg, 1) \
+ MERGE_WITH_SRC(src_avg, src_reg) \
+ FILTER_SRC(filter) \
+ src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_avg = _mm256_avg_epu8(src_avg, sec_reg); \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_avg, zero_reg) \
+ sec += sec_stride; \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride; \
+ } \
+ } \
+ /* x_offset = bilin interpolation and y_offset = 0 */ \
+ } else { \
+ if (y_offset == 0) { \
+ __m256i filter, pw8, src_next_reg; \
+ x_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ FILTER_SRC(filter) \
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg); \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ sec += sec_stride; \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = bilin interpolation and y_offset = 4 */ \
+ } else if (y_offset == 4) { \
+ __m256i filter, pw8, src_next_reg, src_pack; \
+ x_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ FILTER_SRC(filter) \
+        /* pack the filtered 16-bit values back to 8 bits in each lane */    \
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ for (i = 0; i < height; i++) { \
+ src += src_stride; \
+ LOAD_SRC_DST \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ FILTER_SRC(filter) \
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+          /* average the previous pack with the current one */               \
+ src_pack = _mm256_avg_epu8(src_pack, src_reg); \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_pack = _mm256_avg_epu8(src_pack, sec_reg); \
+ sec += sec_stride; \
+ MERGE_WITH_SRC(src_pack, zero_reg) \
+ src_pack = src_reg; \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride; \
+ } \
+ /* x_offset = bilin interpolation and y_offset = bilin interpolation \
+ */ \
+ } else { \
+ __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; \
+ x_offset <<= 5; \
+ xfilter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ y_offset <<= 5; \
+ yfilter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+        /* load the source and a second source */                            \
+        /* starting one byte ahead */                                        \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ \
+ FILTER_SRC(xfilter) \
+        /* pack the filtered 16-bit values back to 8 bits in each lane */    \
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ for (i = 0; i < height; i++) { \
+ src += src_stride; \
+ LOAD_SRC_DST \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ FILTER_SRC(xfilter) \
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+          /* merge the previous pack with the current pack */                \
+ MERGE_WITH_SRC(src_pack, src_reg) \
+ /* filter the source */ \
+ FILTER_SRC(yfilter) \
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_pack = _mm256_avg_epu8(src_pack, sec_reg); \
+ MERGE_WITH_SRC(src_pack, zero_reg) \
+ src_pack = src_reg; \
+ sec += sec_stride; \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride; \
+ } \
+ } \
+ } \
+ CALC_SUM_AND_SSE \
+ _mm256_zeroupper(); \
+ return sum; \
+ } \
+ unsigned int aom_sub_pixel_avg_variance32x##height##_avx2( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse, \
+ const uint8_t *sec_ptr) { \
+ const int sum = aom_sub_pixel_avg_variance32x##height##_imp_avx2( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, sec_ptr, 32, \
+ sse); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> (5 + log2height)); \
+ }
+
+MAKE_SUB_PIXEL_AVG_VAR_32XH(64, 6)
+MAKE_SUB_PIXEL_AVG_VAR_32XH(32, 5)
+MAKE_SUB_PIXEL_AVG_VAR_32XH(16, 4)
+
+#define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, hf, wlog2, hlog2) \
+ unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \
+ const uint8_t *sec) { \
+ unsigned int sse = 0; \
+ int se = 0; \
+ for (int i = 0; i < (w / wf); ++i) { \
+ const uint8_t *src_ptr = src; \
+ const uint8_t *dst_ptr = dst; \
+ const uint8_t *sec_ptr = sec; \
+ for (int j = 0; j < (h / hf); ++j) { \
+ unsigned int sse2; \
+ const int se2 = aom_sub_pixel_avg_variance##wf##x##hf##_imp_avx2( \
+ src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
+ sec_ptr, w, &sse2); \
+ dst_ptr += hf * dst_stride; \
+ src_ptr += hf * src_stride; \
+ sec_ptr += hf * w; \
+ se += se2; \
+ sse += sse2; \
+ } \
+ src += wf; \
+ dst += wf; \
+ sec += wf; \
+ } \
+ *sse_ptr = sse; \
+ return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \
+ }
+
+// Note: hf = AOMMIN(h, 64) to avoid overflow in helper by capping height.
+AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 128, 32, 64, 7, 7)
+AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 64, 32, 64, 7, 6)
+AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 128, 32, 64, 6, 7)
+AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 64, 32, 64, 6, 6)
+AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 32, 32, 32, 6, 5)
diff --git a/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c b/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c
new file mode 100644
index 0000000000..699002195b
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+void aom_var_filter_block2d_bil_first_pass_ssse3(
+ const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
+ unsigned int pixel_step, unsigned int output_height,
+ unsigned int output_width, const uint8_t *filter) {
+ // Note: filter[0], filter[1] could be {128, 0}, where 128 will overflow
+ // in computation using _mm_maddubs_epi16.
+ // Change {128, 0} to {64, 0} and reduce FILTER_BITS by 1 to avoid overflow.
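+  // The bilinear taps passed in are all even and sum to 1 << FILTER_BITS, so
+  // halving both taps and reducing the rounding shift by one leaves the
+  // result unchanged.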
+ const int16_t round = (1 << (FILTER_BITS - 1)) >> 1;
+ const __m128i r = _mm_set1_epi16(round);
+ const int8_t f0 = (int8_t)(filter[0] >> 1);
+ const int8_t f1 = (int8_t)(filter[1] >> 1);
+ const __m128i filters = _mm_setr_epi8(f0, f1, f0, f1, f0, f1, f0, f1, f0, f1,
+ f0, f1, f0, f1, f0, f1);
+ unsigned int i, j;
+ (void)pixel_step;
+
+ if (output_width >= 8) {
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; j += 8) {
+ // load source
+ __m128i source_low = xx_loadl_64(a);
+ __m128i source_hi = xx_loadl_64(a + 1);
+
+ // unpack to:
+ // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
+ // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
+ __m128i source = _mm_unpacklo_epi8(source_low, source_hi);
+
+ // b[i] = a[i] * filter[0] + a[i + 1] * filter[1]
+ __m128i res = _mm_maddubs_epi16(source, filters);
+
+ // round
+ res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
+
+ xx_storeu_128(b, res);
+
+ a += 8;
+ b += 8;
+ }
+
+ a += src_pixels_per_line - output_width;
+ }
+ } else {
+ const __m128i shuffle_mask =
+ _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+ for (i = 0; i < output_height; ++i) {
+ // load source, only first 5 values are meaningful:
+ // { a[0], a[1], a[2], a[3], a[4], xxxx }
+ __m128i source = xx_loadl_64(a);
+
+ // shuffle, up to the first 8 are useful
+ // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
+ // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
+ __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
+
+ __m128i res = _mm_maddubs_epi16(source_shuffle, filters);
+ res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
+
+ xx_storel_64(b, res);
+
+ a += src_pixels_per_line;
+ b += output_width;
+ }
+ }
+}
+
+void aom_var_filter_block2d_bil_second_pass_ssse3(
+ const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
+ unsigned int pixel_step, unsigned int output_height,
+ unsigned int output_width, const uint8_t *filter) {
+ const int16_t round = (1 << FILTER_BITS) >> 1;
+ const __m128i r = _mm_set1_epi32(round);
+ const __m128i filters =
+ _mm_setr_epi16(filter[0], filter[1], filter[0], filter[1], filter[0],
+ filter[1], filter[0], filter[1]);
+ const __m128i shuffle_mask =
+ _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
+ const __m128i mask =
+ _mm_setr_epi8(0, 4, 8, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; j += 4) {
+ // load source as:
+ // { a[0], a[1], a[2], a[3], a[w], a[w+1], a[w+2], a[w+3] }
+ __m128i source1 = xx_loadl_64(a);
+ __m128i source2 = xx_loadl_64(a + pixel_step);
+ __m128i source = _mm_unpacklo_epi64(source1, source2);
+
+ // shuffle source to:
+ // { a[0], a[w], a[1], a[w+1], a[2], a[w+2], a[3], a[w+3] }
+ __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
+
+ // b[i] = a[i] * filter[0] + a[w + i] * filter[1]
+ __m128i res = _mm_madd_epi16(source_shuffle, filters);
+
+ // round
+ res = _mm_srai_epi32(_mm_add_epi32(res, r), FILTER_BITS);
+
+ // shuffle to get each lower 8 bit of every 32 bit
+ res = _mm_shuffle_epi8(res, mask);
+
+ xx_storel_32(b, res);
+
+ a += 4;
+ b += 4;
+ }
+
+ a += src_pixels_per_line - output_width;
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c
new file mode 100644
index 0000000000..faec9cf73d
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/variance_sse2.c
@@ -0,0 +1,802 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_ports/mem.h"
+
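+// Sums the squares of 256 int16 values (one 16x16 block), eight values per
+// load.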
+unsigned int aom_get_mb_ss_sse2(const int16_t *src) {
+ __m128i vsum = _mm_setzero_si128();
+ int i;
+
+ for (i = 0; i < 32; ++i) {
+ const __m128i v = xx_loadu_128(src);
+ vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
+ src += 8;
+ }
+
+ vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
+ return (unsigned int)_mm_cvtsi128_si32(vsum);
+}
+
+static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) {
+ const __m128i p0 = _mm_cvtsi32_si128(loadu_int32(p + 0 * stride));
+ const __m128i p1 = _mm_cvtsi32_si128(loadu_int32(p + 1 * stride));
+ return _mm_unpacklo_epi8(_mm_unpacklo_epi32(p0, p1), _mm_setzero_si128());
+}
+
+static INLINE __m128i load8_8to16_sse2(const uint8_t *const p) {
+ const __m128i p0 = _mm_loadl_epi64((const __m128i *)p);
+ return _mm_unpacklo_epi8(p0, _mm_setzero_si128());
+}
+
+static INLINE void load16_8to16_sse2(const uint8_t *const p, __m128i *out) {
+ const __m128i p0 = _mm_loadu_si128((const __m128i *)p);
+ out[0] = _mm_unpacklo_epi8(p0, _mm_setzero_si128()); // lower 8 values
+ out[1] = _mm_unpackhi_epi8(p0, _mm_setzero_si128()); // upper 8 values
+}
+
+// Accumulate the 4 32-bit values in val into a single 32-bit value
+static INLINE unsigned int add32x4_sse2(__m128i val) {
+ val = _mm_add_epi32(val, _mm_srli_si128(val, 8));
+ val = _mm_add_epi32(val, _mm_srli_si128(val, 4));
+ return (unsigned int)_mm_cvtsi128_si32(val);
+}
+
+// Accumulate the 8 16-bit values in sum into 4 32-bit values
+static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) {
+ const __m128i sum_lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16);
+ const __m128i sum_hi = _mm_srai_epi32(_mm_unpackhi_epi16(sum, sum), 16);
+ return _mm_add_epi32(sum_lo, sum_hi);
+}
+
+static INLINE void variance_kernel_sse2(const __m128i src, const __m128i ref,
+ __m128i *const sse,
+ __m128i *const sum) {
+ const __m128i diff = _mm_sub_epi16(src, ref);
+ *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff));
+ *sum = _mm_add_epi16(*sum, diff);
+}
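+
+// The sse accumulator holds 32-bit lanes while the sum accumulator stays at
+// 16 bits per lane, which is why the helpers below assert a maximum height.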
+
+// Can handle 128 pixels' diff sum (such as 8x16 or 16x8)
+// Slightly faster than variance_final_256_pel_sse2()
+// The diff sum of 128 pixels still fits in a 16-bit integer.
+static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum,
+ unsigned int *const sse,
+ int *const sum) {
+ *sse = add32x4_sse2(vsse);
+
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
+ *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+}
+
+// Can handle 256 pixels' diff sum (such as 16x16)
+static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum,
+ unsigned int *const sse,
+ int *const sum) {
+ *sse = add32x4_sse2(vsse);
+
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+ *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+ *sum += (int16_t)_mm_extract_epi16(vsum, 1);
+}
+
+// Can handle 512 pixels' diff sum (such as 16x32 or 32x16)
+static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum,
+ unsigned int *const sse,
+ int *const sum) {
+ *sse = add32x4_sse2(vsse);
+
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_unpacklo_epi16(vsum, vsum);
+ vsum = _mm_srai_epi32(vsum, 16);
+ *sum = (int)add32x4_sse2(vsum);
+}
+
+// Can handle 1024 pixels' diff sum (such as 32x32)
+static INLINE void variance_final_1024_pel_sse2(__m128i vsse, __m128i vsum,
+ unsigned int *const sse,
+ int *const sum) {
+ *sse = add32x4_sse2(vsse);
+
+ vsum = sum_to_32bit_sse2(vsum);
+ *sum = (int)add32x4_sse2(vsum);
+}
+
+static INLINE void variance4_sse2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m128i *const sse,
+ __m128i *const sum) {
+ assert(h <= 256); // May overflow for larger height.
+ *sum = _mm_setzero_si128();
+
+ for (int i = 0; i < h; i += 2) {
+ const __m128i s = load4x2_sse2(src, src_stride);
+ const __m128i r = load4x2_sse2(ref, ref_stride);
+
+ variance_kernel_sse2(s, r, sse, sum);
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ }
+}
+
+static INLINE void variance8_sse2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m128i *const sse,
+ __m128i *const sum) {
+ assert(h <= 128); // May overflow for larger height.
+ *sum = _mm_setzero_si128();
+ *sse = _mm_setzero_si128();
+ for (int i = 0; i < h; i++) {
+ const __m128i s = load8_8to16_sse2(src);
+ const __m128i r = load8_8to16_sse2(ref);
+
+ variance_kernel_sse2(s, r, sse, sum);
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+static INLINE void variance16_kernel_sse2(const uint8_t *const src,
+ const uint8_t *const ref,
+ __m128i *const sse,
+ __m128i *const sum) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i s = _mm_loadu_si128((const __m128i *)src);
+ const __m128i r = _mm_loadu_si128((const __m128i *)ref);
+ const __m128i src0 = _mm_unpacklo_epi8(s, zero);
+ const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
+ const __m128i src1 = _mm_unpackhi_epi8(s, zero);
+ const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
+
+ variance_kernel_sse2(src0, ref0, sse, sum);
+ variance_kernel_sse2(src1, ref1, sse, sum);
+}
+
+static INLINE void variance16_sse2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m128i *const sse,
+ __m128i *const sum) {
+ assert(h <= 64); // May overflow for larger height.
+ *sum = _mm_setzero_si128();
+
+ for (int i = 0; i < h; ++i) {
+ variance16_kernel_sse2(src, ref, sse, sum);
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+static INLINE void variance32_sse2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m128i *const sse,
+ __m128i *const sum) {
+ assert(h <= 32); // May overflow for larger height.
+ // Don't initialize sse here since it's an accumulation.
+ *sum = _mm_setzero_si128();
+
+ for (int i = 0; i < h; ++i) {
+ variance16_kernel_sse2(src + 0, ref + 0, sse, sum);
+ variance16_kernel_sse2(src + 16, ref + 16, sse, sum);
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+static INLINE void variance64_sse2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m128i *const sse,
+ __m128i *const sum) {
+ assert(h <= 16); // May overflow for larger height.
+ *sum = _mm_setzero_si128();
+
+ for (int i = 0; i < h; ++i) {
+ variance16_kernel_sse2(src + 0, ref + 0, sse, sum);
+ variance16_kernel_sse2(src + 16, ref + 16, sse, sum);
+ variance16_kernel_sse2(src + 32, ref + 32, sse, sum);
+ variance16_kernel_sse2(src + 48, ref + 48, sse, sum);
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+static INLINE void variance128_sse2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m128i *const sse,
+ __m128i *const sum) {
+ assert(h <= 8); // May overflow for larger height.
+ *sum = _mm_setzero_si128();
+
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ const int offset0 = j << 5;
+ const int offset1 = offset0 + 16;
+ variance16_kernel_sse2(src + offset0, ref + offset0, sse, sum);
+ variance16_kernel_sse2(src + offset1, ref + offset1, sse, sum);
+ }
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+void aom_get_var_sse_sum_8x8_quad_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ uint32_t *sse8x8, int *sum8x8,
+ unsigned int *tot_sse, int *tot_sum,
+ uint32_t *var8x8) {
+ // Loop over 4 8x8 blocks. Process one 8x32 block.
+ for (int k = 0; k < 4; k++) {
+ const uint8_t *src = src_ptr;
+ const uint8_t *ref = ref_ptr;
+ __m128i vsum = _mm_setzero_si128();
+ __m128i vsse = _mm_setzero_si128();
+ for (int i = 0; i < 8; i++) {
+ const __m128i s = load8_8to16_sse2(src + (k * 8));
+ const __m128i r = load8_8to16_sse2(ref + (k * 8));
+ const __m128i diff = _mm_sub_epi16(s, r);
+ vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff, diff));
+ vsum = _mm_add_epi16(vsum, diff);
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+ variance_final_128_pel_sse2(vsse, vsum, &sse8x8[k], &sum8x8[k]);
+ }
+
+ // Calculate variance at 8x8 level and total sse, sum of 8x32 block.
+ *tot_sse += sse8x8[0] + sse8x8[1] + sse8x8[2] + sse8x8[3];
+ *tot_sum += sum8x8[0] + sum8x8[1] + sum8x8[2] + sum8x8[3];
+ for (int i = 0; i < 4; i++)
+ var8x8[i] = sse8x8[i] - (uint32_t)(((int64_t)sum8x8[i] * sum8x8[i]) >> 6);
+}
+
+void aom_get_var_sse_sum_16x16_dual_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ uint32_t *sse16x16,
+ unsigned int *tot_sse, int *tot_sum,
+ uint32_t *var16x16) {
+ int sum16x16[2] = { 0 };
+ // Loop over 2 16x16 blocks. Process one 16x32 block.
+ for (int k = 0; k < 2; k++) {
+ const uint8_t *src = src_ptr;
+ const uint8_t *ref = ref_ptr;
+ __m128i vsum = _mm_setzero_si128();
+ __m128i vsse = _mm_setzero_si128();
+ for (int i = 0; i < 16; i++) {
+ __m128i s[2];
+ __m128i r[2];
+ load16_8to16_sse2(src + (k * 16), s);
+ load16_8to16_sse2(ref + (k * 16), r);
+ const __m128i diff0 = _mm_sub_epi16(s[0], r[0]);
+ const __m128i diff1 = _mm_sub_epi16(s[1], r[1]);
+ vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
+ vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
+ vsum = _mm_add_epi16(vsum, _mm_add_epi16(diff0, diff1));
+ src += src_stride;
+ ref += ref_stride;
+ }
+ variance_final_256_pel_sse2(vsse, vsum, &sse16x16[k], &sum16x16[k]);
+ }
+
+ // Calculate variance at 16x16 level and total sse, sum of 16x32 block.
+ *tot_sse += sse16x16[0] + sse16x16[1];
+ *tot_sum += sum16x16[0] + sum16x16[1];
+ for (int i = 0; i < 2; i++)
+ var16x16[i] =
+ sse16x16[i] - (uint32_t)(((int64_t)sum16x16[i] * sum16x16[i]) >> 8);
+}
+
+#define AOM_VAR_NO_LOOP_SSE2(bw, bh, bits, max_pixels) \
+ unsigned int aom_variance##bw##x##bh##_sse2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ __m128i vsse = _mm_setzero_si128(); \
+ __m128i vsum; \
+ int sum = 0; \
+ variance##bw##_sse2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum); \
+ variance_final_##max_pixels##_pel_sse2(vsse, vsum, sse, &sum); \
+ assert(sum <= 255 * bw * bh); \
+ assert(sum >= -255 * bw * bh); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \
+ }
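+
+// 'bits' is log2(bw * bh), and 'max_pixels' selects the final-reduction helper
+// that can handle that many pixel differences without overflow.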
+
+AOM_VAR_NO_LOOP_SSE2(4, 4, 4, 128)
+AOM_VAR_NO_LOOP_SSE2(4, 8, 5, 128)
+AOM_VAR_NO_LOOP_SSE2(4, 16, 6, 128)
+
+AOM_VAR_NO_LOOP_SSE2(8, 4, 5, 128)
+AOM_VAR_NO_LOOP_SSE2(8, 8, 6, 128)
+AOM_VAR_NO_LOOP_SSE2(8, 16, 7, 128)
+
+AOM_VAR_NO_LOOP_SSE2(16, 8, 7, 128)
+AOM_VAR_NO_LOOP_SSE2(16, 16, 8, 256)
+AOM_VAR_NO_LOOP_SSE2(16, 32, 9, 512)
+
+AOM_VAR_NO_LOOP_SSE2(32, 8, 8, 256)
+AOM_VAR_NO_LOOP_SSE2(32, 16, 9, 512)
+AOM_VAR_NO_LOOP_SSE2(32, 32, 10, 1024)
+
+#if !CONFIG_REALTIME_ONLY
+AOM_VAR_NO_LOOP_SSE2(16, 4, 6, 128)
+AOM_VAR_NO_LOOP_SSE2(8, 32, 8, 256)
+AOM_VAR_NO_LOOP_SSE2(16, 64, 10, 1024)
+#endif
+
+#define AOM_VAR_LOOP_SSE2(bw, bh, bits, uh) \
+ unsigned int aom_variance##bw##x##bh##_sse2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ __m128i vsse = _mm_setzero_si128(); \
+ __m128i vsum = _mm_setzero_si128(); \
+ for (int i = 0; i < (bh / uh); ++i) { \
+ __m128i vsum16; \
+ variance##bw##_sse2(src, src_stride, ref, ref_stride, uh, &vsse, \
+ &vsum16); \
+ vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); \
+ src += (src_stride * uh); \
+ ref += (ref_stride * uh); \
+ } \
+ *sse = add32x4_sse2(vsse); \
+ int sum = (int)add32x4_sse2(vsum); \
+ assert(sum <= 255 * bw * bh); \
+ assert(sum >= -255 * bw * bh); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \
+ }
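+
+// 'uh' is the number of rows accumulated in 16 bits before the partial sums
+// are widened to 32 bits, which keeps the inner sum accumulator from
+// overflowing.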
+
+AOM_VAR_LOOP_SSE2(32, 64, 11, 32) // 32x32 * ( 64/32 )
+
+AOM_VAR_LOOP_SSE2(64, 32, 11, 16) // 64x16 * ( 32/16 )
+AOM_VAR_LOOP_SSE2(64, 64, 12, 16) // 64x16 * ( 64/16 )
+AOM_VAR_LOOP_SSE2(64, 128, 13, 16) // 64x16 * ( 128/16 )
+
+AOM_VAR_LOOP_SSE2(128, 64, 13, 8) // 128x8 * ( 64/8 )
+AOM_VAR_LOOP_SSE2(128, 128, 14, 8) // 128x8 * ( 128/8 )
+
+#if !CONFIG_REALTIME_ONLY
+AOM_VAR_NO_LOOP_SSE2(64, 16, 10, 1024)
+#endif
+
+unsigned int aom_mse8x8_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ aom_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
+ return *sse;
+}
+
+unsigned int aom_mse8x16_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ aom_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
+ return *sse;
+}
+
+unsigned int aom_mse16x8_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ aom_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
+ return *sse;
+}
+
+unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ aom_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
+ return *sse;
+}
+
+// The 2 unused parameters are placeholders for the PIC-enabled build.
+// These definitions are for functions defined in subpel_variance.asm
+#define DECL(w, opt) \
+ int aom_sub_pixel_variance##w##xh_##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \
+ void *unused0, void *unused)
+#define DECLS(opt) \
+ DECL(4, opt); \
+ DECL(8, opt); \
+ DECL(16, opt)
+
+DECLS(sse2);
+DECLS(ssse3);
+#undef DECLS
+#undef DECL
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
+ unsigned int aom_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \
+    /* Avoid overflow in the helper by capping the height. */                \
+ const int hf = AOMMIN(h, 64); \
+ unsigned int sse = 0; \
+ int se = 0; \
+ for (int i = 0; i < (w / wf); ++i) { \
+ const uint8_t *src_ptr = src; \
+ const uint8_t *dst_ptr = dst; \
+ for (int j = 0; j < (h / hf); ++j) { \
+ unsigned int sse2; \
+ const int se2 = aom_sub_pixel_variance##wf##xh_##opt( \
+ src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \
+ &sse2, NULL, NULL); \
+ dst_ptr += hf * dst_stride; \
+ src_ptr += hf * src_stride; \
+ se += se2; \
+ sse += sse2; \
+ } \
+ src += wf; \
+ dst += wf; \
+ } \
+ *sse_ptr = sse; \
+ return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
+ }
+
+#if !CONFIG_REALTIME_ONLY
+#define FNS(opt) \
+ FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
+ FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \
+ FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)) \
+ FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)) \
+ FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)) \
+ FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)) \
+ FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)) \
+ FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)) \
+ FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)) \
+ FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)) \
+ FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)) \
+ FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)) \
+ FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
+#else
+#define FNS(opt) \
+ FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
+ FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \
+ FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)) \
+ FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)) \
+ FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)) \
+ FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)) \
+ FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)) \
+ FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t))
+#endif
+
+FNS(sse2)
+FNS(ssse3)
+
+#undef FNS
+#undef FN
+
+// The 2 unused parameters are placeholders for a PIC-enabled build.
+#define DECL(w, opt) \
+ int aom_sub_pixel_avg_variance##w##xh_##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec, \
+ ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \
+ void *unused)
+#define DECLS(opt) \
+ DECL(4, opt); \
+ DECL(8, opt); \
+ DECL(16, opt)
+
+DECLS(sse2);
+DECLS(ssse3);
+#undef DECL
+#undef DECLS
+
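+// Same tiling scheme as the plain sub-pixel variance above, except each tile
+// also takes the second predictor `sec` (stored with stride w), which the
+// avg asm helper blends into the comparison.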
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
+ unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \
+ const uint8_t *sec) { \
+ /* Avoid overflow in the helper by capping the height. */ \
+ const int hf = AOMMIN(h, 64); \
+ unsigned int sse = 0; \
+ int se = 0; \
+ for (int i = 0; i < (w / wf); ++i) { \
+ const uint8_t *src_ptr = src; \
+ const uint8_t *dst_ptr = dst; \
+ const uint8_t *sec_ptr = sec; \
+ for (int j = 0; j < (h / hf); ++j) { \
+ unsigned int sse2; \
+ const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \
+ src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
+ sec_ptr, w, hf, &sse2, NULL, NULL); \
+ dst_ptr += hf * dst_stride; \
+ src_ptr += hf * src_stride; \
+ sec_ptr += hf * w; \
+ se += se2; \
+ sse += sse2; \
+ } \
+ src += wf; \
+ dst += wf; \
+ sec += wf; \
+ } \
+ *sse_ptr = sse; \
+ return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
+ }
+
+#if !CONFIG_REALTIME_ONLY
+#define FNS(opt) \
+ FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
+ FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \
+ FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)) \
+ FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)) \
+ FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)) \
+ FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)) \
+ FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)) \
+ FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)) \
+ FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)) \
+ FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)) \
+ FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)) \
+ FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)) \
+ FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
+#else
+#define FNS(opt) \
+ FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
+ FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \
+ FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)) \
+ FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)) \
+ FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)) \
+ FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)) \
+ FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)) \
+ FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t))
+#endif
+
+FNS(sse2)
+FNS(ssse3)
+
+#undef FNS
+#undef FN
+
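+// Blends one 8-lane row of two 16-bit sources with the blend mask `a`:
+//   comp = (s0 * a + s1 * (alpha_max - a) + round) >> AOM_BLEND_A64_ROUND_BITS,
+// where alpha_max = 1 << AOM_BLEND_A64_ROUND_BITS and round = alpha_max >> 1.
+// Pixel/mask pairs are interleaved so _mm_madd_epi16 produces the 32-bit
+// weighted sums directly.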
+static INLINE __m128i highbd_comp_mask_pred_line_sse2(const __m128i s0,
+ const __m128i s1,
+ const __m128i a) {
+ const __m128i alpha_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m128i round_const =
+ _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m128i a_inv = _mm_sub_epi16(alpha_max, a);
+
+ const __m128i s_lo = _mm_unpacklo_epi16(s0, s1);
+ const __m128i a_lo = _mm_unpacklo_epi16(a, a_inv);
+ const __m128i pred_lo = _mm_madd_epi16(s_lo, a_lo);
+ const __m128i pred_l = _mm_srai_epi32(_mm_add_epi32(pred_lo, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i s_hi = _mm_unpackhi_epi16(s0, s1);
+ const __m128i a_hi = _mm_unpackhi_epi16(a, a_inv);
+ const __m128i pred_hi = _mm_madd_epi16(s_hi, a_hi);
+ const __m128i pred_h = _mm_srai_epi32(_mm_add_epi32(pred_hi, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i comp = _mm_packs_epi32(pred_l, pred_h);
+
+ return comp;
+}
+
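+// High-bitdepth masked compound prediction: src0/src1 (and their strides) are
+// chosen from ref/pred according to invert_mask, then blended row by row.
+// Dedicated paths cover widths 8 and 16; the generic path processes the width
+// in 32-pixel chunks.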
+void aom_highbd_comp_mask_pred_sse2(uint8_t *comp_pred8, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride, const uint8_t *mask,
+ int mask_stride, int invert_mask) {
+ int i = 0;
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ const uint16_t *src0 = invert_mask ? pred : ref;
+ const uint16_t *src1 = invert_mask ? ref : pred;
+ const int stride0 = invert_mask ? width : ref_stride;
+ const int stride1 = invert_mask ? ref_stride : width;
+ const __m128i zero = _mm_setzero_si128();
+
+ if (width == 8) {
+ do {
+ const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0));
+ const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1));
+ const __m128i m_8 = _mm_loadl_epi64((const __m128i *)mask);
+ const __m128i m_16 = _mm_unpacklo_epi8(m_8, zero);
+
+ const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m_16);
+
+ _mm_storeu_si128((__m128i *)comp_pred, comp);
+
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ comp_pred += width;
+ i += 1;
+ } while (i < height);
+ } else if (width == 16) {
+ do {
+ const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0));
+ const __m128i s2 = _mm_loadu_si128((const __m128i *)(src0 + 8));
+ const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1));
+ const __m128i s3 = _mm_loadu_si128((const __m128i *)(src1 + 8));
+
+ const __m128i m_8 = _mm_loadu_si128((const __m128i *)mask);
+ const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero);
+ const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero);
+
+ const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16);
+ const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16);
+
+ _mm_storeu_si128((__m128i *)comp_pred, comp);
+ _mm_storeu_si128((__m128i *)(comp_pred + 8), comp1);
+
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ comp_pred += width;
+ i += 1;
+ } while (i < height);
+ } else {
+ do {
+ for (int x = 0; x < width; x += 32) {
+ for (int j = 0; j < 2; j++) {
+ const __m128i s0 =
+ _mm_loadu_si128((const __m128i *)(src0 + x + j * 16));
+ const __m128i s2 =
+ _mm_loadu_si128((const __m128i *)(src0 + x + 8 + j * 16));
+ const __m128i s1 =
+ _mm_loadu_si128((const __m128i *)(src1 + x + j * 16));
+ const __m128i s3 =
+ _mm_loadu_si128((const __m128i *)(src1 + x + 8 + j * 16));
+
+ const __m128i m_8 =
+ _mm_loadu_si128((const __m128i *)(mask + x + j * 16));
+ const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero);
+ const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero);
+
+ const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16);
+ const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16);
+
+ _mm_storeu_si128((__m128i *)(comp_pred + j * 16), comp);
+ _mm_storeu_si128((__m128i *)(comp_pred + 8 + j * 16), comp1);
+ }
+ comp_pred += 32;
+ }
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ i += 1;
+ } while (i < height);
+ }
+}
+
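+// Sum of squared differences between an 8-bit dst block and a 16-bit src
+// block, 4 pixels wide, processing two rows per iteration and accumulating
+// the result in 64 bits.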
+uint64_t aom_mse_4xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
+ int sstride, int h) {
+ uint64_t sum = 0;
+ __m128i dst0_8x8, dst1_8x8, dst_16x8;
+ __m128i src0_16x4, src1_16x4, src_16x8;
+ __m128i res0_32x4, res0_64x2, res1_64x2;
+ __m128i sub_result_16x8;
+ const __m128i zeros = _mm_setzero_si128();
+ __m128i square_result = _mm_setzero_si128();
+ for (int i = 0; i < h; i += 2) {
+ dst0_8x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 0) * dstride]));
+ dst1_8x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 1) * dstride]));
+ dst_16x8 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(dst0_8x8, dst1_8x8), zeros);
+
+ src0_16x4 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride]));
+ src1_16x4 = _mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride]));
+ src_16x8 = _mm_unpacklo_epi64(src0_16x4, src1_16x4);
+
+ sub_result_16x8 = _mm_sub_epi16(src_16x8, dst_16x8);
+
+ res0_32x4 = _mm_madd_epi16(sub_result_16x8, sub_result_16x8);
+
+ res0_64x2 = _mm_unpacklo_epi32(res0_32x4, zeros);
+ res1_64x2 = _mm_unpackhi_epi32(res0_32x4, zeros);
+
+ square_result =
+ _mm_add_epi64(square_result, _mm_add_epi64(res0_64x2, res1_64x2));
+ }
+ const __m128i sum_64x1 =
+ _mm_add_epi64(square_result, _mm_srli_si128(square_result, 8));
+ xx_storel_64(&sum, sum_64x1);
+ return sum;
+}
+
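+// 8-wide variant of the above: one full 8-pixel row per iteration.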
+uint64_t aom_mse_8xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
+ int sstride, int h) {
+ uint64_t sum = 0;
+ __m128i dst_8x8, dst_16x8;
+ __m128i src_16x8;
+ __m128i res0_32x4, res0_64x2, res1_64x2;
+ __m128i sub_result_16x8;
+ const __m128i zeros = _mm_setzero_si128();
+ __m128i square_result = _mm_setzero_si128();
+
+ for (int i = 0; i < h; i++) {
+ dst_8x8 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 0) * dstride]));
+ dst_16x8 = _mm_unpacklo_epi8(dst_8x8, zeros);
+
+ src_16x8 = _mm_loadu_si128((__m128i *)&src[i * sstride]);
+
+ sub_result_16x8 = _mm_sub_epi16(src_16x8, dst_16x8);
+
+ res0_32x4 = _mm_madd_epi16(sub_result_16x8, sub_result_16x8);
+
+ res0_64x2 = _mm_unpacklo_epi32(res0_32x4, zeros);
+ res1_64x2 = _mm_unpackhi_epi32(res0_32x4, zeros);
+
+ square_result =
+ _mm_add_epi64(square_result, _mm_add_epi64(res0_64x2, res1_64x2));
+ }
+ const __m128i sum_64x1 =
+ _mm_add_epi64(square_result, _mm_srli_si128(square_result, 8));
+ xx_storel_64(&sum, sum_64x1);
+ return sum;
+}
+
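+// Dispatches to the 4- or 8-wide kernel; only 4x4, 4x8, 8x4 and 8x8 blocks
+// are supported.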
+uint64_t aom_mse_wxh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
+ int sstride, int w, int h) {
+ assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
+ "w=8/4 and h=8/4 must satisfy");
+ switch (w) {
+ case 4: return aom_mse_4xh_16bit_sse2(dst, dstride, src, sstride, h);
+ case 8: return aom_mse_8xh_16bit_sse2(dst, dstride, src, sstride, h);
+ default: assert(0 && "unsupported width"); return -1;
+ }
+}
+
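+// The src buffer is indexed as 16/w contiguous w x h sub-blocks (stride w), so
+// each sub-block is measured with the w-wide kernel and the results summed.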
+uint64_t aom_mse_16xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
+ int w, int h) {
+ assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
+ "w=8/4 and h=8/4 must be satisfied");
+ const int num_blks = 16 / w;
+ uint64_t sum = 0;
+ for (int i = 0; i < num_blks; i++) {
+ sum += aom_mse_wxh_16bit_sse2(dst, dstride, src, w, w, h);
+ dst += w;
+ src += (w * h);
+ }
+ return sum;
+}