Merging upstream version 126.0.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-15 03:35:49 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-15 03:35:49 +0000
commit: d8bbc7858622b6d9c278469aab701ca0b609cddf (patch)
tree: eff41dc61d9f714852212739e6b3738b82a2af87 /third_party/aom/aom_dsp/x86
parent: Releasing progress-linux version 125.0.3-1~progress7.99u1. (diff)
download: firefox-d8bbc7858622b6d9c278469aab701ca0b609cddf.tar.xz
firefox-d8bbc7858622b6d9c278469aab701ca0b609cddf.zip
16 files changed, 147 insertions, 1627 deletions
diff --git a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
index b08ec2546b..6c7fdd6eb1 100644
--- a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
+++ b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
@@ -15,40 +15,6 @@
 #include "aom_dsp/x86/convolve.h"
 
 #if HAVE_SSE2
-filter8_1dfunction aom_filter_block1d16_v8_sse2;
-filter8_1dfunction aom_filter_block1d16_h8_sse2;
-filter8_1dfunction aom_filter_block1d8_v8_sse2;
-filter8_1dfunction aom_filter_block1d8_h8_sse2;
-filter8_1dfunction aom_filter_block1d4_v8_sse2;
-filter8_1dfunction aom_filter_block1d4_h8_sse2;
-filter8_1dfunction aom_filter_block1d16_v4_sse2;
-filter8_1dfunction aom_filter_block1d16_h4_sse2;
-
-filter8_1dfunction aom_filter_block1d8_h4_sse2;
-filter8_1dfunction aom_filter_block1d8_v4_sse2;
-filter8_1dfunction aom_filter_block1d4_h4_sse2;
-filter8_1dfunction aom_filter_block1d4_v4_sse2;
-
-filter8_1dfunction aom_filter_block1d16_v2_sse2;
-filter8_1dfunction aom_filter_block1d16_h2_sse2;
-filter8_1dfunction aom_filter_block1d8_v2_sse2;
-filter8_1dfunction aom_filter_block1d8_h2_sse2;
-filter8_1dfunction aom_filter_block1d4_v2_sse2;
-filter8_1dfunction aom_filter_block1d4_h2_sse2;
-
-// void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                               uint8_t *dst, ptrdiff_t dst_stride,
-//                               const int16_t *filter_x, int x_step_q4,
-//                               const int16_t *filter_y, int y_step_q4,
-//                               int w, int h);
-// void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
-//                              uint8_t *dst, ptrdiff_t dst_stride,
-//                              const int16_t *filter_x, int x_step_q4,
-//                              const int16_t *filter_y, int y_step_q4,
-//                              int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2)
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2)
-
 #if CONFIG_AV1_HIGHBITDEPTH
 highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2;
 highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2;
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c
deleted file mode 100644
index 5c36b68727..0000000000
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c
+++ /dev/null
@@ -1,569 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>  // SSE2
-
-#include "config/aom_dsp_rtcd.h"
-#include "aom_dsp/x86/convolve.h"
-#include "aom_ports/mem.h"
-
-void aom_filter_block1d16_h4_sse2(const uint8_t *src_ptr,
-                                  ptrdiff_t src_pixels_per_line,
-                                  uint8_t *output_ptr, ptrdiff_t output_pitch,
-                                  uint32_t output_height,
-                                  const int16_t *filter) {
-  __m128i filtersReg;
-  __m128i addFilterReg32;
-  __m128i secondFilters, thirdFilters;
-  __m128i srcRegFilt32b1_1, srcRegFilt32b1_2, srcRegFilt32b2_1,
-      srcRegFilt32b2_2;
-  __m128i srcReg32b1, srcReg32b2;
-  unsigned int i;
-  src_ptr -= 3;
-  addFilterReg32 = _mm_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-
-  // coeffs 0 1 0 1 2 3 2 3
-  const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
-  // coeffs 4 5 4 5 6 7 6 7
-  const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
-
-  secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0);  // coeffs 2 3 2 3 2 3 2 3
-  thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1);   // coeffs 4 5 4 5 4 5 4 5
-
-  for (i = output_height; i > 0; i -= 1) {
-    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
-
-    __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2);
-    __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4);
-    __m128i ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
-    __m128i ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
-    __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters);
-    __m128i d2 = _mm_madd_epi16(ss_2_1, thirdFilters);
-    srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);
-
-    __m128i ss_1 = _mm_srli_si128(srcReg32b1, 3);
-    __m128i ss_3 = _mm_srli_si128(srcReg32b1, 5);
-    __m128i ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128());
-    __m128i ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
-    d1 = _mm_madd_epi16(ss_1_2, secondFilters);
-    d2 = _mm_madd_epi16(ss_2_2, thirdFilters);
-    srcRegFilt32b1_2 = _mm_add_epi32(d1, d2);
-
-    __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
-    __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
-    srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi);
-
-    // reading stride of the next 16 bytes
-    // (part of it was being read by earlier read)
-    srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
-
-    ss_2 = _mm_srli_si128(srcReg32b2, 2);
-    ss_4 = _mm_srli_si128(srcReg32b2, 4);
-    ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
-    ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
-    d1 = _mm_madd_epi16(ss_1_1, secondFilters);
-    d2 = _mm_madd_epi16(ss_2_1, thirdFilters);
-    srcRegFilt32b2_1 = _mm_add_epi32(d1, d2);
-
-    ss_1 = _mm_srli_si128(srcReg32b2, 3);
-    ss_3 = _mm_srli_si128(srcReg32b2, 5);
-    ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128());
-    ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
-    d1 = _mm_madd_epi16(ss_1_2, secondFilters);
-    d2 = _mm_madd_epi16(ss_2_2, thirdFilters);
-    srcRegFilt32b2_2 = _mm_add_epi32(d1, d2);
-
-    res_lo = _mm_unpacklo_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2);
-    res_hi = _mm_unpackhi_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2);
-    srcRegFilt32b2_1 = _mm_packs_epi32(res_lo, res_hi);
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
-    srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
-    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
-    srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
-
-    src_ptr += src_pixels_per_line;
-
-    _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1);
-
-    output_ptr += output_pitch;
-  }
-}
-
-void aom_filter_block1d16_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                  uint8_t *output_ptr, ptrdiff_t out_pitch,
-                                  uint32_t output_height,
-                                  const int16_t *filter) {
-  __m128i filtersReg;
-  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
-  __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
-  __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi;
-  __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
-  __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi;
-  __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi;
-  __m128i resReg23_45, resReg34_56;
-  __m128i addFilterReg32, secondFilters, thirdFilters;
-  __m128i tmp_0, tmp_1;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-
-  addFilterReg32 = _mm_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-
-  // coeffs 0 1 0 1 2 3 2 3
-  const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
-  // coeffs 4 5 4 5 6 7 6 7
-  const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
-
-  secondFilters = _mm_unpackhi_epi64(tmp0, tmp0);  // coeffs 2 3 2 3 2 3 2 3
-  thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1);   // coeffs 4 5 4 5 4 5 4 5
-
-  // multiply the size of the source and destination stride by two
-  src_stride = src_pitch << 1;
-  dst_stride = out_pitch << 1;
-
-  srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
-  srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
-  srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3);
-  srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3);
-  __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128());
-  __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128());
-  __m128i resReg23_hi_1 = _mm_unpacklo_epi8(srcReg23_hi, _mm_setzero_si128());
-  __m128i resReg23_hi_2 = _mm_unpackhi_epi8(srcReg23_hi, _mm_setzero_si128());
-
-  srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
-  srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4);
-  srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4);
-  __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128());
-  __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128());
-  __m128i resReg34_hi_1 = _mm_unpacklo_epi8(srcReg34_hi, _mm_setzero_si128());
-  __m128i resReg34_hi_2 = _mm_unpackhi_epi8(srcReg34_hi, _mm_setzero_si128());
-
-  for (i = output_height; i > 1; i -= 2) {
-    srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
-
-    srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5);
-    srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5);
-
-    srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
-
-    srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6);
-    srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6);
-
-    // multiply 2 adjacent elements with the filter and add the result
-
-    tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters);
-    tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters);
-    resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1);
-
-    tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters);
-    tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters);
-    resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1);
-
-    __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128());
-    __m128i resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128());
-    tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters);
-    tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters);
-    resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1);
-
-    __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128());
-    __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128());
-    tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters);
-    tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters);
-    resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1);
-
-    // add and saturate the results together
-    resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo);
-    resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo);
-
-    // multiply 2 adjacent elements with the filter and add the result
-
-    tmp_0 = _mm_madd_epi16(resReg23_hi_1, secondFilters);
-    tmp_1 = _mm_madd_epi16(resReg23_hi_2, secondFilters);
-    resReg23_hi = _mm_packs_epi32(tmp_0, tmp_1);
-
-    tmp_0 = _mm_madd_epi16(resReg34_hi_1, secondFilters);
-    tmp_1 = _mm_madd_epi16(resReg34_hi_2, secondFilters);
-    resReg34_hi = _mm_packs_epi32(tmp_0, tmp_1);
-
-    __m128i resReg45_hi_1 = _mm_unpacklo_epi8(srcReg45_hi, _mm_setzero_si128());
-    __m128i resReg45_hi_2 = _mm_unpackhi_epi8(srcReg45_hi, _mm_setzero_si128());
-    tmp_0 = _mm_madd_epi16(resReg45_hi_1, thirdFilters);
-    tmp_1 = _mm_madd_epi16(resReg45_hi_2, thirdFilters);
-    resReg45_hi = _mm_packs_epi32(tmp_0, tmp_1);
-
-    __m128i resReg56_hi_1 = _mm_unpacklo_epi8(srcReg56_hi, _mm_setzero_si128());
-    __m128i resReg56_hi_2 = _mm_unpackhi_epi8(srcReg56_hi, _mm_setzero_si128());
-    tmp_0 = _mm_madd_epi16(resReg56_hi_1, thirdFilters);
-    tmp_1 = _mm_madd_epi16(resReg56_hi_2, thirdFilters);
-    resReg56_hi = _mm_packs_epi32(tmp_0, tmp_1);
-
-    // add and saturate the results together
-    resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi);
-    resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi);
-
-    // shift by 6 bit each 16 bit
-    resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32);
-    resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32);
-    resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32);
-    resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32);
-    resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6);
-    resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6);
-    resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6);
-    resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi);
-    resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi);
-
-    src_ptr += src_stride;
-
-    _mm_store_si128((__m128i *)output_ptr, (resReg23_45));
-    _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56));
-
-    output_ptr += dst_stride;
-
-    // save part of the registers for next strides
-    resReg23_lo_1 = resReg45_lo_1;
-    resReg23_lo_2 = resReg45_lo_2;
-    resReg23_hi_1 = resReg45_hi_1;
-    resReg23_hi_2 = resReg45_hi_2;
-    resReg34_lo_1 = resReg56_lo_1;
-    resReg34_lo_2 = resReg56_lo_2;
-    resReg34_hi_1 = resReg56_hi_1;
-    resReg34_hi_2 = resReg56_hi_2;
-    srcReg4 = srcReg6;
-  }
-}
-
-void aom_filter_block1d8_h4_sse2(const uint8_t *src_ptr,
-                                 ptrdiff_t src_pixels_per_line,
-                                 uint8_t *output_ptr, ptrdiff_t output_pitch,
-                                 uint32_t output_height,
-                                 const int16_t *filter) {
-  __m128i filtersReg;
-  __m128i addFilterReg32;
-  __m128i secondFilters, thirdFilters;
-  __m128i srcRegFilt32b1_1, srcRegFilt32b1_2;
-  __m128i srcReg32b1;
-  unsigned int i;
-  src_ptr -= 3;
-  addFilterReg32 = _mm_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-
-  // coeffs 0 1 0 1 2 3 2 3
-  const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
-  // coeffs 4 5 4 5 6 7 6 7
-  const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
-
-  secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0);  // coeffs 2 3 2 3 2 3 2 3
-  thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1);   // coeffs 4 5 4 5 4 5 4 5
-
-  for (i = output_height; i > 0; i -= 1) {
-    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
-
-    __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2);
-    __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4);
-    ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
-    ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
-    __m128i d1 = _mm_madd_epi16(ss_2, secondFilters);
-    __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters);
-    srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);
-
-    __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3);
-    __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5);
-    ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
-    ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128());
-    d1 = _mm_madd_epi16(ss_3, secondFilters);
-    d2 = _mm_madd_epi16(ss_5, thirdFilters);
-    srcRegFilt32b1_2 = _mm_add_epi32(d1, d2);
-
-    __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
-    __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
-    srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi);
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
-    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
-
-    src_ptr += src_pixels_per_line;
-
-    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1);
-
-    output_ptr += output_pitch;
-  }
-}
-
-void aom_filter_block1d8_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                 uint8_t *output_ptr, ptrdiff_t out_pitch,
-                                 uint32_t output_height,
-                                 const int16_t *filter) {
-  __m128i filtersReg;
-  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
-  __m128i srcReg23_lo, srcReg34_lo;
-  __m128i srcReg45_lo, srcReg56_lo;
-  __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
-  __m128i resReg23_45_lo, resReg34_56_lo;
-  __m128i resReg23_45, resReg34_56;
-  __m128i addFilterReg32, secondFilters, thirdFilters;
-  __m128i tmp_0, tmp_1;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-
-  addFilterReg32 = _mm_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-
-  // coeffs 0 1 0 1 2 3 2 3
-  const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
-  // coeffs 4 5 4 5 6 7 6 7
-  const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
-
-  secondFilters = _mm_unpackhi_epi64(tmp0, tmp0);  // coeffs 2 3 2 3 2 3 2 3
-  thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1);   // coeffs 4 5 4 5 4 5 4 5
-
-  // multiply the size of the source and destination stride by two
-  src_stride = src_pitch << 1;
-  dst_stride = out_pitch << 1;
-
-  srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
-  srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
-  srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3);
-  __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128());
-  __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128());
-
-  srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
-  srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4);
-  __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128());
-  __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128());
-
-  for (i = output_height; i > 1; i -= 2) {
-    srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
-    srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5);
-
-    srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
-    srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6);
-
-    // multiply 2 adjacent elements with the filter and add the result
-
-    tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters);
-    tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters);
-    resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1);
-
-    tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters);
-    tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters);
-    resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1);
-
-    __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128());
-    __m128i resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128());
-    tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters);
-    tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters);
-    resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1);
-
-    __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128());
-    __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128());
-    tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters);
-    tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters);
-    resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1);
-
-    // add and saturate the results together
-    resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo);
-    resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo);
-
-    // shift by 6 bit each 16 bit
-    resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32);
-    resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32);
-    resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6);
-    resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    resReg23_45 = _mm_packus_epi16(resReg23_45_lo, _mm_setzero_si128());
-    resReg34_56 = _mm_packus_epi16(resReg34_56_lo, _mm_setzero_si128());
-
-    src_ptr += src_stride;
-
-    _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45));
-    _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56));
-
-    output_ptr += dst_stride;
-
-    // save part of the registers for next strides
-    resReg23_lo_1 = resReg45_lo_1;
-    resReg23_lo_2 = resReg45_lo_2;
-    resReg34_lo_1 = resReg56_lo_1;
-    resReg34_lo_2 = resReg56_lo_2;
-    srcReg4 = srcReg6;
-  }
-}
-
-void aom_filter_block1d4_h4_sse2(const uint8_t *src_ptr,
-                                 ptrdiff_t src_pixels_per_line,
-                                 uint8_t *output_ptr, ptrdiff_t output_pitch,
-                                 uint32_t output_height,
-                                 const int16_t *filter) {
-  __m128i filtersReg;
-  __m128i addFilterReg32;
-  __m128i secondFilters, thirdFilters;
-  __m128i srcRegFilt32b1_1;
-  __m128i srcReg32b1;
-  unsigned int i;
-  src_ptr -= 3;
-  addFilterReg32 = _mm_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-
-  // coeffs 0 1 0 1 2 3 2 3
-  const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
-  // coeffs 4 5 4 5 6 7 6 7
-  const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
-
-  secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0);  // coeffs 2 3 2 3 2 3 2 3
-  thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1);   // coeffs 4 5 4 5 4 5 4 5
-
-  for (i = output_height; i > 0; i -= 1) {
-    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
-
-    __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2);
-    __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3);
-    __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4);
-    __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5);
-
-    ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
-    ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
-    ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
-    ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128());
-
-    __m128i ss_1_1 = _mm_unpacklo_epi32(ss_2, ss_3);
-    __m128i ss_1_2 = _mm_unpacklo_epi32(ss_4, ss_5);
-
-    __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters);
-    __m128i d2 = _mm_madd_epi16(ss_1_2, thirdFilters);
-    srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);
-
-    srcRegFilt32b1_1 = _mm_packs_epi32(srcRegFilt32b1_1, _mm_setzero_si128());
-
-    // shift by 6 bit each 16 bit
-    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
-    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve result
-    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
-
-    src_ptr += src_pixels_per_line;
-
-    *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1);
-
-    output_ptr += output_pitch;
-  }
-}
-
-void aom_filter_block1d4_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                 uint8_t *output_ptr, ptrdiff_t out_pitch,
-                                 uint32_t output_height,
-                                 const int16_t *filter) {
-  __m128i filtersReg;
-  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
-  __m128i srcReg23, srcReg34, srcReg45, srcReg56;
-  __m128i resReg23_34, resReg45_56;
-  __m128i resReg23_34_45_56;
-  __m128i addFilterReg32, secondFilters, thirdFilters;
-  __m128i tmp_0, tmp_1;
-  unsigned int i;
-  ptrdiff_t src_stride, dst_stride;
-
-  addFilterReg32 = _mm_set1_epi16(32);
-  filtersReg = _mm_loadu_si128((const __m128i *)filter);
-  filtersReg = _mm_srai_epi16(filtersReg, 1);
-
-  // coeffs 0 1 0 1 2 3 2 3
-  const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
-  // coeffs 4 5 4 5 6 7 6 7
-  const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
-
-  secondFilters = _mm_unpackhi_epi64(tmp0, tmp0);  // coeffs 2 3 2 3 2 3 2 3
-  thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1);   // coeffs 4 5 4 5 4 5 4 5
-
-  // multiply the size of the source and destination stride by two
-  src_stride = src_pitch << 1;
-  dst_stride = out_pitch << 1;
-
-  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
-  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
-  srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3);
-  __m128i resReg23 = _mm_unpacklo_epi8(srcReg23, _mm_setzero_si128());
-
-  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
-  srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4);
-  __m128i resReg34 = _mm_unpacklo_epi8(srcReg34, _mm_setzero_si128());
-
-  for (i = output_height; i > 1; i -= 2) {
-    srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
-    srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5);
-    srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
-    srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6);
-
-    // multiply 2 adjacent elements with the filter and add the result
-    tmp_0 = _mm_madd_epi16(resReg23, secondFilters);
-    tmp_1 = _mm_madd_epi16(resReg34, secondFilters);
-    resReg23_34 = _mm_packs_epi32(tmp_0, tmp_1);
-
-    __m128i resReg45 = _mm_unpacklo_epi8(srcReg45, _mm_setzero_si128());
-    __m128i resReg56 = _mm_unpacklo_epi8(srcReg56, _mm_setzero_si128());
-
-    tmp_0 = _mm_madd_epi16(resReg45, thirdFilters);
-    tmp_1 = _mm_madd_epi16(resReg56, thirdFilters);
-    resReg45_56 = _mm_packs_epi32(tmp_0, tmp_1);
-
-    // add and saturate the results together
-    resReg23_34_45_56 = _mm_adds_epi16(resReg23_34, resReg45_56);
-
-    // shift by 6 bit each 16 bit
-    resReg23_34_45_56 = _mm_adds_epi16(resReg23_34_45_56, addFilterReg32);
-    resReg23_34_45_56 = _mm_srai_epi16(resReg23_34_45_56, 6);
-
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    resReg23_34_45_56 =
-        _mm_packus_epi16(resReg23_34_45_56, _mm_setzero_si128());
-
-    src_ptr += src_stride;
-
-    *((int *)(output_ptr)) = _mm_cvtsi128_si32(resReg23_34_45_56);
-    *((int *)(output_ptr + out_pitch)) =
-        _mm_cvtsi128_si32(_mm_srli_si128(resReg23_34_45_56, 4));
-
-    output_ptr += dst_stride;
-
-    // save part of the registers for next strides
-    resReg23 = resReg45;
-    resReg34 = resReg56;
-    srcReg4 = srcReg6;
-  }
-}
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm
deleted file mode 100644
index 640c5b2416..0000000000
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm
+++ /dev/null
@@ -1,615 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-
-%include "aom_ports/x86_abi_support.asm"
-
-;Note: tap3 and tap4 have to be applied and added after other taps to avoid
-;overflow.
-
-%macro GET_FILTERS_4 0
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm7, [rdx]                 ;load filters
-    pshuflw     xmm0, xmm7, 0b              ;k0
-    pshuflw     xmm1, xmm7, 01010101b       ;k1
-    pshuflw     xmm2, xmm7, 10101010b       ;k2
-    pshuflw     xmm3, xmm7, 11111111b       ;k3
-    psrldq      xmm7, 8
-    pshuflw     xmm4, xmm7, 0b              ;k4
-    pshuflw     xmm5, xmm7, 01010101b       ;k5
-    pshuflw     xmm6, xmm7, 10101010b       ;k6
-    pshuflw     xmm7, xmm7, 11111111b       ;k7
-
-    punpcklqdq  xmm0, xmm1
-    punpcklqdq  xmm2, xmm3
-    punpcklqdq  xmm5, xmm4
-    punpcklqdq  xmm6, xmm7
-
-    movdqa      k0k1, xmm0
-    movdqa      k2k3, xmm2
-    movdqa      k5k4, xmm5
-    movdqa      k6k7, xmm6
-
-    movq        xmm6, rcx
-    pshufd      xmm6, xmm6, 0
-    movdqa      krd, xmm6
-
-    pxor        xmm7, xmm7
-    movdqa      zero, xmm7
-%endm
-
-%macro APPLY_FILTER_4 1
-    punpckldq   xmm0, xmm1                  ;two row in one register
-    punpckldq   xmm6, xmm7
-    punpckldq   xmm2, xmm3
-    punpckldq   xmm5, xmm4
-
-    punpcklbw   xmm0, zero                  ;unpack to word
-    punpcklbw   xmm6, zero
-    punpcklbw   xmm2, zero
-    punpcklbw   xmm5, zero
-
-    pmullw      xmm0, k0k1                  ;multiply the filter factors
-    pmullw      xmm6, k6k7
-    pmullw      xmm2, k2k3
-    pmullw      xmm5, k5k4
-
-    paddsw      xmm0, xmm6                  ;sum
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 8
-    paddsw      xmm0, xmm1
-    paddsw      xmm0, xmm2
-    psrldq      xmm2, 8
-    paddsw      xmm0, xmm5
-    psrldq      xmm5, 8
-    paddsw      xmm0, xmm2
-    paddsw      xmm0, xmm5
-
-    paddsw      xmm0, krd                   ;rounding
-    psraw       xmm0, 7                     ;shift
-    packuswb    xmm0, xmm0                  ;pack to byte
-
-%if %1
-    movd        xmm1, [rdi]
-    pavgb       xmm0, xmm1
-%endif
-    movd        [rdi], xmm0
-%endm
-
-%macro GET_FILTERS 0
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm7, [rdx]                 ;load filters
-    pshuflw     xmm0, xmm7, 0b              ;k0
-    pshuflw     xmm1, xmm7, 01010101b       ;k1
-    pshuflw     xmm2, xmm7, 10101010b       ;k2
-    pshuflw     xmm3, xmm7, 11111111b       ;k3
-    pshufhw     xmm4, xmm7, 0b              ;k4
-    pshufhw     xmm5, xmm7, 01010101b       ;k5
-    pshufhw     xmm6, xmm7, 10101010b       ;k6
-    pshufhw     xmm7, xmm7, 11111111b       ;k7
-
-    punpcklwd   xmm0, xmm0
-    punpcklwd   xmm1, xmm1
-    punpcklwd   xmm2, xmm2
-    punpcklwd   xmm3, xmm3
-    punpckhwd   xmm4, xmm4
-    punpckhwd   xmm5, xmm5
-    punpckhwd   xmm6, xmm6
-    punpckhwd   xmm7, xmm7
-
-    movdqa      k0,   xmm0                  ;store filter factors on stack
-    movdqa      k1,   xmm1
-    movdqa      k2,   xmm2
-    movdqa      k3,   xmm3
-    movdqa      k4,   xmm4
-    movdqa      k5,   xmm5
-    movdqa      k6,   xmm6
-    movdqa      k7,   xmm7
-
-    movq        xmm6, rcx
-    pshufd      xmm6, xmm6, 0
-    movdqa      krd, xmm6                   ;rounding
-
-    pxor        xmm7, xmm7
-    movdqa      zero, xmm7
-%endm
-
-%macro LOAD_VERT_8 1
-    movq        xmm0, [rsi + %1]            ;0
-    movq        xmm1, [rsi + rax + %1]      ;1
-    movq        xmm6, [rsi + rdx * 2 + %1]  ;6
-    lea         rsi,  [rsi + rax]
-    movq        xmm7, [rsi + rdx * 2 + %1]  ;7
-    movq        xmm2, [rsi + rax + %1]      ;2
-    movq        xmm3, [rsi + rax * 2 + %1]  ;3
-    movq        xmm4, [rsi + rdx + %1]      ;4
-    movq        xmm5, [rsi + rax * 4 + %1]  ;5
-%endm
-
-%macro APPLY_FILTER_8 2
-    punpcklbw   xmm0, zero
-    punpcklbw   xmm1, zero
-    punpcklbw   xmm6, zero
-    punpcklbw   xmm7, zero
-    punpcklbw   xmm2, zero
-    punpcklbw   xmm5, zero
-    punpcklbw   xmm3, zero
-    punpcklbw   xmm4, zero
-
-    pmullw      xmm0, k0
-    pmullw      xmm1, k1
-    pmullw      xmm6, k6
-    pmullw      xmm7, k7
-    pmullw      xmm2, k2
-    pmullw      xmm5, k5
-    pmullw      xmm3, k3
-    pmullw      xmm4, k4
-
-    paddsw      xmm0, xmm1
-    paddsw      xmm0, xmm6
-    paddsw      xmm0, xmm7
-    paddsw      xmm0, xmm2
-    paddsw      xmm0, xmm5
-    paddsw      xmm0, xmm3
-    paddsw      xmm0, xmm4
-
-    paddsw      xmm0, krd                   ;rounding
-    psraw       xmm0, 7                     ;shift
-    packuswb    xmm0, xmm0                  ;pack back to byte
-%if %1
-    movq        xmm1, [rdi + %2]
-    pavgb       xmm0, xmm1
-%endif
-    movq        [rdi + %2], xmm0
-%endm
-
-SECTION .text
-
-;void aom_filter_block1d4_v8_sse2
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    short *filter
-;)
-globalsym(aom_filter_block1d4_v8_sse2)
-sym(aom_filter_block1d4_v8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 6
-    %define k0k1 [rsp + 16 * 0]
-    %define k2k3 [rsp + 16 * 1]
-    %define k5k4 [rsp + 16 * 2]
-    %define k6k7 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define zero [rsp + 16 * 5]
-
-    GET_FILTERS_4
-
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movd        xmm0, [rsi]                 ;load src: row 0
-    movd        xmm1, [rsi + rax]           ;1
-    movd        xmm6, [rsi + rdx * 2]       ;6
-    lea         rsi,  [rsi + rax]
-    movd        xmm7, [rsi + rdx * 2]       ;7
-    movd        xmm2, [rsi + rax]           ;2
-    movd        xmm3, [rsi + rax * 2]       ;3
-    movd        xmm4, [rsi + rdx]           ;4
-    movd        xmm5, [rsi + rax * 4]       ;5
-
-    APPLY_FILTER_4 0
-
-    lea         rdi, [rdi + rbx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 6
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void aom_filter_block1d8_v8_sse2
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    short *filter
-;)
-globalsym(aom_filter_block1d8_v8_sse2)
-sym(aom_filter_block1d8_v8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 10
-    %define k0 [rsp + 16 * 0]
-    %define k1 [rsp + 16 * 1]
-    %define k2 [rsp + 16 * 2]
-    %define k3 [rsp + 16 * 3]
-    %define k4 [rsp + 16 * 4]
-    %define k5 [rsp + 16 * 5]
-    %define k6 [rsp + 16 * 6]
-    %define k7 [rsp + 16 * 7]
-    %define krd [rsp + 16 * 8]
-    %define zero [rsp + 16 * 9]
-
-    GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    LOAD_VERT_8 0
-    APPLY_FILTER_8 0, 0
-
-    lea         rdi, [rdi + rbx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 10
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void aom_filter_block1d16_v8_sse2
-;(
-;    unsigned char *src_ptr,
-;    unsigned int   src_pitch,
-;    unsigned char *output_ptr,
-;    unsigned int   out_pitch,
-;    unsigned int   output_height,
-;    short *filter
-;)
-globalsym(aom_filter_block1d16_v8_sse2)
-sym(aom_filter_block1d16_v8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rbx
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 10
-    %define k0 [rsp + 16 * 0]
-    %define k1 [rsp + 16 * 1]
-    %define k2 [rsp + 16 * 2]
-    %define k3 [rsp + 16 * 3]
-    %define k4 [rsp + 16 * 4]
-    %define k5 [rsp + 16 * 5]
-    %define k6 [rsp + 16 * 6]
-    %define k7 [rsp + 16 * 7]
-    %define krd [rsp + 16 * 8]
-    %define zero [rsp + 16 * 9]
-
-    GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
-    lea         rdx, [rax + rax * 2]
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    LOAD_VERT_8 0
-    APPLY_FILTER_8 0, 0
-    sub         rsi, rax
-
-    LOAD_VERT_8 8
-    APPLY_FILTER_8 0, 8
-    add         rdi, rbx
-
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 10
-    pop rsp
-    pop rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void aom_filter_block1d4_h8_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    short *filter
-;)
-globalsym(aom_filter_block1d4_h8_sse2)
-sym(aom_filter_block1d4_h8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 6
-    %define k0k1 [rsp + 16 * 0]
-    %define k2k3 [rsp + 16 * 1]
-    %define k5k4 [rsp + 16 * 2]
-    %define k6k7 [rsp + 16 * 3]
-    %define krd [rsp + 16 * 4]
-    %define zero [rsp + 16 * 5]
-
-    GET_FILTERS_4
-
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 3]           ;load src
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm6, xmm0
-    movdqa      xmm7, xmm0
-    movdqa      xmm2, xmm0
-    movdqa      xmm3, xmm0
-    movdqa      xmm5, xmm0
-    movdqa      xmm4, xmm0
-
-    psrldq      xmm1, 1
-    psrldq      xmm6, 6
-    psrldq      xmm7, 7
-    psrldq      xmm2, 2
-    psrldq      xmm3, 3
-    psrldq      xmm5, 5
-    psrldq      xmm4, 4
-
-    APPLY_FILTER_4 0
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 6
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void aom_filter_block1d8_h8_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    short *filter
-;)
-globalsym(aom_filter_block1d8_h8_sse2)
-sym(aom_filter_block1d8_h8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 10
-    %define k0 [rsp + 16 * 0]
-    %define k1 [rsp + 16 * 1]
-    %define k2 [rsp + 16 * 2]
-    %define k3 [rsp + 16 * 3]
-    %define k4 [rsp + 16 * 4]
-    %define k5 [rsp + 16 * 5]
-    %define k6 [rsp + 16 * 6]
-    %define k7 [rsp + 16 * 7]
-    %define krd [rsp + 16 * 8]
-    %define zero [rsp + 16 * 9]
-
-    GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 3]           ;load src
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm6, xmm0
-    movdqa      xmm7, xmm0
-    movdqa      xmm2, xmm0
-    movdqa      xmm5, xmm0
-    movdqa      xmm3, xmm0
-    movdqa      xmm4, xmm0
-
-    psrldq      xmm1, 1
-    psrldq      xmm6, 6
-    psrldq      xmm7, 7
-    psrldq      xmm2, 2
-    psrldq      xmm5, 5
-    psrldq      xmm3, 3
-    psrldq      xmm4, 4
-
-    APPLY_FILTER_8 0, 0
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 10
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void aom_filter_block1d16_h8_sse2
-;(
-;    unsigned char  *src_ptr,
-;    unsigned int    src_pixels_per_line,
-;    unsigned char  *output_ptr,
-;    unsigned int    output_pitch,
-;    unsigned int    output_height,
-;    short *filter
-;)
-globalsym(aom_filter_block1d16_h8_sse2)
-sym(aom_filter_block1d16_h8_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 16 * 10
-    %define k0 [rsp + 16 * 0]
-    %define k1 [rsp + 16 * 1]
-    %define k2 [rsp + 16 * 2]
-    %define k3 [rsp + 16 * 3]
-    %define k4 [rsp + 16 * 4]
-    %define k5 [rsp + 16 * 5]
-    %define k6 [rsp + 16 * 6]
-    %define k7 [rsp + 16 * 7]
-    %define krd [rsp + 16 * 8]
-    %define zero [rsp + 16 * 9]
-
-    GET_FILTERS
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-
-.loop:
-    movdqu      xmm0,   [rsi - 3]           ;load src
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm6, xmm0
-    movdqa      xmm7, xmm0
-    movdqa      xmm2, xmm0
-    movdqa      xmm5, xmm0
-    movdqa      xmm3, xmm0
-    movdqa      xmm4, xmm0
-
-    psrldq      xmm1, 1
-    psrldq      xmm6, 6
-    psrldq      xmm7, 7
-    psrldq      xmm2, 2
-    psrldq      xmm5, 5
-    psrldq      xmm3, 3
-    psrldq      xmm4, 4
-
-    APPLY_FILTER_8 0, 0
-
-    movdqu      xmm0,   [rsi + 5]           ;load src
-
-    movdqa      xmm1, xmm0
-    movdqa      xmm6, xmm0
-    movdqa      xmm7, xmm0
-    movdqa      xmm2, xmm0
-    movdqa      xmm5, xmm0
-    movdqa      xmm3, xmm0
-    movdqa      xmm4, xmm0
-
-    psrldq      xmm1, 1
-    psrldq      xmm6, 6
-    psrldq      xmm7, 7
-    psrldq      xmm2, 2
-    psrldq      xmm5, 5
-    psrldq      xmm3, 3
-    psrldq      xmm4, 4
-
-    APPLY_FILTER_8 0, 8
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-    jnz         .loop
-
-    add rsp, 16 * 10
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
deleted file mode 100644
index 90dd55a4be..0000000000
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
+++ /dev/null
@@ -1,295 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "aom_ports/x86_abi_support.asm"
-
-%macro GET_PARAM_4 0
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm3, [rdx]                 ;load filters
-    pshuflw     xmm4, xmm3, 11111111b       ;k3
-    psrldq      xmm3, 8
-    pshuflw     xmm3, xmm3, 0b              ;k4
-    punpcklqdq  xmm4, xmm3                  ;k3k4
-
-    movq        xmm3, rcx                   ;rounding
-    pshufd      xmm3, xmm3, 0
-
-    pxor        xmm2, xmm2
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-%endm
-
-%macro APPLY_FILTER_4 1
-
-    punpckldq   xmm0, xmm1                  ;two row in one register
-    punpcklbw   xmm0, xmm2                  ;unpack to word
-    pmullw      xmm0, xmm4                  ;multiply the filter factors
-
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 8
-    paddsw      xmm0, xmm1
-
-    paddsw      xmm0, xmm3                  ;rounding
-    psraw       xmm0, 7                     ;shift
-    packuswb    xmm0, xmm0                  ;pack to byte
-
-%if %1
-    movd        xmm1, [rdi]
-    pavgb       xmm0, xmm1
-%endif
-
-    movd        [rdi], xmm0
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-%endm
-
-%macro GET_PARAM 0
-    mov         rdx, arg(5)                 ;filter ptr
-    mov         rsi, arg(0)                 ;src_ptr
-    mov         rdi, arg(2)                 ;output_ptr
-    mov         rcx, 0x0400040
-
-    movdqa      xmm7, [rdx]                 ;load filters
-
-    pshuflw     xmm6, xmm7, 11111111b       ;k3
-    pshufhw     xmm7, xmm7, 0b              ;k4
-    punpcklwd   xmm6, xmm6
-    punpckhwd   xmm7, xmm7
-
-    movq        xmm4, rcx                   ;rounding
-    pshufd      xmm4, xmm4, 0
-
-    pxor        xmm5, xmm5
-
-    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
-    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
-    movsxd      rcx, DWORD PTR arg(4)       ;output_height
-%endm
-
-%macro APPLY_FILTER_8 1
-    punpcklbw   xmm0, xmm5
-    punpcklbw   xmm1, xmm5
-
-    pmullw      xmm0, xmm6
-    pmullw      xmm1, xmm7
-    paddsw      xmm0, xmm1
-    paddsw      xmm0, xmm4                  ;rounding
-    psraw       xmm0, 7                     ;shift
-    packuswb    xmm0, xmm0                  ;pack back to byte
-%if %1
-    movq        xmm1, [rdi]
-    pavgb       xmm0, xmm1
-%endif
-    movq        [rdi], xmm0                 ;store the result
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-%endm
-
-%macro APPLY_FILTER_16 1
-    punpcklbw   xmm0, xmm5
-    punpcklbw   xmm1, xmm5
-    punpckhbw   xmm2, xmm5
-    punpckhbw   xmm3, xmm5
-
-    pmullw      xmm0, xmm6
-    pmullw      xmm1, xmm7
-    pmullw      xmm2, xmm6
-    pmullw      xmm3, xmm7
-
-    paddsw      xmm0, xmm1
-    paddsw      xmm2, xmm3
-
-    paddsw      xmm0, xmm4                  ;rounding
-    paddsw      xmm2, xmm4
-    psraw       xmm0, 7                     ;shift
-    psraw       xmm2, 7
-    packuswb    xmm0, xmm2                  ;pack back to byte
-%if %1
-    movdqu      xmm1, [rdi]
-    pavgb       xmm0, xmm1
-%endif
-    movdqu      [rdi], xmm0                 ;store the result
-
-    lea         rsi, [rsi + rax]
-    lea         rdi, [rdi + rdx]
-    dec         rcx
-%endm
-
-SECTION .text
-
-globalsym(aom_filter_block1d4_v2_sse2)
-sym(aom_filter_block1d4_v2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM_4
-.loop:
-    movd        xmm0, [rsi]                 ;load src
-    movd        xmm1, [rsi + rax]
-
-    APPLY_FILTER_4 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-globalsym(aom_filter_block1d8_v2_sse2)
-sym(aom_filter_block1d8_v2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movq        xmm0, [rsi]                 ;0
-    movq        xmm1, [rsi + rax]           ;1
-
-    APPLY_FILTER_8 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-globalsym(aom_filter_block1d16_v2_sse2)
-sym(aom_filter_block1d16_v2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu        xmm0, [rsi]               ;0
-    movdqu        xmm1, [rsi + rax]         ;1
-    movdqa        xmm2, xmm0
-    movdqa        xmm3, xmm1
-
-    APPLY_FILTER_16 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-globalsym(aom_filter_block1d4_h2_sse2)
-sym(aom_filter_block1d4_h2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM_4
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 1
-
-    APPLY_FILTER_4 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-globalsym(aom_filter_block1d8_h2_sse2)
-sym(aom_filter_block1d8_h2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu      xmm0, [rsi]                 ;load src
-    movdqa      xmm1, xmm0
-    psrldq      xmm1, 1
-
-    APPLY_FILTER_8 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-globalsym(aom_filter_block1d16_h2_sse2)
-sym(aom_filter_block1d16_h2_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    GET_PARAM
-.loop:
-    movdqu      xmm0,   [rsi]               ;load src
-    movdqu      xmm1,   [rsi + 1]
-    movdqa      xmm2, xmm0
-    movdqa      xmm3, xmm1
-
-    APPLY_FILTER_16 0
-    jnz         .loop
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c b/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c
index 9ab9143eee..0b552b704b 100644
--- a/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c
+++ b/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c
@@ -133,7 +133,7 @@ unsigned int aom_avg_8x8_sse2(const uint8_t *s, int p) {
   return (avg + 32) >> 6;
 }
 
-void calc_avg_8x8_dual_sse2(const uint8_t *s, int p, int *avg) {
+static void calc_avg_8x8_dual_sse2(const uint8_t *s, int p, int *avg) {
   __m128i sum0, sum1, s0, s1, s2, s3, u0;
   u0 = _mm_setzero_si128();
   s0 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s)), u0);
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
index 7ee8ba330e..e1db3b950c 100644
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
@@ -30,6 +30,7 @@
 #define SUB_EPI16 _mm_sub_epi16
 #endif
 
+#if defined(FDCT4x4_2D_HELPER)
 static void FDCT4x4_2D_HELPER(const int16_t *input, int stride, __m128i *in0,
                               __m128i *in1) {
   // Constants
@@ -185,7 +186,9 @@ static void FDCT4x4_2D_HELPER(const int16_t *input, int stride, __m128i *in0,
     }
   }
 }
+#endif  // defined(FDCT4x4_2D_HELPER)
 
+#if defined(FDCT4x4_2D)
 void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
   // This 2D transform implements 4 vertical 1D transforms followed
   // by 4 horizontal 1D transforms.  The multiplies and adds are as given
@@ -205,13 +208,16 @@ void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
   storeu_output(&in0, output + 0 * 4);
   storeu_output(&in1, output + 2 * 4);
 }
+#endif  // defined(FDCT4x4_2D)
 
+#if defined(FDCT4x4_2D_LP)
 void FDCT4x4_2D_LP(const int16_t *input, int16_t *output, int stride) {
   __m128i in0, in1;
   FDCT4x4_2D_HELPER(input, stride, &in0, &in1);
   _mm_storeu_si128((__m128i *)(output + 0 * 4), in0);
   _mm_storeu_si128((__m128i *)(output + 2 * 4), in1);
 }
+#endif  // defined(FDCT4x4_2D_LP)
 
 #if CONFIG_INTERNAL_STATS
 void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c
index b4ff91d856..21e9e8b282 100644
--- a/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c
@@ -618,9 +618,9 @@ static uint32_t aom_highbd_var_filter_block2d_bil_avx2(
   return (var > 0) ? var : 0;
 }
 
-void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride,
-                                const uint16_t *ref, int ref_stride,
-                                uint32_t *sse, int *sum) {
+static void highbd_calc8x8var_avx2(const uint16_t *src, int src_stride,
+                                   const uint16_t *ref, int ref_stride,
+                                   uint32_t *sse, int *sum) {
   __m256i v_sum_d = _mm256_setzero_si256();
   __m256i v_sse_d = _mm256_setzero_si256();
   for (int i = 0; i < 8; i += 2) {
@@ -653,9 +653,9 @@ void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride,
   *sse = _mm_extract_epi32(v_d, 1);
 }
 
-void aom_highbd_calc16x16var_avx2(const uint16_t *src, int src_stride,
-                                  const uint16_t *ref, int ref_stride,
-                                  uint32_t *sse, int *sum) {
+static void highbd_calc16x16var_avx2(const uint16_t *src, int src_stride,
+                                     const uint16_t *ref, int ref_stride,
+                                     uint32_t *sse, int *sum) {
   __m256i v_sum_d = _mm256_setzero_si256();
   __m256i v_sse_d = _mm256_setzero_si256();
   const __m256i one = _mm256_set1_epi16(1);
@@ -703,19 +703,19 @@ static void highbd_10_variance_avx2(const uint16_t *src, int src_stride,
   *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
 }
 
-#define VAR_FN(w, h, block_size, shift)                                    \
-  uint32_t aom_highbd_10_variance##w##x##h##_avx2(                         \
-      const uint8_t *src8, int src_stride, const uint8_t *ref8,            \
-      int ref_stride, uint32_t *sse) {                                     \
-    int sum;                                                               \
-    int64_t var;                                                           \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                             \
-    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                             \
-    highbd_10_variance_avx2(                                               \
-        src, src_stride, ref, ref_stride, w, h, sse, &sum,                 \
-        aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \
-    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift);               \
-    return (var >= 0) ? (uint32_t)var : 0;                                 \
+#define VAR_FN(w, h, block_size, shift)                                        \
+  uint32_t aom_highbd_10_variance##w##x##h##_avx2(                             \
+      const uint8_t *src8, int src_stride, const uint8_t *ref8,                \
+      int ref_stride, uint32_t *sse) {                                         \
+    int sum;                                                                   \
+    int64_t var;                                                               \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
+    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                 \
+    highbd_10_variance_avx2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+                            highbd_calc##block_size##x##block_size##var_avx2,  \
+                            block_size);                                       \
+    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift);                   \
+    return (var >= 0) ? (uint32_t)var : 0;                                     \
   }
 
 VAR_FN(128, 128, 16, 14)
@@ -741,6 +741,17 @@ VAR_FN(8, 32, 8, 8)
 
 #undef VAR_FN
 
+unsigned int aom_highbd_10_mse16x16_avx2(const uint8_t *src8, int src_stride,
+                                         const uint8_t *ref8, int ref_stride,
+                                         unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_10_variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+                          highbd_calc16x16var_avx2, 16);
+  return *sse;
+}
+
 #define SSE2_HEIGHT(H)                                                 \
   uint32_t aom_highbd_10_sub_pixel_variance8x##H##_sse2(               \
       const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
@@ -749,7 +760,7 @@ VAR_FN(8, 32, 8, 8)
 SSE2_HEIGHT(8)
 SSE2_HEIGHT(16)
 
-#undef SSE2_Height
+#undef SSE2_HEIGHT
 
 #define HIGHBD_SUBPIX_VAR(W, H)                                              \
   uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_avx2(                 \
@@ -782,8 +793,8 @@ HIGHBD_SUBPIX_VAR(8, 8)
 
 #undef HIGHBD_SUBPIX_VAR
 
-uint64_t aom_mse_4xh_16bit_highbd_avx2(uint16_t *dst, int dstride,
-                                       uint16_t *src, int sstride, int h) {
+static uint64_t mse_4xh_16bit_highbd_avx2(uint16_t *dst, int dstride,
+                                          uint16_t *src, int sstride, int h) {
   uint64_t sum = 0;
   __m128i reg0_4x16, reg1_4x16, reg2_4x16, reg3_4x16;
   __m256i src0_8x16, src1_8x16, src_16x16;
@@ -840,8 +851,8 @@ uint64_t aom_mse_4xh_16bit_highbd_avx2(uint16_t *dst, int dstride,
   return sum;
 }
 
-uint64_t aom_mse_8xh_16bit_highbd_avx2(uint16_t *dst, int dstride,
-                                       uint16_t *src, int sstride, int h) {
+static uint64_t mse_8xh_16bit_highbd_avx2(uint16_t *dst, int dstride,
+                                          uint16_t *src, int sstride, int h) {
   uint64_t sum = 0;
   __m256i src0_8x16, src1_8x16, src_16x16;
   __m256i dst0_8x16, dst1_8x16, dst_16x16;
@@ -897,8 +908,8 @@ uint64_t aom_mse_wxh_16bit_highbd_avx2(uint16_t *dst, int dstride,
   assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
          "w=8/4 and h=8/4 must satisfy");
   switch (w) {
-    case 4: return aom_mse_4xh_16bit_highbd_avx2(dst, dstride, src, sstride, h);
-    case 8: return aom_mse_8xh_16bit_highbd_avx2(dst, dstride, src, sstride, h);
+    case 4: return mse_4xh_16bit_highbd_avx2(dst, dstride, src, sstride, h);
+    case 8: return mse_8xh_16bit_highbd_avx2(dst, dstride, src, sstride, h);
     default: assert(0 && "unsupported width"); return -1;
   }
 }
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
index e897aab645..2fc2e1c0dd 100644
--- a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
@@ -637,8 +637,8 @@ void aom_highbd_dist_wtd_comp_avg_pred_sse2(
   }
 }
 
-uint64_t aom_mse_4xh_16bit_highbd_sse2(uint16_t *dst, int dstride,
-                                       uint16_t *src, int sstride, int h) {
+static uint64_t mse_4xh_16bit_highbd_sse2(uint16_t *dst, int dstride,
+                                          uint16_t *src, int sstride, int h) {
   uint64_t sum = 0;
   __m128i reg0_4x16, reg1_4x16;
   __m128i src_8x16;
@@ -682,8 +682,8 @@ uint64_t aom_mse_4xh_16bit_highbd_sse2(uint16_t *dst, int dstride,
   return sum;
 }
 
-uint64_t aom_mse_8xh_16bit_highbd_sse2(uint16_t *dst, int dstride,
-                                       uint16_t *src, int sstride, int h) {
+static uint64_t mse_8xh_16bit_highbd_sse2(uint16_t *dst, int dstride,
+                                          uint16_t *src, int sstride, int h) {
   uint64_t sum = 0;
   __m128i src_8x16;
   __m128i dst_8x16;
@@ -728,8 +728,8 @@ uint64_t aom_mse_wxh_16bit_highbd_sse2(uint16_t *dst, int dstride,
   assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
          "w=8/4 and h=8/4 must satisfy");
   switch (w) {
-    case 4: return aom_mse_4xh_16bit_highbd_sse2(dst, dstride, src, sstride, h);
-    case 8: return aom_mse_8xh_16bit_highbd_sse2(dst, dstride, src, sstride, h);
+    case 4: return mse_4xh_16bit_highbd_sse2(dst, dstride, src, sstride, h);
+    case 8: return mse_8xh_16bit_highbd_sse2(dst, dstride, src, sstride, h);
     default: assert(0 && "unsupported width"); return -1;
   }
 }
diff --git a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c
index fd48260c6f..869f880bda 100644
--- a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c
@@ -940,10 +940,10 @@ static AOM_FORCE_INLINE __m128i cvtepu16_epi32(__m128i x) {
   return _mm_unpacklo_epi16((x), _mm_setzero_si128());
 }
 
-void smooth_predictor_wxh(uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
-                          const uint8_t *LIBAOM_RESTRICT top_row,
-                          const uint8_t *LIBAOM_RESTRICT left_column, int width,
-                          int height) {
+static void smooth_predictor_wxh(uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+                                 const uint8_t *LIBAOM_RESTRICT top_row,
+                                 const uint8_t *LIBAOM_RESTRICT left_column,
+                                 int width, int height) {
   const uint8_t *const sm_weights_h = smooth_weights + height - 4;
   const uint8_t *const sm_weights_w = smooth_weights + width - 4;
   const __m128i zero = _mm_setzero_si128();
diff --git a/third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c
index 799ce9ef44..d96a9dd23d 100644
--- a/third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c
@@ -103,11 +103,12 @@ static INLINE void masked_sadx4d_ssse3(const uint8_t *src_ptr, int src_stride,
   pred = _mm_packus_epi16(pred_l, pred_r);                                     \
   res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src));
 
-void aom_masked_sad8xhx4d_ssse3(const uint8_t *src_ptr, int src_stride,
-                                const uint8_t *ref_array[4], int a_stride,
-                                const uint8_t *b_ptr, int b_stride,
-                                const uint8_t *m_ptr, int m_stride, int height,
-                                int inv_mask, unsigned sad_array[4]) {
+static void masked_sad8xhx4d_ssse3(const uint8_t *src_ptr, int src_stride,
+                                   const uint8_t *ref_array[4], int a_stride,
+                                   const uint8_t *b_ptr, int b_stride,
+                                   const uint8_t *m_ptr, int m_stride,
+                                   int height, int inv_mask,
+                                   unsigned sad_array[4]) {
   const uint8_t *ref0 = ref_array[0];
   const uint8_t *ref1 = ref_array[1];
   const uint8_t *ref2 = ref_array[2];
@@ -164,11 +165,12 @@ void aom_masked_sad8xhx4d_ssse3(const uint8_t *src_ptr, int src_stride,
   pred = _mm_packus_epi16(pred, _mm_setzero_si128());                     \
   res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src));
 
-void aom_masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride,
-                                const uint8_t *ref_array[4], int a_stride,
-                                const uint8_t *b_ptr, int b_stride,
-                                const uint8_t *m_ptr, int m_stride, int height,
-                                int inv_mask, unsigned sad_array[4]) {
+static void masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride,
+                                   const uint8_t *ref_array[4], int a_stride,
+                                   const uint8_t *b_ptr, int b_stride,
+                                   const uint8_t *m_ptr, int m_stride,
+                                   int height, int inv_mask,
+                                   unsigned sad_array[4]) {
   const uint8_t *ref0 = ref_array[0];
   const uint8_t *ref1 = ref_array[1];
   const uint8_t *ref2 = ref_array[2];
@@ -224,22 +226,22 @@ void aom_masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride,
                         msk_stride, m, n, inv_mask, sad_array);                \
   }
 
-#define MASKSAD8XN_SSSE3(n)                                                   \
-  void aom_masked_sad8x##n##x4d_ssse3(                                        \
-      const uint8_t *src, int src_stride, const uint8_t *ref[4],              \
-      int ref_stride, const uint8_t *second_pred, const uint8_t *msk,         \
-      int msk_stride, int inv_mask, unsigned sad_array[4]) {                  \
-    aom_masked_sad8xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, \
-                               8, msk, msk_stride, n, inv_mask, sad_array);   \
+#define MASKSAD8XN_SSSE3(n)                                                  \
+  void aom_masked_sad8x##n##x4d_ssse3(                                       \
+      const uint8_t *src, int src_stride, const uint8_t *ref[4],             \
+      int ref_stride, const uint8_t *second_pred, const uint8_t *msk,        \
+      int msk_stride, int inv_mask, unsigned sad_array[4]) {                 \
+    masked_sad8xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, 8, \
+                           msk, msk_stride, n, inv_mask, sad_array);         \
   }
 
-#define MASKSAD4XN_SSSE3(n)                                                   \
-  void aom_masked_sad4x##n##x4d_ssse3(                                        \
-      const uint8_t *src, int src_stride, const uint8_t *ref[4],              \
-      int ref_stride, const uint8_t *second_pred, const uint8_t *msk,         \
-      int msk_stride, int inv_mask, unsigned sad_array[4]) {                  \
-    aom_masked_sad4xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, \
-                               4, msk, msk_stride, n, inv_mask, sad_array);   \
+#define MASKSAD4XN_SSSE3(n)                                                  \
+  void aom_masked_sad4x##n##x4d_ssse3(                                       \
+      const uint8_t *src, int src_stride, const uint8_t *ref[4],             \
+      int ref_stride, const uint8_t *second_pred, const uint8_t *msk,        \
+      int msk_stride, int inv_mask, unsigned sad_array[4]) {                 \
+    masked_sad4xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, 4, \
+                           msk, msk_stride, n, inv_mask, sad_array);         \
   }
 
 MASKSADMXN_SSSE3(128, 128)
diff --git a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm b/third_party/aom/aom_dsp/x86/subpel_variance_ssse3.asm
index d1d8373456..f424ce01dd 100644
--- a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/subpel_variance_ssse3.asm
@@ -15,21 +15,6 @@
 
 SECTION_RODATA
 pw_8: times  8 dw  8
-bilin_filter_m_sse2: times  8 dw 16
-                     times  8 dw  0
-                     times  8 dw 14
-                     times  8 dw  2
-                     times  8 dw 12
-                     times  8 dw  4
-                     times  8 dw 10
-                     times  8 dw  6
-                     times 16 dw  8
-                     times  8 dw  6
-                     times  8 dw 10
-                     times  8 dw  4
-                     times  8 dw 12
-                     times  8 dw  2
-                     times  8 dw 14
 
 bilin_filter_m_ssse3: times  8 db 16,  0
                       times  8 db 14,  2
@@ -109,9 +94,6 @@ SECTION .text
 %if cpuflag(ssse3)
 %define bilin_filter_m bilin_filter_m_ssse3
 %define filter_idx_shift 4
-%else
-%define bilin_filter_m bilin_filter_m_sse2
-%define filter_idx_shift 5
 %endif
 ; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
 ; 11, not 13, if the registers are ordered correctly. May make a minor speed
@@ -1449,21 +1431,11 @@ SECTION .text
 ; location in the sse/2 version, rather than duplicating that code in the
 ; binary.
 
-INIT_XMM sse2
-SUBPEL_VARIANCE  4
-SUBPEL_VARIANCE  8
-SUBPEL_VARIANCE 16
-
 INIT_XMM ssse3
 SUBPEL_VARIANCE  4
 SUBPEL_VARIANCE  8
 SUBPEL_VARIANCE 16
 
-INIT_XMM sse2
-SUBPEL_VARIANCE  4, 1
-SUBPEL_VARIANCE  8, 1
-SUBPEL_VARIANCE 16, 1
-
 INIT_XMM ssse3
 SUBPEL_VARIANCE  4, 1
 SUBPEL_VARIANCE  8, 1
diff --git a/third_party/aom/aom_dsp/x86/synonyms.h b/third_party/aom/aom_dsp/x86/synonyms.h
index 6744ec51d0..74318de2e5 100644
--- a/third_party/aom/aom_dsp/x86/synonyms.h
+++ b/third_party/aom/aom_dsp/x86/synonyms.h
@@ -46,6 +46,25 @@ static INLINE __m128i xx_loadu_128(const void *a) {
   return _mm_loadu_si128((const __m128i *)a);
 }
 
+
+// _mm_loadu_si64 has been introduced in GCC 9, reimplement the function
+// manually on older compilers.
+#if !defined(__clang__) && __GNUC_MAJOR__ < 9
+static INLINE __m128i xx_loadu_2x64(const void *hi, const void *lo) {
+  __m64 hi_, lo_;
+  memcpy(&hi_, hi, sizeof(hi_));
+  memcpy(&lo_, lo, sizeof(lo_));
+  return _mm_set_epi64(hi_, lo_);
+}
+#else
+// Load 64 bits from each of hi and low, and pack into an SSE register
+// Since directly loading as `int64_t`s and using _mm_set_epi64 may violate
+// the strict aliasing rule, this takes a different approach
+static INLINE __m128i xx_loadu_2x64(const void *hi, const void *lo) {
+  return _mm_unpacklo_epi64(_mm_loadu_si64(lo), _mm_loadu_si64(hi));
+}
+#endif
+
 static INLINE void xx_storel_32(void *const a, const __m128i v) {
   const int val = _mm_cvtsi128_si32(v);
   memcpy(a, &val, sizeof(val));
diff --git a/third_party/aom/aom_dsp/x86/synonyms_avx2.h b/third_party/aom/aom_dsp/x86/synonyms_avx2.h
index b729e5f410..7548d4d4f4 100644
--- a/third_party/aom/aom_dsp/x86/synonyms_avx2.h
+++ b/third_party/aom/aom_dsp/x86/synonyms_avx2.h
@@ -43,6 +43,16 @@ static INLINE void yy_storeu_256(void *const a, const __m256i v) {
   _mm256_storeu_si256((__m256i *)a, v);
 }
 
+// Fill an AVX register using an interleaved pair of values, ie. set the
+// 16 channels to {a, b} repeated 8 times, using the same channel ordering
+// as when a register is stored to / loaded from memory.
+//
+// This is useful for rearranging filter kernels for use with the _mm_madd_epi16
+// instruction
+static INLINE __m256i yy_set2_epi16(int16_t a, int16_t b) {
+  return _mm256_setr_epi16(a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b);
+}
+
 // The _mm256_set1_epi64x() intrinsic is undefined for some Visual Studio
 // compilers. The following function is equivalent to _mm256_set1_epi64x()
 // acting on a 32-bit integer.
@@ -61,11 +71,26 @@ static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) {
   return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
 }
 
+#define GCC_VERSION (__GNUC__ * 10000 \
+                     + __GNUC_MINOR__ * 100 \
+                     + __GNUC_PATCHLEVEL__)
+
+// _mm256_loadu2_m128i has been introduced in GCC 10.1
+#if !defined(__clang__) && GCC_VERSION < 101000
+static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) {
+  __m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
+  __m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
+  return _mm256_set_m128i(mhi, mlo);
+}
+#else
 static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) {
   __m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
   __m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
   return yy_set_m128i(mhi, mlo);
 }
+#endif
+
+#undef GCC_VERSION
 
 static INLINE void yy_storeu2_128(void *hi, void *lo, const __m256i a) {
   _mm_storeu_si128((__m128i *)hi, _mm256_extracti128_si256(a, 1));
diff --git a/third_party/aom/aom_dsp/x86/variance_avx2.c b/third_party/aom/aom_dsp/x86/variance_avx2.c
index 046d6f10f8..0f872fc392 100644
--- a/third_party/aom/aom_dsp/x86/variance_avx2.c
+++ b/third_party/aom/aom_dsp/x86/variance_avx2.c
@@ -518,8 +518,8 @@ void aom_highbd_comp_mask_pred_avx2(uint8_t *comp_pred8, const uint8_t *pred8,
   }
 }
 
-uint64_t aom_mse_4xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
-                                int sstride, int h) {
+static uint64_t mse_4xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
+                                   int sstride, int h) {
   uint64_t sum = 0;
   __m128i dst0_4x8, dst1_4x8, dst2_4x8, dst3_4x8, dst_16x8;
   __m128i src0_4x16, src1_4x16, src2_4x16, src3_4x16;
@@ -575,8 +575,9 @@ uint64_t aom_mse_4xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
 // In src buffer, each 4x4 block in a 32x32 filter block is stored sequentially.
 // Hence src_blk_stride is same as block width. Whereas dst buffer is a frame
 // buffer, thus dstride is a frame level stride.
-uint64_t aom_mse_4xh_quad_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
-                                     int src_blk_stride, int h) {
+static uint64_t mse_4xh_quad_16bit_avx2(uint8_t *dst, int dstride,
+                                        uint16_t *src, int src_blk_stride,
+                                        int h) {
   uint64_t sum = 0;
   __m128i dst0_16x8, dst1_16x8, dst2_16x8, dst3_16x8;
   __m256i dst0_16x16, dst1_16x16, dst2_16x16, dst3_16x16;
@@ -665,8 +666,8 @@ uint64_t aom_mse_4xh_quad_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
   return sum;
 }
 
-uint64_t aom_mse_8xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
-                                int sstride, int h) {
+static uint64_t mse_8xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
+                                   int sstride, int h) {
   uint64_t sum = 0;
   __m128i dst0_8x8, dst1_8x8, dst3_16x8;
   __m256i src0_8x16, src1_8x16, src_16x16, dst_16x16;
@@ -715,8 +716,9 @@ uint64_t aom_mse_8xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
 // In src buffer, each 8x8 block in a 64x64 filter block is stored sequentially.
 // Hence src_blk_stride is same as block width. Whereas dst buffer is a frame
 // buffer, thus dstride is a frame level stride.
-uint64_t aom_mse_8xh_dual_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
-                                     int src_blk_stride, int h) {
+static uint64_t mse_8xh_dual_16bit_avx2(uint8_t *dst, int dstride,
+                                        uint16_t *src, int src_blk_stride,
+                                        int h) {
   uint64_t sum = 0;
   __m128i dst0_16x8, dst1_16x8;
   __m256i dst0_16x16, dst1_16x16;
@@ -780,8 +782,8 @@ uint64_t aom_mse_wxh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
   assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
          "w=8/4 and h=8/4 must be satisfied");
   switch (w) {
-    case 4: return aom_mse_4xh_16bit_avx2(dst, dstride, src, sstride, h);
-    case 8: return aom_mse_8xh_16bit_avx2(dst, dstride, src, sstride, h);
+    case 4: return mse_4xh_16bit_avx2(dst, dstride, src, sstride, h);
+    case 8: return mse_8xh_16bit_avx2(dst, dstride, src, sstride, h);
     default: assert(0 && "unsupported width"); return -1;
   }
 }
@@ -795,8 +797,8 @@ uint64_t aom_mse_16xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
   assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
          "w=8/4 and h=8/4 must be satisfied");
   switch (w) {
-    case 4: return aom_mse_4xh_quad_16bit_avx2(dst, dstride, src, w * h, h);
-    case 8: return aom_mse_8xh_dual_16bit_avx2(dst, dstride, src, w * h, h);
+    case 4: return mse_4xh_quad_16bit_avx2(dst, dstride, src, w * h, h);
+    case 8: return mse_8xh_dual_16bit_avx2(dst, dstride, src, w * h, h);
     default: assert(0 && "unsupported width"); return -1;
   }
 }
diff --git a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c
index 9e9e70ea01..57a1cee781 100644
--- a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c
+++ b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c
@@ -648,7 +648,7 @@ MAKE_SUB_PIXEL_VAR_16XH(4, 2)
 #endif
 
 #define MAKE_SUB_PIXEL_AVG_VAR_32XH(height, log2height)                       \
-  int aom_sub_pixel_avg_variance32x##height##_imp_avx2(                       \
+  static int sub_pixel_avg_variance32x##height##_imp_avx2(                    \
       const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
       const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, \
       unsigned int *sse) {                                                    \
@@ -876,7 +876,7 @@ MAKE_SUB_PIXEL_VAR_16XH(4, 2)
       const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
       const uint8_t *dst, int dst_stride, unsigned int *sse,                  \
       const uint8_t *sec_ptr) {                                               \
-    const int sum = aom_sub_pixel_avg_variance32x##height##_imp_avx2(         \
+    const int sum = sub_pixel_avg_variance32x##height##_imp_avx2(             \
         src, src_stride, x_offset, y_offset, dst, dst_stride, sec_ptr, 32,    \
         sse);                                                                 \
     return *sse - (unsigned int)(((int64_t)sum * sum) >> (5 + log2height));   \
@@ -899,7 +899,7 @@ MAKE_SUB_PIXEL_AVG_VAR_32XH(16, 4)
       const uint8_t *sec_ptr = sec;                                       \
       for (int j = 0; j < (h / hf); ++j) {                                \
         unsigned int sse2;                                                \
-        const int se2 = aom_sub_pixel_avg_variance##wf##x##hf##_imp_avx2( \
+        const int se2 = sub_pixel_avg_variance##wf##x##hf##_imp_avx2(     \
             src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
             sec_ptr, w, &sse2);                                           \
         dst_ptr += hf * dst_stride;                                       \
diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c
index faec9cf73d..81b30072a5 100644
--- a/third_party/aom/aom_dsp/x86/variance_sse2.c
+++ b/third_party/aom/aom_dsp/x86/variance_sse2.c
@@ -415,7 +415,6 @@ unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride,
   DECL(8, opt);    \
   DECL(16, opt)
 
-DECLS(sse2);
 DECLS(ssse3);
 #undef DECLS
 #undef DECL
@@ -492,7 +491,6 @@ DECLS(ssse3);
   FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t))
 #endif
 
-FNS(sse2)
 FNS(ssse3)
 
 #undef FNS
@@ -510,7 +508,6 @@ FNS(ssse3)
   DECL(8, opt);    \
   DECL(16, opt)
 
-DECLS(sse2);
 DECLS(ssse3);
 #undef DECL
 #undef DECLS
@@ -591,7 +588,6 @@ DECLS(ssse3);
   FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t))
 #endif
 
-FNS(sse2)
 FNS(ssse3)
 
 #undef FNS
@@ -710,8 +706,8 @@ void aom_highbd_comp_mask_pred_sse2(uint8_t *comp_pred8, const uint8_t *pred8,
   }
 }
 
-uint64_t aom_mse_4xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
-                                int sstride, int h) {
+static uint64_t mse_4xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
+                                   int sstride, int h) {
   uint64_t sum = 0;
   __m128i dst0_8x8, dst1_8x8, dst_16x8;
   __m128i src0_16x4, src1_16x4, src_16x8;
@@ -744,8 +740,8 @@ uint64_t aom_mse_4xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
   return sum;
 }
 
-uint64_t aom_mse_8xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
-                                int sstride, int h) {
+static uint64_t mse_8xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
+                                   int sstride, int h) {
   uint64_t sum = 0;
   __m128i dst_8x8, dst_16x8;
   __m128i src_16x8;
@@ -781,8 +777,8 @@ uint64_t aom_mse_wxh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
   assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
          "w=8/4 and h=8/4 must satisfy");
   switch (w) {
-    case 4: return aom_mse_4xh_16bit_sse2(dst, dstride, src, sstride, h);
-    case 8: return aom_mse_8xh_16bit_sse2(dst, dstride, src, sstride, h);
+    case 4: return mse_4xh_16bit_sse2(dst, dstride, src, sstride, h);
+    case 8: return mse_8xh_16bit_sse2(dst, dstride, src, sstride, h);
     default: assert(0 && "unsupported width"); return -1;
   }
 }
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-05-15 03:35:49 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2024-05-15 03:35:49 +0000
commit	d8bbc7858622b6d9c278469aab701ca0b609cddf (patch)
tree	eff41dc61d9f714852212739e6b3738b82a2af87 /third_party/aom/aom_dsp/x86
parent	Releasing progress-linux version 125.0.3-1~progress7.99u1. (diff)
download	firefox-d8bbc7858622b6d9c278469aab701ca0b609cddf.tar.xz firefox-d8bbc7858622b6d9c278469aab701ca0b609cddf.zip