summaryrefslogtreecommitdiffstats
path: root/third_party/aom/aom_dsp/x86
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-15 03:34:50 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-05-15 03:34:50 +0000
commitdef92d1b8e9d373e2f6f27c366d578d97d8960c6 (patch)
tree2ef34b9ad8bb9a9220e05d60352558b15f513894 /third_party/aom/aom_dsp/x86
parentAdding debian version 125.0.3-1. (diff)
downloadfirefox-def92d1b8e9d373e2f6f27c366d578d97d8960c6.tar.xz
firefox-def92d1b8e9d373e2f6f27c366d578d97d8960c6.zip
Merging upstream version 126.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/aom/aom_dsp/x86')
-rw-r--r--third_party/aom/aom_dsp/x86/aom_asm_stubs.c34
-rw-r--r--third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c569
-rw-r--r--third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm615
-rw-r--r--third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm295
-rw-r--r--third_party/aom/aom_dsp/x86/avg_intrin_sse2.c2
-rw-r--r--third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h6
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_variance_avx2.c63
-rw-r--r--third_party/aom/aom_dsp/x86/highbd_variance_sse2.c12
-rw-r--r--third_party/aom/aom_dsp/x86/intrapred_ssse3.c8
-rw-r--r--third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c50
-rw-r--r--third_party/aom/aom_dsp/x86/subpel_variance_ssse3.asm (renamed from third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm)28
-rw-r--r--third_party/aom/aom_dsp/x86/synonyms.h19
-rw-r--r--third_party/aom/aom_dsp/x86/synonyms_avx2.h25
-rw-r--r--third_party/aom/aom_dsp/x86/variance_avx2.c26
-rw-r--r--third_party/aom/aom_dsp/x86/variance_impl_avx2.c6
-rw-r--r--third_party/aom/aom_dsp/x86/variance_sse2.c16
16 files changed, 147 insertions, 1627 deletions
diff --git a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
index b08ec2546b..6c7fdd6eb1 100644
--- a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
+++ b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
@@ -15,40 +15,6 @@
#include "aom_dsp/x86/convolve.h"
#if HAVE_SSE2
-filter8_1dfunction aom_filter_block1d16_v8_sse2;
-filter8_1dfunction aom_filter_block1d16_h8_sse2;
-filter8_1dfunction aom_filter_block1d8_v8_sse2;
-filter8_1dfunction aom_filter_block1d8_h8_sse2;
-filter8_1dfunction aom_filter_block1d4_v8_sse2;
-filter8_1dfunction aom_filter_block1d4_h8_sse2;
-filter8_1dfunction aom_filter_block1d16_v4_sse2;
-filter8_1dfunction aom_filter_block1d16_h4_sse2;
-
-filter8_1dfunction aom_filter_block1d8_h4_sse2;
-filter8_1dfunction aom_filter_block1d8_v4_sse2;
-filter8_1dfunction aom_filter_block1d4_h4_sse2;
-filter8_1dfunction aom_filter_block1d4_v4_sse2;
-
-filter8_1dfunction aom_filter_block1d16_v2_sse2;
-filter8_1dfunction aom_filter_block1d16_h2_sse2;
-filter8_1dfunction aom_filter_block1d8_v2_sse2;
-filter8_1dfunction aom_filter_block1d8_h2_sse2;
-filter8_1dfunction aom_filter_block1d4_v2_sse2;
-filter8_1dfunction aom_filter_block1d4_h2_sse2;
-
-// void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-// void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2)
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2)
-
#if CONFIG_AV1_HIGHBITDEPTH
highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2;
highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2;
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c
deleted file mode 100644
index 5c36b68727..0000000000
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c
+++ /dev/null
@@ -1,569 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h> // SSE2
-
-#include "config/aom_dsp_rtcd.h"
-#include "aom_dsp/x86/convolve.h"
-#include "aom_ports/mem.h"
-
-void aom_filter_block1d16_h4_sse2(const uint8_t *src_ptr,
- ptrdiff_t src_pixels_per_line,
- uint8_t *output_ptr, ptrdiff_t output_pitch,
- uint32_t output_height,
- const int16_t *filter) {
- __m128i filtersReg;
- __m128i addFilterReg32;
- __m128i secondFilters, thirdFilters;
- __m128i srcRegFilt32b1_1, srcRegFilt32b1_2, srcRegFilt32b2_1,
- srcRegFilt32b2_2;
- __m128i srcReg32b1, srcReg32b2;
- unsigned int i;
- src_ptr -= 3;
- addFilterReg32 = _mm_set1_epi16(32);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
- filtersReg = _mm_srai_epi16(filtersReg, 1);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
-
- secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3
- thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5
-
- for (i = output_height; i > 0; i -= 1) {
- srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
-
- __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2);
- __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4);
- __m128i ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
- __m128i ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
- __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters);
- __m128i d2 = _mm_madd_epi16(ss_2_1, thirdFilters);
- srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);
-
- __m128i ss_1 = _mm_srli_si128(srcReg32b1, 3);
- __m128i ss_3 = _mm_srli_si128(srcReg32b1, 5);
- __m128i ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128());
- __m128i ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
- d1 = _mm_madd_epi16(ss_1_2, secondFilters);
- d2 = _mm_madd_epi16(ss_2_2, thirdFilters);
- srcRegFilt32b1_2 = _mm_add_epi32(d1, d2);
-
- __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
- __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
- srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi);
-
- // reading stride of the next 16 bytes
- // (part of it was being read by earlier read)
- srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
-
- ss_2 = _mm_srli_si128(srcReg32b2, 2);
- ss_4 = _mm_srli_si128(srcReg32b2, 4);
- ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
- ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
- d1 = _mm_madd_epi16(ss_1_1, secondFilters);
- d2 = _mm_madd_epi16(ss_2_1, thirdFilters);
- srcRegFilt32b2_1 = _mm_add_epi32(d1, d2);
-
- ss_1 = _mm_srli_si128(srcReg32b2, 3);
- ss_3 = _mm_srli_si128(srcReg32b2, 5);
- ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128());
- ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
- d1 = _mm_madd_epi16(ss_1_2, secondFilters);
- d2 = _mm_madd_epi16(ss_2_2, thirdFilters);
- srcRegFilt32b2_2 = _mm_add_epi32(d1, d2);
-
- res_lo = _mm_unpacklo_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2);
- res_hi = _mm_unpackhi_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2);
- srcRegFilt32b2_1 = _mm_packs_epi32(res_lo, res_hi);
-
- // shift by 6 bit each 16 bit
- srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
- srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
- srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
- srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6);
-
- // shrink to 8 bit each 16 bits, the first lane contain the first
- // convolve result and the second lane contain the second convolve result
- srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
-
- src_ptr += src_pixels_per_line;
-
- _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1);
-
- output_ptr += output_pitch;
- }
-}
-
-void aom_filter_block1d16_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
- uint8_t *output_ptr, ptrdiff_t out_pitch,
- uint32_t output_height,
- const int16_t *filter) {
- __m128i filtersReg;
- __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
- __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
- __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi;
- __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
- __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi;
- __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi;
- __m128i resReg23_45, resReg34_56;
- __m128i addFilterReg32, secondFilters, thirdFilters;
- __m128i tmp_0, tmp_1;
- unsigned int i;
- ptrdiff_t src_stride, dst_stride;
-
- addFilterReg32 = _mm_set1_epi16(32);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
- filtersReg = _mm_srai_epi16(filtersReg, 1);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
-
- secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3
- thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5
-
- // multiply the size of the source and destination stride by two
- src_stride = src_pitch << 1;
- dst_stride = out_pitch << 1;
-
- srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
- srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
- srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3);
- srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3);
- __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128());
- __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128());
- __m128i resReg23_hi_1 = _mm_unpacklo_epi8(srcReg23_hi, _mm_setzero_si128());
- __m128i resReg23_hi_2 = _mm_unpackhi_epi8(srcReg23_hi, _mm_setzero_si128());
-
- srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
- srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4);
- srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4);
- __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128());
- __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128());
- __m128i resReg34_hi_1 = _mm_unpacklo_epi8(srcReg34_hi, _mm_setzero_si128());
- __m128i resReg34_hi_2 = _mm_unpackhi_epi8(srcReg34_hi, _mm_setzero_si128());
-
- for (i = output_height; i > 1; i -= 2) {
- srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
-
- srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5);
- srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5);
-
- srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
-
- srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6);
- srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6);
-
- // multiply 2 adjacent elements with the filter and add the result
-
- tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters);
- tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters);
- resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1);
-
- tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters);
- tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters);
- resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1);
-
- __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128());
- __m128i resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128());
- tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters);
- tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters);
- resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1);
-
- __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128());
- __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128());
- tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters);
- tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters);
- resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1);
-
- // add and saturate the results together
- resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo);
- resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo);
-
- // multiply 2 adjacent elements with the filter and add the result
-
- tmp_0 = _mm_madd_epi16(resReg23_hi_1, secondFilters);
- tmp_1 = _mm_madd_epi16(resReg23_hi_2, secondFilters);
- resReg23_hi = _mm_packs_epi32(tmp_0, tmp_1);
-
- tmp_0 = _mm_madd_epi16(resReg34_hi_1, secondFilters);
- tmp_1 = _mm_madd_epi16(resReg34_hi_2, secondFilters);
- resReg34_hi = _mm_packs_epi32(tmp_0, tmp_1);
-
- __m128i resReg45_hi_1 = _mm_unpacklo_epi8(srcReg45_hi, _mm_setzero_si128());
- __m128i resReg45_hi_2 = _mm_unpackhi_epi8(srcReg45_hi, _mm_setzero_si128());
- tmp_0 = _mm_madd_epi16(resReg45_hi_1, thirdFilters);
- tmp_1 = _mm_madd_epi16(resReg45_hi_2, thirdFilters);
- resReg45_hi = _mm_packs_epi32(tmp_0, tmp_1);
-
- __m128i resReg56_hi_1 = _mm_unpacklo_epi8(srcReg56_hi, _mm_setzero_si128());
- __m128i resReg56_hi_2 = _mm_unpackhi_epi8(srcReg56_hi, _mm_setzero_si128());
- tmp_0 = _mm_madd_epi16(resReg56_hi_1, thirdFilters);
- tmp_1 = _mm_madd_epi16(resReg56_hi_2, thirdFilters);
- resReg56_hi = _mm_packs_epi32(tmp_0, tmp_1);
-
- // add and saturate the results together
- resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi);
- resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi);
-
- // shift by 6 bit each 16 bit
- resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32);
- resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32);
- resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32);
- resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32);
- resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6);
- resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6);
- resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6);
- resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6);
-
- // shrink to 8 bit each 16 bits, the first lane contain the first
- // convolve result and the second lane contain the second convolve
- // result
- resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi);
- resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi);
-
- src_ptr += src_stride;
-
- _mm_store_si128((__m128i *)output_ptr, (resReg23_45));
- _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56));
-
- output_ptr += dst_stride;
-
- // save part of the registers for next strides
- resReg23_lo_1 = resReg45_lo_1;
- resReg23_lo_2 = resReg45_lo_2;
- resReg23_hi_1 = resReg45_hi_1;
- resReg23_hi_2 = resReg45_hi_2;
- resReg34_lo_1 = resReg56_lo_1;
- resReg34_lo_2 = resReg56_lo_2;
- resReg34_hi_1 = resReg56_hi_1;
- resReg34_hi_2 = resReg56_hi_2;
- srcReg4 = srcReg6;
- }
-}
-
-void aom_filter_block1d8_h4_sse2(const uint8_t *src_ptr,
- ptrdiff_t src_pixels_per_line,
- uint8_t *output_ptr, ptrdiff_t output_pitch,
- uint32_t output_height,
- const int16_t *filter) {
- __m128i filtersReg;
- __m128i addFilterReg32;
- __m128i secondFilters, thirdFilters;
- __m128i srcRegFilt32b1_1, srcRegFilt32b1_2;
- __m128i srcReg32b1;
- unsigned int i;
- src_ptr -= 3;
- addFilterReg32 = _mm_set1_epi16(32);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
- filtersReg = _mm_srai_epi16(filtersReg, 1);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
-
- secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3
- thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5
-
- for (i = output_height; i > 0; i -= 1) {
- srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
-
- __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2);
- __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4);
- ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
- ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
- __m128i d1 = _mm_madd_epi16(ss_2, secondFilters);
- __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters);
- srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);
-
- __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3);
- __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5);
- ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
- ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128());
- d1 = _mm_madd_epi16(ss_3, secondFilters);
- d2 = _mm_madd_epi16(ss_5, thirdFilters);
- srcRegFilt32b1_2 = _mm_add_epi32(d1, d2);
-
- __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
- __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
- srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi);
-
- // shift by 6 bit each 16 bit
- srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
- srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
-
- // shrink to 8 bit each 16 bits, the first lane contain the first
- // convolve result and the second lane contain the second convolve result
- srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
-
- src_ptr += src_pixels_per_line;
-
- _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1);
-
- output_ptr += output_pitch;
- }
-}
-
-void aom_filter_block1d8_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
- uint8_t *output_ptr, ptrdiff_t out_pitch,
- uint32_t output_height,
- const int16_t *filter) {
- __m128i filtersReg;
- __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
- __m128i srcReg23_lo, srcReg34_lo;
- __m128i srcReg45_lo, srcReg56_lo;
- __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
- __m128i resReg23_45_lo, resReg34_56_lo;
- __m128i resReg23_45, resReg34_56;
- __m128i addFilterReg32, secondFilters, thirdFilters;
- __m128i tmp_0, tmp_1;
- unsigned int i;
- ptrdiff_t src_stride, dst_stride;
-
- addFilterReg32 = _mm_set1_epi16(32);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
- filtersReg = _mm_srai_epi16(filtersReg, 1);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
-
- secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3
- thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5
-
- // multiply the size of the source and destination stride by two
- src_stride = src_pitch << 1;
- dst_stride = out_pitch << 1;
-
- srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
- srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
- srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3);
- __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128());
- __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128());
-
- srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
- srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4);
- __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128());
- __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128());
-
- for (i = output_height; i > 1; i -= 2) {
- srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
- srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5);
-
- srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
- srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6);
-
- // multiply 2 adjacent elements with the filter and add the result
-
- tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters);
- tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters);
- resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1);
-
- tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters);
- tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters);
- resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1);
-
- __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128());
- __m128i resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128());
- tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters);
- tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters);
- resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1);
-
- __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128());
- __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128());
- tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters);
- tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters);
- resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1);
-
- // add and saturate the results together
- resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo);
- resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo);
-
- // shift by 6 bit each 16 bit
- resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32);
- resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32);
- resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6);
- resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6);
-
- // shrink to 8 bit each 16 bits, the first lane contain the first
- // convolve result and the second lane contain the second convolve
- // result
- resReg23_45 = _mm_packus_epi16(resReg23_45_lo, _mm_setzero_si128());
- resReg34_56 = _mm_packus_epi16(resReg34_56_lo, _mm_setzero_si128());
-
- src_ptr += src_stride;
-
- _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45));
- _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56));
-
- output_ptr += dst_stride;
-
- // save part of the registers for next strides
- resReg23_lo_1 = resReg45_lo_1;
- resReg23_lo_2 = resReg45_lo_2;
- resReg34_lo_1 = resReg56_lo_1;
- resReg34_lo_2 = resReg56_lo_2;
- srcReg4 = srcReg6;
- }
-}
-
-void aom_filter_block1d4_h4_sse2(const uint8_t *src_ptr,
- ptrdiff_t src_pixels_per_line,
- uint8_t *output_ptr, ptrdiff_t output_pitch,
- uint32_t output_height,
- const int16_t *filter) {
- __m128i filtersReg;
- __m128i addFilterReg32;
- __m128i secondFilters, thirdFilters;
- __m128i srcRegFilt32b1_1;
- __m128i srcReg32b1;
- unsigned int i;
- src_ptr -= 3;
- addFilterReg32 = _mm_set1_epi16(32);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
- filtersReg = _mm_srai_epi16(filtersReg, 1);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
-
- secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3
- thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5
-
- for (i = output_height; i > 0; i -= 1) {
- srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
-
- __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2);
- __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3);
- __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4);
- __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5);
-
- ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
- ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
- ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
- ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128());
-
- __m128i ss_1_1 = _mm_unpacklo_epi32(ss_2, ss_3);
- __m128i ss_1_2 = _mm_unpacklo_epi32(ss_4, ss_5);
-
- __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters);
- __m128i d2 = _mm_madd_epi16(ss_1_2, thirdFilters);
- srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);
-
- srcRegFilt32b1_1 = _mm_packs_epi32(srcRegFilt32b1_1, _mm_setzero_si128());
-
- // shift by 6 bit each 16 bit
- srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
- srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
-
- // shrink to 8 bit each 16 bits, the first lane contain the first
- // convolve result and the second lane contain the second convolve result
- srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
-
- src_ptr += src_pixels_per_line;
-
- *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1);
-
- output_ptr += output_pitch;
- }
-}
-
-void aom_filter_block1d4_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
- uint8_t *output_ptr, ptrdiff_t out_pitch,
- uint32_t output_height,
- const int16_t *filter) {
- __m128i filtersReg;
- __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
- __m128i srcReg23, srcReg34, srcReg45, srcReg56;
- __m128i resReg23_34, resReg45_56;
- __m128i resReg23_34_45_56;
- __m128i addFilterReg32, secondFilters, thirdFilters;
- __m128i tmp_0, tmp_1;
- unsigned int i;
- ptrdiff_t src_stride, dst_stride;
-
- addFilterReg32 = _mm_set1_epi16(32);
- filtersReg = _mm_loadu_si128((const __m128i *)filter);
- filtersReg = _mm_srai_epi16(filtersReg, 1);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
-
- secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3
- thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5
-
- // multiply the size of the source and destination stride by two
- src_stride = src_pitch << 1;
- dst_stride = out_pitch << 1;
-
- srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
- srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
- srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3);
- __m128i resReg23 = _mm_unpacklo_epi8(srcReg23, _mm_setzero_si128());
-
- srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
- srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4);
- __m128i resReg34 = _mm_unpacklo_epi8(srcReg34, _mm_setzero_si128());
-
- for (i = output_height; i > 1; i -= 2) {
- srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
- srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5);
- srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
- srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6);
-
- // multiply 2 adjacent elements with the filter and add the result
- tmp_0 = _mm_madd_epi16(resReg23, secondFilters);
- tmp_1 = _mm_madd_epi16(resReg34, secondFilters);
- resReg23_34 = _mm_packs_epi32(tmp_0, tmp_1);
-
- __m128i resReg45 = _mm_unpacklo_epi8(srcReg45, _mm_setzero_si128());
- __m128i resReg56 = _mm_unpacklo_epi8(srcReg56, _mm_setzero_si128());
-
- tmp_0 = _mm_madd_epi16(resReg45, thirdFilters);
- tmp_1 = _mm_madd_epi16(resReg56, thirdFilters);
- resReg45_56 = _mm_packs_epi32(tmp_0, tmp_1);
-
- // add and saturate the results together
- resReg23_34_45_56 = _mm_adds_epi16(resReg23_34, resReg45_56);
-
- // shift by 6 bit each 16 bit
- resReg23_34_45_56 = _mm_adds_epi16(resReg23_34_45_56, addFilterReg32);
- resReg23_34_45_56 = _mm_srai_epi16(resReg23_34_45_56, 6);
-
- // shrink to 8 bit each 16 bits, the first lane contain the first
- // convolve result and the second lane contain the second convolve
- // result
- resReg23_34_45_56 =
- _mm_packus_epi16(resReg23_34_45_56, _mm_setzero_si128());
-
- src_ptr += src_stride;
-
- *((int *)(output_ptr)) = _mm_cvtsi128_si32(resReg23_34_45_56);
- *((int *)(output_ptr + out_pitch)) =
- _mm_cvtsi128_si32(_mm_srli_si128(resReg23_34_45_56, 4));
-
- output_ptr += dst_stride;
-
- // save part of the registers for next strides
- resReg23 = resReg45;
- resReg34 = resReg56;
- srcReg4 = srcReg6;
- }
-}
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm
deleted file mode 100644
index 640c5b2416..0000000000
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm
+++ /dev/null
@@ -1,615 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-
-%include "aom_ports/x86_abi_support.asm"
-
-;Note: tap3 and tap4 have to be applied and added after other taps to avoid
-;overflow.
-
-%macro GET_FILTERS_4 0
- mov rdx, arg(5) ;filter ptr
- mov rcx, 0x0400040
-
- movdqa xmm7, [rdx] ;load filters
- pshuflw xmm0, xmm7, 0b ;k0
- pshuflw xmm1, xmm7, 01010101b ;k1
- pshuflw xmm2, xmm7, 10101010b ;k2
- pshuflw xmm3, xmm7, 11111111b ;k3
- psrldq xmm7, 8
- pshuflw xmm4, xmm7, 0b ;k4
- pshuflw xmm5, xmm7, 01010101b ;k5
- pshuflw xmm6, xmm7, 10101010b ;k6
- pshuflw xmm7, xmm7, 11111111b ;k7
-
- punpcklqdq xmm0, xmm1
- punpcklqdq xmm2, xmm3
- punpcklqdq xmm5, xmm4
- punpcklqdq xmm6, xmm7
-
- movdqa k0k1, xmm0
- movdqa k2k3, xmm2
- movdqa k5k4, xmm5
- movdqa k6k7, xmm6
-
- movq xmm6, rcx
- pshufd xmm6, xmm6, 0
- movdqa krd, xmm6
-
- pxor xmm7, xmm7
- movdqa zero, xmm7
-%endm
-
-%macro APPLY_FILTER_4 1
- punpckldq xmm0, xmm1 ;two row in one register
- punpckldq xmm6, xmm7
- punpckldq xmm2, xmm3
- punpckldq xmm5, xmm4
-
- punpcklbw xmm0, zero ;unpack to word
- punpcklbw xmm6, zero
- punpcklbw xmm2, zero
- punpcklbw xmm5, zero
-
- pmullw xmm0, k0k1 ;multiply the filter factors
- pmullw xmm6, k6k7
- pmullw xmm2, k2k3
- pmullw xmm5, k5k4
-
- paddsw xmm0, xmm6 ;sum
- movdqa xmm1, xmm0
- psrldq xmm1, 8
- paddsw xmm0, xmm1
- paddsw xmm0, xmm2
- psrldq xmm2, 8
- paddsw xmm0, xmm5
- psrldq xmm5, 8
- paddsw xmm0, xmm2
- paddsw xmm0, xmm5
-
- paddsw xmm0, krd ;rounding
- psraw xmm0, 7 ;shift
- packuswb xmm0, xmm0 ;pack to byte
-
-%if %1
- movd xmm1, [rdi]
- pavgb xmm0, xmm1
-%endif
- movd [rdi], xmm0
-%endm
-
-%macro GET_FILTERS 0
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x0400040
-
- movdqa xmm7, [rdx] ;load filters
- pshuflw xmm0, xmm7, 0b ;k0
- pshuflw xmm1, xmm7, 01010101b ;k1
- pshuflw xmm2, xmm7, 10101010b ;k2
- pshuflw xmm3, xmm7, 11111111b ;k3
- pshufhw xmm4, xmm7, 0b ;k4
- pshufhw xmm5, xmm7, 01010101b ;k5
- pshufhw xmm6, xmm7, 10101010b ;k6
- pshufhw xmm7, xmm7, 11111111b ;k7
-
- punpcklwd xmm0, xmm0
- punpcklwd xmm1, xmm1
- punpcklwd xmm2, xmm2
- punpcklwd xmm3, xmm3
- punpckhwd xmm4, xmm4
- punpckhwd xmm5, xmm5
- punpckhwd xmm6, xmm6
- punpckhwd xmm7, xmm7
-
- movdqa k0, xmm0 ;store filter factors on stack
- movdqa k1, xmm1
- movdqa k2, xmm2
- movdqa k3, xmm3
- movdqa k4, xmm4
- movdqa k5, xmm5
- movdqa k6, xmm6
- movdqa k7, xmm7
-
- movq xmm6, rcx
- pshufd xmm6, xmm6, 0
- movdqa krd, xmm6 ;rounding
-
- pxor xmm7, xmm7
- movdqa zero, xmm7
-%endm
-
-%macro LOAD_VERT_8 1
- movq xmm0, [rsi + %1] ;0
- movq xmm1, [rsi + rax + %1] ;1
- movq xmm6, [rsi + rdx * 2 + %1] ;6
- lea rsi, [rsi + rax]
- movq xmm7, [rsi + rdx * 2 + %1] ;7
- movq xmm2, [rsi + rax + %1] ;2
- movq xmm3, [rsi + rax * 2 + %1] ;3
- movq xmm4, [rsi + rdx + %1] ;4
- movq xmm5, [rsi + rax * 4 + %1] ;5
-%endm
-
-%macro APPLY_FILTER_8 2
- punpcklbw xmm0, zero
- punpcklbw xmm1, zero
- punpcklbw xmm6, zero
- punpcklbw xmm7, zero
- punpcklbw xmm2, zero
- punpcklbw xmm5, zero
- punpcklbw xmm3, zero
- punpcklbw xmm4, zero
-
- pmullw xmm0, k0
- pmullw xmm1, k1
- pmullw xmm6, k6
- pmullw xmm7, k7
- pmullw xmm2, k2
- pmullw xmm5, k5
- pmullw xmm3, k3
- pmullw xmm4, k4
-
- paddsw xmm0, xmm1
- paddsw xmm0, xmm6
- paddsw xmm0, xmm7
- paddsw xmm0, xmm2
- paddsw xmm0, xmm5
- paddsw xmm0, xmm3
- paddsw xmm0, xmm4
-
- paddsw xmm0, krd ;rounding
- psraw xmm0, 7 ;shift
- packuswb xmm0, xmm0 ;pack back to byte
-%if %1
- movq xmm1, [rdi + %2]
- pavgb xmm0, xmm1
-%endif
- movq [rdi + %2], xmm0
-%endm
-
-SECTION .text
-
-;void aom_filter_block1d4_v8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-globalsym(aom_filter_block1d4_v8_sse2)
-sym(aom_filter_block1d4_v8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 6
- %define k0k1 [rsp + 16 * 0]
- %define k2k3 [rsp + 16 * 1]
- %define k5k4 [rsp + 16 * 2]
- %define k6k7 [rsp + 16 * 3]
- %define krd [rsp + 16 * 4]
- %define zero [rsp + 16 * 5]
-
- GET_FILTERS_4
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rbx, DWORD PTR arg(3) ;out_pitch
- lea rdx, [rax + rax * 2]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movd xmm0, [rsi] ;load src: row 0
- movd xmm1, [rsi + rax] ;1
- movd xmm6, [rsi + rdx * 2] ;6
- lea rsi, [rsi + rax]
- movd xmm7, [rsi + rdx * 2] ;7
- movd xmm2, [rsi + rax] ;2
- movd xmm3, [rsi + rax * 2] ;3
- movd xmm4, [rsi + rdx] ;4
- movd xmm5, [rsi + rax * 4] ;5
-
- APPLY_FILTER_4 0
-
- lea rdi, [rdi + rbx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 6
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void aom_filter_block1d8_v8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-globalsym(aom_filter_block1d8_v8_sse2)
-sym(aom_filter_block1d8_v8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 10
- %define k0 [rsp + 16 * 0]
- %define k1 [rsp + 16 * 1]
- %define k2 [rsp + 16 * 2]
- %define k3 [rsp + 16 * 3]
- %define k4 [rsp + 16 * 4]
- %define k5 [rsp + 16 * 5]
- %define k6 [rsp + 16 * 6]
- %define k7 [rsp + 16 * 7]
- %define krd [rsp + 16 * 8]
- %define zero [rsp + 16 * 9]
-
- GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rbx, DWORD PTR arg(3) ;out_pitch
- lea rdx, [rax + rax * 2]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- LOAD_VERT_8 0
- APPLY_FILTER_8 0, 0
-
- lea rdi, [rdi + rbx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 10
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void aom_filter_block1d16_v8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-globalsym(aom_filter_block1d16_v8_sse2)
-sym(aom_filter_block1d16_v8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 10
- %define k0 [rsp + 16 * 0]
- %define k1 [rsp + 16 * 1]
- %define k2 [rsp + 16 * 2]
- %define k3 [rsp + 16 * 3]
- %define k4 [rsp + 16 * 4]
- %define k5 [rsp + 16 * 5]
- %define k6 [rsp + 16 * 6]
- %define k7 [rsp + 16 * 7]
- %define krd [rsp + 16 * 8]
- %define zero [rsp + 16 * 9]
-
- GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rbx, DWORD PTR arg(3) ;out_pitch
- lea rdx, [rax + rax * 2]
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- LOAD_VERT_8 0
- APPLY_FILTER_8 0, 0
- sub rsi, rax
-
- LOAD_VERT_8 8
- APPLY_FILTER_8 0, 8
- add rdi, rbx
-
- dec rcx
- jnz .loop
-
- add rsp, 16 * 10
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void aom_filter_block1d4_h8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-globalsym(aom_filter_block1d4_h8_sse2)
-sym(aom_filter_block1d4_h8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 6
- %define k0k1 [rsp + 16 * 0]
- %define k2k3 [rsp + 16 * 1]
- %define k5k4 [rsp + 16 * 2]
- %define k6k7 [rsp + 16 * 3]
- %define krd [rsp + 16 * 4]
- %define zero [rsp + 16 * 5]
-
- GET_FILTERS_4
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movdqu xmm0, [rsi - 3] ;load src
-
- movdqa xmm1, xmm0
- movdqa xmm6, xmm0
- movdqa xmm7, xmm0
- movdqa xmm2, xmm0
- movdqa xmm3, xmm0
- movdqa xmm5, xmm0
- movdqa xmm4, xmm0
-
- psrldq xmm1, 1
- psrldq xmm6, 6
- psrldq xmm7, 7
- psrldq xmm2, 2
- psrldq xmm3, 3
- psrldq xmm5, 5
- psrldq xmm4, 4
-
- APPLY_FILTER_4 0
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 6
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void aom_filter_block1d8_h8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-globalsym(aom_filter_block1d8_h8_sse2)
-sym(aom_filter_block1d8_h8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 10
- %define k0 [rsp + 16 * 0]
- %define k1 [rsp + 16 * 1]
- %define k2 [rsp + 16 * 2]
- %define k3 [rsp + 16 * 3]
- %define k4 [rsp + 16 * 4]
- %define k5 [rsp + 16 * 5]
- %define k6 [rsp + 16 * 6]
- %define k7 [rsp + 16 * 7]
- %define krd [rsp + 16 * 8]
- %define zero [rsp + 16 * 9]
-
- GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movdqu xmm0, [rsi - 3] ;load src
-
- movdqa xmm1, xmm0
- movdqa xmm6, xmm0
- movdqa xmm7, xmm0
- movdqa xmm2, xmm0
- movdqa xmm5, xmm0
- movdqa xmm3, xmm0
- movdqa xmm4, xmm0
-
- psrldq xmm1, 1
- psrldq xmm6, 6
- psrldq xmm7, 7
- psrldq xmm2, 2
- psrldq xmm5, 5
- psrldq xmm3, 3
- psrldq xmm4, 4
-
- APPLY_FILTER_8 0, 0
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 10
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void aom_filter_block1d16_h8_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-globalsym(aom_filter_block1d16_h8_sse2)
-sym(aom_filter_block1d16_h8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 16 * 10
- %define k0 [rsp + 16 * 0]
- %define k1 [rsp + 16 * 1]
- %define k2 [rsp + 16 * 2]
- %define k3 [rsp + 16 * 3]
- %define k4 [rsp + 16 * 4]
- %define k5 [rsp + 16 * 5]
- %define k6 [rsp + 16 * 6]
- %define k7 [rsp + 16 * 7]
- %define krd [rsp + 16 * 8]
- %define zero [rsp + 16 * 9]
-
- GET_FILTERS
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-
-.loop:
- movdqu xmm0, [rsi - 3] ;load src
-
- movdqa xmm1, xmm0
- movdqa xmm6, xmm0
- movdqa xmm7, xmm0
- movdqa xmm2, xmm0
- movdqa xmm5, xmm0
- movdqa xmm3, xmm0
- movdqa xmm4, xmm0
-
- psrldq xmm1, 1
- psrldq xmm6, 6
- psrldq xmm7, 7
- psrldq xmm2, 2
- psrldq xmm5, 5
- psrldq xmm3, 3
- psrldq xmm4, 4
-
- APPLY_FILTER_8 0, 0
-
- movdqu xmm0, [rsi + 5] ;load src
-
- movdqa xmm1, xmm0
- movdqa xmm6, xmm0
- movdqa xmm7, xmm0
- movdqa xmm2, xmm0
- movdqa xmm5, xmm0
- movdqa xmm3, xmm0
- movdqa xmm4, xmm0
-
- psrldq xmm1, 1
- psrldq xmm6, 6
- psrldq xmm7, 7
- psrldq xmm2, 2
- psrldq xmm5, 5
- psrldq xmm3, 3
- psrldq xmm4, 4
-
- APPLY_FILTER_8 0, 8
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .loop
-
- add rsp, 16 * 10
- pop rsp
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
deleted file mode 100644
index 90dd55a4be..0000000000
--- a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
+++ /dev/null
@@ -1,295 +0,0 @@
-;
-; Copyright (c) 2016, Alliance for Open Media. All rights reserved
-;
-; This source code is subject to the terms of the BSD 2 Clause License and
-; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
-; was not distributed with this source code in the LICENSE file, you can
-; obtain it at www.aomedia.org/license/software. If the Alliance for Open
-; Media Patent License 1.0 was not distributed with this source code in the
-; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
-;
-
-;
-
-%include "aom_ports/x86_abi_support.asm"
-
-%macro GET_PARAM_4 0
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x0400040
-
- movdqa xmm3, [rdx] ;load filters
- pshuflw xmm4, xmm3, 11111111b ;k3
- psrldq xmm3, 8
- pshuflw xmm3, xmm3, 0b ;k4
- punpcklqdq xmm4, xmm3 ;k3k4
-
- movq xmm3, rcx ;rounding
- pshufd xmm3, xmm3, 0
-
- pxor xmm2, xmm2
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-%endm
-
-%macro APPLY_FILTER_4 1
-
- punpckldq xmm0, xmm1 ;two row in one register
- punpcklbw xmm0, xmm2 ;unpack to word
- pmullw xmm0, xmm4 ;multiply the filter factors
-
- movdqa xmm1, xmm0
- psrldq xmm1, 8
- paddsw xmm0, xmm1
-
- paddsw xmm0, xmm3 ;rounding
- psraw xmm0, 7 ;shift
- packuswb xmm0, xmm0 ;pack to byte
-
-%if %1
- movd xmm1, [rdi]
- pavgb xmm0, xmm1
-%endif
-
- movd [rdi], xmm0
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
-%endm
-
-%macro GET_PARAM 0
- mov rdx, arg(5) ;filter ptr
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- mov rcx, 0x0400040
-
- movdqa xmm7, [rdx] ;load filters
-
- pshuflw xmm6, xmm7, 11111111b ;k3
- pshufhw xmm7, xmm7, 0b ;k4
- punpcklwd xmm6, xmm6
- punpckhwd xmm7, xmm7
-
- movq xmm4, rcx ;rounding
- pshufd xmm4, xmm4, 0
-
- pxor xmm5, xmm5
-
- movsxd rax, DWORD PTR arg(1) ;pixels_per_line
- movsxd rdx, DWORD PTR arg(3) ;out_pitch
- movsxd rcx, DWORD PTR arg(4) ;output_height
-%endm
-
-%macro APPLY_FILTER_8 1
- punpcklbw xmm0, xmm5
- punpcklbw xmm1, xmm5
-
- pmullw xmm0, xmm6
- pmullw xmm1, xmm7
- paddsw xmm0, xmm1
- paddsw xmm0, xmm4 ;rounding
- psraw xmm0, 7 ;shift
- packuswb xmm0, xmm0 ;pack back to byte
-%if %1
- movq xmm1, [rdi]
- pavgb xmm0, xmm1
-%endif
- movq [rdi], xmm0 ;store the result
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
-%endm
-
-%macro APPLY_FILTER_16 1
- punpcklbw xmm0, xmm5
- punpcklbw xmm1, xmm5
- punpckhbw xmm2, xmm5
- punpckhbw xmm3, xmm5
-
- pmullw xmm0, xmm6
- pmullw xmm1, xmm7
- pmullw xmm2, xmm6
- pmullw xmm3, xmm7
-
- paddsw xmm0, xmm1
- paddsw xmm2, xmm3
-
- paddsw xmm0, xmm4 ;rounding
- paddsw xmm2, xmm4
- psraw xmm0, 7 ;shift
- psraw xmm2, 7
- packuswb xmm0, xmm2 ;pack back to byte
-%if %1
- movdqu xmm1, [rdi]
- pavgb xmm0, xmm1
-%endif
- movdqu [rdi], xmm0 ;store the result
-
- lea rsi, [rsi + rax]
- lea rdi, [rdi + rdx]
- dec rcx
-%endm
-
-SECTION .text
-
-globalsym(aom_filter_block1d4_v2_sse2)
-sym(aom_filter_block1d4_v2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM_4
-.loop:
- movd xmm0, [rsi] ;load src
- movd xmm1, [rsi + rax]
-
- APPLY_FILTER_4 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-globalsym(aom_filter_block1d8_v2_sse2)
-sym(aom_filter_block1d8_v2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movq xmm0, [rsi] ;0
- movq xmm1, [rsi + rax] ;1
-
- APPLY_FILTER_8 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-globalsym(aom_filter_block1d16_v2_sse2)
-sym(aom_filter_block1d16_v2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;0
- movdqu xmm1, [rsi + rax] ;1
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
-
- APPLY_FILTER_16 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-globalsym(aom_filter_block1d4_h2_sse2)
-sym(aom_filter_block1d4_h2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM_4
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqa xmm1, xmm0
- psrldq xmm1, 1
-
- APPLY_FILTER_4 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-globalsym(aom_filter_block1d8_h2_sse2)
-sym(aom_filter_block1d8_h2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqa xmm1, xmm0
- psrldq xmm1, 1
-
- APPLY_FILTER_8 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-globalsym(aom_filter_block1d16_h2_sse2)
-sym(aom_filter_block1d16_h2_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- ; end prolog
-
- GET_PARAM
-.loop:
- movdqu xmm0, [rsi] ;load src
- movdqu xmm1, [rsi + 1]
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
-
- APPLY_FILTER_16 0
- jnz .loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
diff --git a/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c b/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c
index 9ab9143eee..0b552b704b 100644
--- a/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c
+++ b/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c
@@ -133,7 +133,7 @@ unsigned int aom_avg_8x8_sse2(const uint8_t *s, int p) {
return (avg + 32) >> 6;
}
-void calc_avg_8x8_dual_sse2(const uint8_t *s, int p, int *avg) {
+static void calc_avg_8x8_dual_sse2(const uint8_t *s, int p, int *avg) {
__m128i sum0, sum1, s0, s1, s2, s3, u0;
u0 = _mm_setzero_si128();
s0 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s)), u0);
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
index 7ee8ba330e..e1db3b950c 100644
--- a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
@@ -30,6 +30,7 @@
#define SUB_EPI16 _mm_sub_epi16
#endif
+#if defined(FDCT4x4_2D_HELPER)
static void FDCT4x4_2D_HELPER(const int16_t *input, int stride, __m128i *in0,
__m128i *in1) {
// Constants
@@ -185,7 +186,9 @@ static void FDCT4x4_2D_HELPER(const int16_t *input, int stride, __m128i *in0,
}
}
}
+#endif // defined(FDCT4x4_2D_HELPER)
+#if defined(FDCT4x4_2D)
void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
// This 2D transform implements 4 vertical 1D transforms followed
// by 4 horizontal 1D transforms. The multiplies and adds are as given
@@ -205,13 +208,16 @@ void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
storeu_output(&in0, output + 0 * 4);
storeu_output(&in1, output + 2 * 4);
}
+#endif // defined(FDCT4x4_2D)
+#if defined(FDCT4x4_2D_LP)
void FDCT4x4_2D_LP(const int16_t *input, int16_t *output, int stride) {
__m128i in0, in1;
FDCT4x4_2D_HELPER(input, stride, &in0, &in1);
_mm_storeu_si128((__m128i *)(output + 0 * 4), in0);
_mm_storeu_si128((__m128i *)(output + 2 * 4), in1);
}
+#endif // defined(FDCT4x4_2D_LP)
#if CONFIG_INTERNAL_STATS
void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c
index b4ff91d856..21e9e8b282 100644
--- a/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c
@@ -618,9 +618,9 @@ static uint32_t aom_highbd_var_filter_block2d_bil_avx2(
return (var > 0) ? var : 0;
}
-void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride,
- const uint16_t *ref, int ref_stride,
- uint32_t *sse, int *sum) {
+static void highbd_calc8x8var_avx2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum) {
__m256i v_sum_d = _mm256_setzero_si256();
__m256i v_sse_d = _mm256_setzero_si256();
for (int i = 0; i < 8; i += 2) {
@@ -653,9 +653,9 @@ void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride,
*sse = _mm_extract_epi32(v_d, 1);
}
-void aom_highbd_calc16x16var_avx2(const uint16_t *src, int src_stride,
- const uint16_t *ref, int ref_stride,
- uint32_t *sse, int *sum) {
+static void highbd_calc16x16var_avx2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum) {
__m256i v_sum_d = _mm256_setzero_si256();
__m256i v_sse_d = _mm256_setzero_si256();
const __m256i one = _mm256_set1_epi16(1);
@@ -703,19 +703,19 @@ static void highbd_10_variance_avx2(const uint16_t *src, int src_stride,
*sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
}
-#define VAR_FN(w, h, block_size, shift) \
- uint32_t aom_highbd_10_variance##w##x##h##_avx2( \
- const uint8_t *src8, int src_stride, const uint8_t *ref8, \
- int ref_stride, uint32_t *sse) { \
- int sum; \
- int64_t var; \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
- highbd_10_variance_avx2( \
- src, src_stride, ref, ref_stride, w, h, sse, &sum, \
- aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \
- var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
- return (var >= 0) ? (uint32_t)var : 0; \
+#define VAR_FN(w, h, block_size, shift) \
+ uint32_t aom_highbd_10_variance##w##x##h##_avx2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_10_variance_avx2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ highbd_calc##block_size##x##block_size##var_avx2, \
+ block_size); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
+ return (var >= 0) ? (uint32_t)var : 0; \
}
VAR_FN(128, 128, 16, 14)
@@ -741,6 +741,17 @@ VAR_FN(8, 32, 8, 8)
#undef VAR_FN
+unsigned int aom_highbd_10_mse16x16_avx2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_10_variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+ highbd_calc16x16var_avx2, 16);
+ return *sse;
+}
+
#define SSE2_HEIGHT(H) \
uint32_t aom_highbd_10_sub_pixel_variance8x##H##_sse2( \
const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
@@ -749,7 +760,7 @@ VAR_FN(8, 32, 8, 8)
SSE2_HEIGHT(8)
SSE2_HEIGHT(16)
-#undef SSE2_Height
+#undef SSE2_HEIGHT
#define HIGHBD_SUBPIX_VAR(W, H) \
uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_avx2( \
@@ -782,8 +793,8 @@ HIGHBD_SUBPIX_VAR(8, 8)
#undef HIGHBD_SUBPIX_VAR
-uint64_t aom_mse_4xh_16bit_highbd_avx2(uint16_t *dst, int dstride,
- uint16_t *src, int sstride, int h) {
+static uint64_t mse_4xh_16bit_highbd_avx2(uint16_t *dst, int dstride,
+ uint16_t *src, int sstride, int h) {
uint64_t sum = 0;
__m128i reg0_4x16, reg1_4x16, reg2_4x16, reg3_4x16;
__m256i src0_8x16, src1_8x16, src_16x16;
@@ -840,8 +851,8 @@ uint64_t aom_mse_4xh_16bit_highbd_avx2(uint16_t *dst, int dstride,
return sum;
}
-uint64_t aom_mse_8xh_16bit_highbd_avx2(uint16_t *dst, int dstride,
- uint16_t *src, int sstride, int h) {
+static uint64_t mse_8xh_16bit_highbd_avx2(uint16_t *dst, int dstride,
+ uint16_t *src, int sstride, int h) {
uint64_t sum = 0;
__m256i src0_8x16, src1_8x16, src_16x16;
__m256i dst0_8x16, dst1_8x16, dst_16x16;
@@ -897,8 +908,8 @@ uint64_t aom_mse_wxh_16bit_highbd_avx2(uint16_t *dst, int dstride,
assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
"w=8/4 and h=8/4 must satisfy");
switch (w) {
- case 4: return aom_mse_4xh_16bit_highbd_avx2(dst, dstride, src, sstride, h);
- case 8: return aom_mse_8xh_16bit_highbd_avx2(dst, dstride, src, sstride, h);
+ case 4: return mse_4xh_16bit_highbd_avx2(dst, dstride, src, sstride, h);
+ case 8: return mse_8xh_16bit_highbd_avx2(dst, dstride, src, sstride, h);
default: assert(0 && "unsupported width"); return -1;
}
}
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
index e897aab645..2fc2e1c0dd 100644
--- a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
@@ -637,8 +637,8 @@ void aom_highbd_dist_wtd_comp_avg_pred_sse2(
}
}
-uint64_t aom_mse_4xh_16bit_highbd_sse2(uint16_t *dst, int dstride,
- uint16_t *src, int sstride, int h) {
+static uint64_t mse_4xh_16bit_highbd_sse2(uint16_t *dst, int dstride,
+ uint16_t *src, int sstride, int h) {
uint64_t sum = 0;
__m128i reg0_4x16, reg1_4x16;
__m128i src_8x16;
@@ -682,8 +682,8 @@ uint64_t aom_mse_4xh_16bit_highbd_sse2(uint16_t *dst, int dstride,
return sum;
}
-uint64_t aom_mse_8xh_16bit_highbd_sse2(uint16_t *dst, int dstride,
- uint16_t *src, int sstride, int h) {
+static uint64_t mse_8xh_16bit_highbd_sse2(uint16_t *dst, int dstride,
+ uint16_t *src, int sstride, int h) {
uint64_t sum = 0;
__m128i src_8x16;
__m128i dst_8x16;
@@ -728,8 +728,8 @@ uint64_t aom_mse_wxh_16bit_highbd_sse2(uint16_t *dst, int dstride,
assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
"w=8/4 and h=8/4 must satisfy");
switch (w) {
- case 4: return aom_mse_4xh_16bit_highbd_sse2(dst, dstride, src, sstride, h);
- case 8: return aom_mse_8xh_16bit_highbd_sse2(dst, dstride, src, sstride, h);
+ case 4: return mse_4xh_16bit_highbd_sse2(dst, dstride, src, sstride, h);
+ case 8: return mse_8xh_16bit_highbd_sse2(dst, dstride, src, sstride, h);
default: assert(0 && "unsupported width"); return -1;
}
}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c
index fd48260c6f..869f880bda 100644
--- a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c
@@ -940,10 +940,10 @@ static AOM_FORCE_INLINE __m128i cvtepu16_epi32(__m128i x) {
return _mm_unpacklo_epi16((x), _mm_setzero_si128());
}
-void smooth_predictor_wxh(uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
- const uint8_t *LIBAOM_RESTRICT top_row,
- const uint8_t *LIBAOM_RESTRICT left_column, int width,
- int height) {
+static void smooth_predictor_wxh(uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column,
+ int width, int height) {
const uint8_t *const sm_weights_h = smooth_weights + height - 4;
const uint8_t *const sm_weights_w = smooth_weights + width - 4;
const __m128i zero = _mm_setzero_si128();
diff --git a/third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c
index 799ce9ef44..d96a9dd23d 100644
--- a/third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c
+++ b/third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c
@@ -103,11 +103,12 @@ static INLINE void masked_sadx4d_ssse3(const uint8_t *src_ptr, int src_stride,
pred = _mm_packus_epi16(pred_l, pred_r); \
res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src));
-void aom_masked_sad8xhx4d_ssse3(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_array[4], int a_stride,
- const uint8_t *b_ptr, int b_stride,
- const uint8_t *m_ptr, int m_stride, int height,
- int inv_mask, unsigned sad_array[4]) {
+static void masked_sad8xhx4d_ssse3(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_array[4], int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int height, int inv_mask,
+ unsigned sad_array[4]) {
const uint8_t *ref0 = ref_array[0];
const uint8_t *ref1 = ref_array[1];
const uint8_t *ref2 = ref_array[2];
@@ -164,11 +165,12 @@ void aom_masked_sad8xhx4d_ssse3(const uint8_t *src_ptr, int src_stride,
pred = _mm_packus_epi16(pred, _mm_setzero_si128()); \
res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src));
-void aom_masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride,
- const uint8_t *ref_array[4], int a_stride,
- const uint8_t *b_ptr, int b_stride,
- const uint8_t *m_ptr, int m_stride, int height,
- int inv_mask, unsigned sad_array[4]) {
+static void masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_array[4], int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int height, int inv_mask,
+ unsigned sad_array[4]) {
const uint8_t *ref0 = ref_array[0];
const uint8_t *ref1 = ref_array[1];
const uint8_t *ref2 = ref_array[2];
@@ -224,22 +226,22 @@ void aom_masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride,
msk_stride, m, n, inv_mask, sad_array); \
}
-#define MASKSAD8XN_SSSE3(n) \
- void aom_masked_sad8x##n##x4d_ssse3( \
- const uint8_t *src, int src_stride, const uint8_t *ref[4], \
- int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \
- int msk_stride, int inv_mask, unsigned sad_array[4]) { \
- aom_masked_sad8xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, \
- 8, msk, msk_stride, n, inv_mask, sad_array); \
+#define MASKSAD8XN_SSSE3(n) \
+ void aom_masked_sad8x##n##x4d_ssse3( \
+ const uint8_t *src, int src_stride, const uint8_t *ref[4], \
+ int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \
+ int msk_stride, int inv_mask, unsigned sad_array[4]) { \
+ masked_sad8xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, 8, \
+ msk, msk_stride, n, inv_mask, sad_array); \
}
-#define MASKSAD4XN_SSSE3(n) \
- void aom_masked_sad4x##n##x4d_ssse3( \
- const uint8_t *src, int src_stride, const uint8_t *ref[4], \
- int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \
- int msk_stride, int inv_mask, unsigned sad_array[4]) { \
- aom_masked_sad4xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, \
- 4, msk, msk_stride, n, inv_mask, sad_array); \
+#define MASKSAD4XN_SSSE3(n) \
+ void aom_masked_sad4x##n##x4d_ssse3( \
+ const uint8_t *src, int src_stride, const uint8_t *ref[4], \
+ int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \
+ int msk_stride, int inv_mask, unsigned sad_array[4]) { \
+ masked_sad4xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, 4, \
+ msk, msk_stride, n, inv_mask, sad_array); \
}
MASKSADMXN_SSSE3(128, 128)
diff --git a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm b/third_party/aom/aom_dsp/x86/subpel_variance_ssse3.asm
index d1d8373456..f424ce01dd 100644
--- a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm
+++ b/third_party/aom/aom_dsp/x86/subpel_variance_ssse3.asm
@@ -15,21 +15,6 @@
SECTION_RODATA
pw_8: times 8 dw 8
-bilin_filter_m_sse2: times 8 dw 16
- times 8 dw 0
- times 8 dw 14
- times 8 dw 2
- times 8 dw 12
- times 8 dw 4
- times 8 dw 10
- times 8 dw 6
- times 16 dw 8
- times 8 dw 6
- times 8 dw 10
- times 8 dw 4
- times 8 dw 12
- times 8 dw 2
- times 8 dw 14
bilin_filter_m_ssse3: times 8 db 16, 0
times 8 db 14, 2
@@ -109,9 +94,6 @@ SECTION .text
%if cpuflag(ssse3)
%define bilin_filter_m bilin_filter_m_ssse3
%define filter_idx_shift 4
-%else
-%define bilin_filter_m bilin_filter_m_sse2
-%define filter_idx_shift 5
%endif
; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
; 11, not 13, if the registers are ordered correctly. May make a minor speed
@@ -1449,21 +1431,11 @@ SECTION .text
; location in the sse/2 version, rather than duplicating that code in the
; binary.
-INIT_XMM sse2
-SUBPEL_VARIANCE 4
-SUBPEL_VARIANCE 8
-SUBPEL_VARIANCE 16
-
INIT_XMM ssse3
SUBPEL_VARIANCE 4
SUBPEL_VARIANCE 8
SUBPEL_VARIANCE 16
-INIT_XMM sse2
-SUBPEL_VARIANCE 4, 1
-SUBPEL_VARIANCE 8, 1
-SUBPEL_VARIANCE 16, 1
-
INIT_XMM ssse3
SUBPEL_VARIANCE 4, 1
SUBPEL_VARIANCE 8, 1
diff --git a/third_party/aom/aom_dsp/x86/synonyms.h b/third_party/aom/aom_dsp/x86/synonyms.h
index 6744ec51d0..74318de2e5 100644
--- a/third_party/aom/aom_dsp/x86/synonyms.h
+++ b/third_party/aom/aom_dsp/x86/synonyms.h
@@ -46,6 +46,25 @@ static INLINE __m128i xx_loadu_128(const void *a) {
return _mm_loadu_si128((const __m128i *)a);
}
+
+// _mm_loadu_si64 has been introduced in GCC 9, reimplement the function
+// manually on older compilers.
+#if !defined(__clang__) && __GNUC_MAJOR__ < 9
+static INLINE __m128i xx_loadu_2x64(const void *hi, const void *lo) {
+ __m64 hi_, lo_;
+ memcpy(&hi_, hi, sizeof(hi_));
+ memcpy(&lo_, lo, sizeof(lo_));
+ return _mm_set_epi64(hi_, lo_);
+}
+#else
+// Load 64 bits from each of hi and low, and pack into an SSE register
+// Since directly loading as `int64_t`s and using _mm_set_epi64 may violate
+// the strict aliasing rule, this takes a different approach
+static INLINE __m128i xx_loadu_2x64(const void *hi, const void *lo) {
+ return _mm_unpacklo_epi64(_mm_loadu_si64(lo), _mm_loadu_si64(hi));
+}
+#endif
+
static INLINE void xx_storel_32(void *const a, const __m128i v) {
const int val = _mm_cvtsi128_si32(v);
memcpy(a, &val, sizeof(val));
diff --git a/third_party/aom/aom_dsp/x86/synonyms_avx2.h b/third_party/aom/aom_dsp/x86/synonyms_avx2.h
index b729e5f410..7548d4d4f4 100644
--- a/third_party/aom/aom_dsp/x86/synonyms_avx2.h
+++ b/third_party/aom/aom_dsp/x86/synonyms_avx2.h
@@ -43,6 +43,16 @@ static INLINE void yy_storeu_256(void *const a, const __m256i v) {
_mm256_storeu_si256((__m256i *)a, v);
}
+// Fill an AVX register using an interleaved pair of values, ie. set the
+// 16 channels to {a, b} repeated 8 times, using the same channel ordering
+// as when a register is stored to / loaded from memory.
+//
+// This is useful for rearranging filter kernels for use with the _mm_madd_epi16
+// instruction
+static INLINE __m256i yy_set2_epi16(int16_t a, int16_t b) {
+ return _mm256_setr_epi16(a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b);
+}
+
// The _mm256_set1_epi64x() intrinsic is undefined for some Visual Studio
// compilers. The following function is equivalent to _mm256_set1_epi64x()
// acting on a 32-bit integer.
@@ -61,11 +71,26 @@ static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) {
return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
}
+#define GCC_VERSION (__GNUC__ * 10000 \
+ + __GNUC_MINOR__ * 100 \
+ + __GNUC_PATCHLEVEL__)
+
+// _mm256_loadu2_m128i has been introduced in GCC 10.1
+#if !defined(__clang__) && GCC_VERSION < 101000
+static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) {
+ __m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
+ __m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
+ return _mm256_set_m128i(mhi, mlo);
+}
+#else
static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) {
__m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
__m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
return yy_set_m128i(mhi, mlo);
}
+#endif
+
+#undef GCC_VERSION
static INLINE void yy_storeu2_128(void *hi, void *lo, const __m256i a) {
_mm_storeu_si128((__m128i *)hi, _mm256_extracti128_si256(a, 1));
diff --git a/third_party/aom/aom_dsp/x86/variance_avx2.c b/third_party/aom/aom_dsp/x86/variance_avx2.c
index 046d6f10f8..0f872fc392 100644
--- a/third_party/aom/aom_dsp/x86/variance_avx2.c
+++ b/third_party/aom/aom_dsp/x86/variance_avx2.c
@@ -518,8 +518,8 @@ void aom_highbd_comp_mask_pred_avx2(uint8_t *comp_pred8, const uint8_t *pred8,
}
}
-uint64_t aom_mse_4xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
- int sstride, int h) {
+static uint64_t mse_4xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
+ int sstride, int h) {
uint64_t sum = 0;
__m128i dst0_4x8, dst1_4x8, dst2_4x8, dst3_4x8, dst_16x8;
__m128i src0_4x16, src1_4x16, src2_4x16, src3_4x16;
@@ -575,8 +575,9 @@ uint64_t aom_mse_4xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
// In src buffer, each 4x4 block in a 32x32 filter block is stored sequentially.
// Hence src_blk_stride is same as block width. Whereas dst buffer is a frame
// buffer, thus dstride is a frame level stride.
-uint64_t aom_mse_4xh_quad_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
- int src_blk_stride, int h) {
+static uint64_t mse_4xh_quad_16bit_avx2(uint8_t *dst, int dstride,
+ uint16_t *src, int src_blk_stride,
+ int h) {
uint64_t sum = 0;
__m128i dst0_16x8, dst1_16x8, dst2_16x8, dst3_16x8;
__m256i dst0_16x16, dst1_16x16, dst2_16x16, dst3_16x16;
@@ -665,8 +666,8 @@ uint64_t aom_mse_4xh_quad_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
return sum;
}
-uint64_t aom_mse_8xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
- int sstride, int h) {
+static uint64_t mse_8xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
+ int sstride, int h) {
uint64_t sum = 0;
__m128i dst0_8x8, dst1_8x8, dst3_16x8;
__m256i src0_8x16, src1_8x16, src_16x16, dst_16x16;
@@ -715,8 +716,9 @@ uint64_t aom_mse_8xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
// In src buffer, each 8x8 block in a 64x64 filter block is stored sequentially.
// Hence src_blk_stride is same as block width. Whereas dst buffer is a frame
// buffer, thus dstride is a frame level stride.
-uint64_t aom_mse_8xh_dual_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
- int src_blk_stride, int h) {
+static uint64_t mse_8xh_dual_16bit_avx2(uint8_t *dst, int dstride,
+ uint16_t *src, int src_blk_stride,
+ int h) {
uint64_t sum = 0;
__m128i dst0_16x8, dst1_16x8;
__m256i dst0_16x16, dst1_16x16;
@@ -780,8 +782,8 @@ uint64_t aom_mse_wxh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
"w=8/4 and h=8/4 must be satisfied");
switch (w) {
- case 4: return aom_mse_4xh_16bit_avx2(dst, dstride, src, sstride, h);
- case 8: return aom_mse_8xh_16bit_avx2(dst, dstride, src, sstride, h);
+ case 4: return mse_4xh_16bit_avx2(dst, dstride, src, sstride, h);
+ case 8: return mse_8xh_16bit_avx2(dst, dstride, src, sstride, h);
default: assert(0 && "unsupported width"); return -1;
}
}
@@ -795,8 +797,8 @@ uint64_t aom_mse_16xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
"w=8/4 and h=8/4 must be satisfied");
switch (w) {
- case 4: return aom_mse_4xh_quad_16bit_avx2(dst, dstride, src, w * h, h);
- case 8: return aom_mse_8xh_dual_16bit_avx2(dst, dstride, src, w * h, h);
+ case 4: return mse_4xh_quad_16bit_avx2(dst, dstride, src, w * h, h);
+ case 8: return mse_8xh_dual_16bit_avx2(dst, dstride, src, w * h, h);
default: assert(0 && "unsupported width"); return -1;
}
}
diff --git a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c
index 9e9e70ea01..57a1cee781 100644
--- a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c
+++ b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c
@@ -648,7 +648,7 @@ MAKE_SUB_PIXEL_VAR_16XH(4, 2)
#endif
#define MAKE_SUB_PIXEL_AVG_VAR_32XH(height, log2height) \
- int aom_sub_pixel_avg_variance32x##height##_imp_avx2( \
+ static int sub_pixel_avg_variance32x##height##_imp_avx2( \
const uint8_t *src, int src_stride, int x_offset, int y_offset, \
const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, \
unsigned int *sse) { \
@@ -876,7 +876,7 @@ MAKE_SUB_PIXEL_VAR_16XH(4, 2)
const uint8_t *src, int src_stride, int x_offset, int y_offset, \
const uint8_t *dst, int dst_stride, unsigned int *sse, \
const uint8_t *sec_ptr) { \
- const int sum = aom_sub_pixel_avg_variance32x##height##_imp_avx2( \
+ const int sum = sub_pixel_avg_variance32x##height##_imp_avx2( \
src, src_stride, x_offset, y_offset, dst, dst_stride, sec_ptr, 32, \
sse); \
return *sse - (unsigned int)(((int64_t)sum * sum) >> (5 + log2height)); \
@@ -899,7 +899,7 @@ MAKE_SUB_PIXEL_AVG_VAR_32XH(16, 4)
const uint8_t *sec_ptr = sec; \
for (int j = 0; j < (h / hf); ++j) { \
unsigned int sse2; \
- const int se2 = aom_sub_pixel_avg_variance##wf##x##hf##_imp_avx2( \
+ const int se2 = sub_pixel_avg_variance##wf##x##hf##_imp_avx2( \
src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
sec_ptr, w, &sse2); \
dst_ptr += hf * dst_stride; \
diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c
index faec9cf73d..81b30072a5 100644
--- a/third_party/aom/aom_dsp/x86/variance_sse2.c
+++ b/third_party/aom/aom_dsp/x86/variance_sse2.c
@@ -415,7 +415,6 @@ unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride,
DECL(8, opt); \
DECL(16, opt)
-DECLS(sse2);
DECLS(ssse3);
#undef DECLS
#undef DECL
@@ -492,7 +491,6 @@ DECLS(ssse3);
FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t))
#endif
-FNS(sse2)
FNS(ssse3)
#undef FNS
@@ -510,7 +508,6 @@ FNS(ssse3)
DECL(8, opt); \
DECL(16, opt)
-DECLS(sse2);
DECLS(ssse3);
#undef DECL
#undef DECLS
@@ -591,7 +588,6 @@ DECLS(ssse3);
FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t))
#endif
-FNS(sse2)
FNS(ssse3)
#undef FNS
@@ -710,8 +706,8 @@ void aom_highbd_comp_mask_pred_sse2(uint8_t *comp_pred8, const uint8_t *pred8,
}
}
-uint64_t aom_mse_4xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
- int sstride, int h) {
+static uint64_t mse_4xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
+ int sstride, int h) {
uint64_t sum = 0;
__m128i dst0_8x8, dst1_8x8, dst_16x8;
__m128i src0_16x4, src1_16x4, src_16x8;
@@ -744,8 +740,8 @@ uint64_t aom_mse_4xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
return sum;
}
-uint64_t aom_mse_8xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
- int sstride, int h) {
+static uint64_t mse_8xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
+ int sstride, int h) {
uint64_t sum = 0;
__m128i dst_8x8, dst_16x8;
__m128i src_16x8;
@@ -781,8 +777,8 @@ uint64_t aom_mse_wxh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
"w=8/4 and h=8/4 must satisfy");
switch (w) {
- case 4: return aom_mse_4xh_16bit_sse2(dst, dstride, src, sstride, h);
- case 8: return aom_mse_8xh_16bit_sse2(dst, dstride, src, sstride, h);
+ case 4: return mse_4xh_16bit_sse2(dst, dstride, src, sstride, h);
+ case 8: return mse_8xh_16bit_sse2(dst, dstride, src, sstride, h);
default: assert(0 && "unsupported width"); return -1;
}
}