From def92d1b8e9d373e2f6f27c366d578d97d8960c6 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 15 May 2024 05:34:50 +0200 Subject: Merging upstream version 126.0. Signed-off-by: Daniel Baumann --- third_party/aom/aom_dsp/x86/aom_asm_stubs.c | 34 - .../aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c | 569 -------- .../aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm | 615 -------- .../aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm | 295 ---- third_party/aom/aom_dsp/x86/avg_intrin_sse2.c | 2 +- third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h | 6 + third_party/aom/aom_dsp/x86/highbd_variance_avx2.c | 63 +- third_party/aom/aom_dsp/x86/highbd_variance_sse2.c | 12 +- third_party/aom/aom_dsp/x86/intrapred_ssse3.c | 8 +- third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c | 50 +- .../aom/aom_dsp/x86/subpel_variance_sse2.asm | 1470 -------------------- .../aom/aom_dsp/x86/subpel_variance_ssse3.asm | 1442 +++++++++++++++++++ third_party/aom/aom_dsp/x86/synonyms.h | 19 + third_party/aom/aom_dsp/x86/synonyms_avx2.h | 25 + third_party/aom/aom_dsp/x86/variance_avx2.c | 26 +- third_party/aom/aom_dsp/x86/variance_impl_avx2.c | 6 +- third_party/aom/aom_dsp/x86/variance_sse2.c | 16 +- 17 files changed, 1589 insertions(+), 3069 deletions(-) delete mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c delete mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm delete mode 100644 third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm delete mode 100644 third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm create mode 100644 third_party/aom/aom_dsp/x86/subpel_variance_ssse3.asm (limited to 'third_party/aom/aom_dsp/x86') diff --git a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c index b08ec2546b..6c7fdd6eb1 100644 --- a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c +++ b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c @@ -15,40 +15,6 @@ #include "aom_dsp/x86/convolve.h" #if HAVE_SSE2 -filter8_1dfunction aom_filter_block1d16_v8_sse2; -filter8_1dfunction aom_filter_block1d16_h8_sse2; -filter8_1dfunction aom_filter_block1d8_v8_sse2; -filter8_1dfunction aom_filter_block1d8_h8_sse2; -filter8_1dfunction aom_filter_block1d4_v8_sse2; -filter8_1dfunction aom_filter_block1d4_h8_sse2; -filter8_1dfunction aom_filter_block1d16_v4_sse2; -filter8_1dfunction aom_filter_block1d16_h4_sse2; - -filter8_1dfunction aom_filter_block1d8_h4_sse2; -filter8_1dfunction aom_filter_block1d8_v4_sse2; -filter8_1dfunction aom_filter_block1d4_h4_sse2; -filter8_1dfunction aom_filter_block1d4_v4_sse2; - -filter8_1dfunction aom_filter_block1d16_v2_sse2; -filter8_1dfunction aom_filter_block1d16_h2_sse2; -filter8_1dfunction aom_filter_block1d8_v2_sse2; -filter8_1dfunction aom_filter_block1d8_h2_sse2; -filter8_1dfunction aom_filter_block1d4_v2_sse2; -filter8_1dfunction aom_filter_block1d4_h2_sse2; - -// void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2) -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2) - #if CONFIG_AV1_HIGHBITDEPTH highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2; diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c deleted file mode 100644 index 5c36b68727..0000000000 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c +++ /dev/null @@ -1,569 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include // SSE2 - -#include "config/aom_dsp_rtcd.h" -#include "aom_dsp/x86/convolve.h" -#include "aom_ports/mem.h" - -void aom_filter_block1d16_h4_sse2(const uint8_t *src_ptr, - ptrdiff_t src_pixels_per_line, - uint8_t *output_ptr, ptrdiff_t output_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i filtersReg; - __m128i addFilterReg32; - __m128i secondFilters, thirdFilters; - __m128i srcRegFilt32b1_1, srcRegFilt32b1_2, srcRegFilt32b2_1, - srcRegFilt32b2_2; - __m128i srcReg32b1, srcReg32b2; - unsigned int i; - src_ptr -= 3; - addFilterReg32 = _mm_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - filtersReg = _mm_srai_epi16(filtersReg, 1); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); - - secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 - thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 - - for (i = output_height; i > 0; i -= 1) { - srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); - - __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2); - __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4); - __m128i ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); - __m128i ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); - __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters); - __m128i d2 = _mm_madd_epi16(ss_2_1, thirdFilters); - srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); - - __m128i ss_1 = _mm_srli_si128(srcReg32b1, 3); - __m128i ss_3 = _mm_srli_si128(srcReg32b1, 5); - __m128i ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128()); - __m128i ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); - d1 = _mm_madd_epi16(ss_1_2, secondFilters); - d2 = _mm_madd_epi16(ss_2_2, thirdFilters); - srcRegFilt32b1_2 = _mm_add_epi32(d1, d2); - - __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); - __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); - srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi); - - // reading stride of the next 16 bytes - // (part of it was being read by earlier read) - srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); - - ss_2 = _mm_srli_si128(srcReg32b2, 2); - ss_4 = _mm_srli_si128(srcReg32b2, 4); - ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); - ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); - d1 = _mm_madd_epi16(ss_1_1, secondFilters); - d2 = _mm_madd_epi16(ss_2_1, thirdFilters); - srcRegFilt32b2_1 = _mm_add_epi32(d1, d2); - - ss_1 = _mm_srli_si128(srcReg32b2, 3); - ss_3 = _mm_srli_si128(srcReg32b2, 5); - ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128()); - ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); - d1 = _mm_madd_epi16(ss_1_2, secondFilters); - d2 = _mm_madd_epi16(ss_2_2, thirdFilters); - srcRegFilt32b2_2 = _mm_add_epi32(d1, d2); - - res_lo = _mm_unpacklo_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2); - res_hi = _mm_unpackhi_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2); - srcRegFilt32b2_1 = _mm_packs_epi32(res_lo, res_hi); - - // shift by 6 bit each 16 bit - srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); - srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32); - srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); - srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve result - srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); - - src_ptr += src_pixels_per_line; - - _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1); - - output_ptr += output_pitch; - } -} - -void aom_filter_block1d16_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *output_ptr, ptrdiff_t out_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i filtersReg; - __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; - __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi; - __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi; - __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; - __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi; - __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi; - __m128i resReg23_45, resReg34_56; - __m128i addFilterReg32, secondFilters, thirdFilters; - __m128i tmp_0, tmp_1; - unsigned int i; - ptrdiff_t src_stride, dst_stride; - - addFilterReg32 = _mm_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - filtersReg = _mm_srai_epi16(filtersReg, 1); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); - - secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 - thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 - - // multiply the size of the source and destination stride by two - src_stride = src_pitch << 1; - dst_stride = out_pitch << 1; - - srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); - srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); - srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3); - srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3); - __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128()); - __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128()); - __m128i resReg23_hi_1 = _mm_unpacklo_epi8(srcReg23_hi, _mm_setzero_si128()); - __m128i resReg23_hi_2 = _mm_unpackhi_epi8(srcReg23_hi, _mm_setzero_si128()); - - srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); - srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4); - srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4); - __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128()); - __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128()); - __m128i resReg34_hi_1 = _mm_unpacklo_epi8(srcReg34_hi, _mm_setzero_si128()); - __m128i resReg34_hi_2 = _mm_unpackhi_epi8(srcReg34_hi, _mm_setzero_si128()); - - for (i = output_height; i > 1; i -= 2) { - srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); - - srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5); - srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5); - - srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); - - srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6); - srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6); - - // multiply 2 adjacent elements with the filter and add the result - - tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters); - tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters); - resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1); - - tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters); - tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters); - resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1); - - __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128()); - __m128i resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128()); - tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters); - tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters); - resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1); - - __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128()); - __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128()); - tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters); - tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters); - resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1); - - // add and saturate the results together - resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo); - resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo); - - // multiply 2 adjacent elements with the filter and add the result - - tmp_0 = _mm_madd_epi16(resReg23_hi_1, secondFilters); - tmp_1 = _mm_madd_epi16(resReg23_hi_2, secondFilters); - resReg23_hi = _mm_packs_epi32(tmp_0, tmp_1); - - tmp_0 = _mm_madd_epi16(resReg34_hi_1, secondFilters); - tmp_1 = _mm_madd_epi16(resReg34_hi_2, secondFilters); - resReg34_hi = _mm_packs_epi32(tmp_0, tmp_1); - - __m128i resReg45_hi_1 = _mm_unpacklo_epi8(srcReg45_hi, _mm_setzero_si128()); - __m128i resReg45_hi_2 = _mm_unpackhi_epi8(srcReg45_hi, _mm_setzero_si128()); - tmp_0 = _mm_madd_epi16(resReg45_hi_1, thirdFilters); - tmp_1 = _mm_madd_epi16(resReg45_hi_2, thirdFilters); - resReg45_hi = _mm_packs_epi32(tmp_0, tmp_1); - - __m128i resReg56_hi_1 = _mm_unpacklo_epi8(srcReg56_hi, _mm_setzero_si128()); - __m128i resReg56_hi_2 = _mm_unpackhi_epi8(srcReg56_hi, _mm_setzero_si128()); - tmp_0 = _mm_madd_epi16(resReg56_hi_1, thirdFilters); - tmp_1 = _mm_madd_epi16(resReg56_hi_2, thirdFilters); - resReg56_hi = _mm_packs_epi32(tmp_0, tmp_1); - - // add and saturate the results together - resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi); - resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi); - - // shift by 6 bit each 16 bit - resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32); - resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32); - resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32); - resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32); - resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6); - resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6); - resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6); - resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi); - resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi); - - src_ptr += src_stride; - - _mm_store_si128((__m128i *)output_ptr, (resReg23_45)); - _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56)); - - output_ptr += dst_stride; - - // save part of the registers for next strides - resReg23_lo_1 = resReg45_lo_1; - resReg23_lo_2 = resReg45_lo_2; - resReg23_hi_1 = resReg45_hi_1; - resReg23_hi_2 = resReg45_hi_2; - resReg34_lo_1 = resReg56_lo_1; - resReg34_lo_2 = resReg56_lo_2; - resReg34_hi_1 = resReg56_hi_1; - resReg34_hi_2 = resReg56_hi_2; - srcReg4 = srcReg6; - } -} - -void aom_filter_block1d8_h4_sse2(const uint8_t *src_ptr, - ptrdiff_t src_pixels_per_line, - uint8_t *output_ptr, ptrdiff_t output_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i filtersReg; - __m128i addFilterReg32; - __m128i secondFilters, thirdFilters; - __m128i srcRegFilt32b1_1, srcRegFilt32b1_2; - __m128i srcReg32b1; - unsigned int i; - src_ptr -= 3; - addFilterReg32 = _mm_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - filtersReg = _mm_srai_epi16(filtersReg, 1); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); - - secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 - thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 - - for (i = output_height; i > 0; i -= 1) { - srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); - - __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2); - __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4); - ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); - ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); - __m128i d1 = _mm_madd_epi16(ss_2, secondFilters); - __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters); - srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); - - __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3); - __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5); - ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); - ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128()); - d1 = _mm_madd_epi16(ss_3, secondFilters); - d2 = _mm_madd_epi16(ss_5, thirdFilters); - srcRegFilt32b1_2 = _mm_add_epi32(d1, d2); - - __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); - __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); - srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi); - - // shift by 6 bit each 16 bit - srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); - srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve result - srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); - - src_ptr += src_pixels_per_line; - - _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1); - - output_ptr += output_pitch; - } -} - -void aom_filter_block1d8_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *output_ptr, ptrdiff_t out_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i filtersReg; - __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; - __m128i srcReg23_lo, srcReg34_lo; - __m128i srcReg45_lo, srcReg56_lo; - __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; - __m128i resReg23_45_lo, resReg34_56_lo; - __m128i resReg23_45, resReg34_56; - __m128i addFilterReg32, secondFilters, thirdFilters; - __m128i tmp_0, tmp_1; - unsigned int i; - ptrdiff_t src_stride, dst_stride; - - addFilterReg32 = _mm_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - filtersReg = _mm_srai_epi16(filtersReg, 1); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); - - secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 - thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 - - // multiply the size of the source and destination stride by two - src_stride = src_pitch << 1; - dst_stride = out_pitch << 1; - - srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); - srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); - srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3); - __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128()); - __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128()); - - srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); - srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4); - __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128()); - __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128()); - - for (i = output_height; i > 1; i -= 2) { - srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); - srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5); - - srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); - srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6); - - // multiply 2 adjacent elements with the filter and add the result - - tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters); - tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters); - resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1); - - tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters); - tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters); - resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1); - - __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128()); - __m128i resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128()); - tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters); - tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters); - resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1); - - __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128()); - __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128()); - tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters); - tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters); - resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1); - - // add and saturate the results together - resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo); - resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo); - - // shift by 6 bit each 16 bit - resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32); - resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32); - resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6); - resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - resReg23_45 = _mm_packus_epi16(resReg23_45_lo, _mm_setzero_si128()); - resReg34_56 = _mm_packus_epi16(resReg34_56_lo, _mm_setzero_si128()); - - src_ptr += src_stride; - - _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45)); - _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56)); - - output_ptr += dst_stride; - - // save part of the registers for next strides - resReg23_lo_1 = resReg45_lo_1; - resReg23_lo_2 = resReg45_lo_2; - resReg34_lo_1 = resReg56_lo_1; - resReg34_lo_2 = resReg56_lo_2; - srcReg4 = srcReg6; - } -} - -void aom_filter_block1d4_h4_sse2(const uint8_t *src_ptr, - ptrdiff_t src_pixels_per_line, - uint8_t *output_ptr, ptrdiff_t output_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i filtersReg; - __m128i addFilterReg32; - __m128i secondFilters, thirdFilters; - __m128i srcRegFilt32b1_1; - __m128i srcReg32b1; - unsigned int i; - src_ptr -= 3; - addFilterReg32 = _mm_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - filtersReg = _mm_srai_epi16(filtersReg, 1); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); - - secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 - thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 - - for (i = output_height; i > 0; i -= 1) { - srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); - - __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2); - __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3); - __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4); - __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5); - - ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); - ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); - ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); - ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128()); - - __m128i ss_1_1 = _mm_unpacklo_epi32(ss_2, ss_3); - __m128i ss_1_2 = _mm_unpacklo_epi32(ss_4, ss_5); - - __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters); - __m128i d2 = _mm_madd_epi16(ss_1_2, thirdFilters); - srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); - - srcRegFilt32b1_1 = _mm_packs_epi32(srcRegFilt32b1_1, _mm_setzero_si128()); - - // shift by 6 bit each 16 bit - srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); - srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve result - srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); - - src_ptr += src_pixels_per_line; - - *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1); - - output_ptr += output_pitch; - } -} - -void aom_filter_block1d4_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *output_ptr, ptrdiff_t out_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i filtersReg; - __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; - __m128i srcReg23, srcReg34, srcReg45, srcReg56; - __m128i resReg23_34, resReg45_56; - __m128i resReg23_34_45_56; - __m128i addFilterReg32, secondFilters, thirdFilters; - __m128i tmp_0, tmp_1; - unsigned int i; - ptrdiff_t src_stride, dst_stride; - - addFilterReg32 = _mm_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - filtersReg = _mm_srai_epi16(filtersReg, 1); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); - - secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 - thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 - - // multiply the size of the source and destination stride by two - src_stride = src_pitch << 1; - dst_stride = out_pitch << 1; - - srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3); - __m128i resReg23 = _mm_unpacklo_epi8(srcReg23, _mm_setzero_si128()); - - srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); - srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4); - __m128i resReg34 = _mm_unpacklo_epi8(srcReg34, _mm_setzero_si128()); - - for (i = output_height; i > 1; i -= 2) { - srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); - srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5); - srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); - srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6); - - // multiply 2 adjacent elements with the filter and add the result - tmp_0 = _mm_madd_epi16(resReg23, secondFilters); - tmp_1 = _mm_madd_epi16(resReg34, secondFilters); - resReg23_34 = _mm_packs_epi32(tmp_0, tmp_1); - - __m128i resReg45 = _mm_unpacklo_epi8(srcReg45, _mm_setzero_si128()); - __m128i resReg56 = _mm_unpacklo_epi8(srcReg56, _mm_setzero_si128()); - - tmp_0 = _mm_madd_epi16(resReg45, thirdFilters); - tmp_1 = _mm_madd_epi16(resReg56, thirdFilters); - resReg45_56 = _mm_packs_epi32(tmp_0, tmp_1); - - // add and saturate the results together - resReg23_34_45_56 = _mm_adds_epi16(resReg23_34, resReg45_56); - - // shift by 6 bit each 16 bit - resReg23_34_45_56 = _mm_adds_epi16(resReg23_34_45_56, addFilterReg32); - resReg23_34_45_56 = _mm_srai_epi16(resReg23_34_45_56, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - resReg23_34_45_56 = - _mm_packus_epi16(resReg23_34_45_56, _mm_setzero_si128()); - - src_ptr += src_stride; - - *((int *)(output_ptr)) = _mm_cvtsi128_si32(resReg23_34_45_56); - *((int *)(output_ptr + out_pitch)) = - _mm_cvtsi128_si32(_mm_srli_si128(resReg23_34_45_56, 4)); - - output_ptr += dst_stride; - - // save part of the registers for next strides - resReg23 = resReg45; - resReg34 = resReg56; - srcReg4 = srcReg6; - } -} diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm deleted file mode 100644 index 640c5b2416..0000000000 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm +++ /dev/null @@ -1,615 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - -%include "aom_ports/x86_abi_support.asm" - -;Note: tap3 and tap4 have to be applied and added after other taps to avoid -;overflow. - -%macro GET_FILTERS_4 0 - mov rdx, arg(5) ;filter ptr - mov rcx, 0x0400040 - - movdqa xmm7, [rdx] ;load filters - pshuflw xmm0, xmm7, 0b ;k0 - pshuflw xmm1, xmm7, 01010101b ;k1 - pshuflw xmm2, xmm7, 10101010b ;k2 - pshuflw xmm3, xmm7, 11111111b ;k3 - psrldq xmm7, 8 - pshuflw xmm4, xmm7, 0b ;k4 - pshuflw xmm5, xmm7, 01010101b ;k5 - pshuflw xmm6, xmm7, 10101010b ;k6 - pshuflw xmm7, xmm7, 11111111b ;k7 - - punpcklqdq xmm0, xmm1 - punpcklqdq xmm2, xmm3 - punpcklqdq xmm5, xmm4 - punpcklqdq xmm6, xmm7 - - movdqa k0k1, xmm0 - movdqa k2k3, xmm2 - movdqa k5k4, xmm5 - movdqa k6k7, xmm6 - - movq xmm6, rcx - pshufd xmm6, xmm6, 0 - movdqa krd, xmm6 - - pxor xmm7, xmm7 - movdqa zero, xmm7 -%endm - -%macro APPLY_FILTER_4 1 - punpckldq xmm0, xmm1 ;two row in one register - punpckldq xmm6, xmm7 - punpckldq xmm2, xmm3 - punpckldq xmm5, xmm4 - - punpcklbw xmm0, zero ;unpack to word - punpcklbw xmm6, zero - punpcklbw xmm2, zero - punpcklbw xmm5, zero - - pmullw xmm0, k0k1 ;multiply the filter factors - pmullw xmm6, k6k7 - pmullw xmm2, k2k3 - pmullw xmm5, k5k4 - - paddsw xmm0, xmm6 ;sum - movdqa xmm1, xmm0 - psrldq xmm1, 8 - paddsw xmm0, xmm1 - paddsw xmm0, xmm2 - psrldq xmm2, 8 - paddsw xmm0, xmm5 - psrldq xmm5, 8 - paddsw xmm0, xmm2 - paddsw xmm0, xmm5 - - paddsw xmm0, krd ;rounding - psraw xmm0, 7 ;shift - packuswb xmm0, xmm0 ;pack to byte - -%if %1 - movd xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movd [rdi], xmm0 -%endm - -%macro GET_FILTERS 0 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm7, [rdx] ;load filters - pshuflw xmm0, xmm7, 0b ;k0 - pshuflw xmm1, xmm7, 01010101b ;k1 - pshuflw xmm2, xmm7, 10101010b ;k2 - pshuflw xmm3, xmm7, 11111111b ;k3 - pshufhw xmm4, xmm7, 0b ;k4 - pshufhw xmm5, xmm7, 01010101b ;k5 - pshufhw xmm6, xmm7, 10101010b ;k6 - pshufhw xmm7, xmm7, 11111111b ;k7 - - punpcklwd xmm0, xmm0 - punpcklwd xmm1, xmm1 - punpcklwd xmm2, xmm2 - punpcklwd xmm3, xmm3 - punpckhwd xmm4, xmm4 - punpckhwd xmm5, xmm5 - punpckhwd xmm6, xmm6 - punpckhwd xmm7, xmm7 - - movdqa k0, xmm0 ;store filter factors on stack - movdqa k1, xmm1 - movdqa k2, xmm2 - movdqa k3, xmm3 - movdqa k4, xmm4 - movdqa k5, xmm5 - movdqa k6, xmm6 - movdqa k7, xmm7 - - movq xmm6, rcx - pshufd xmm6, xmm6, 0 - movdqa krd, xmm6 ;rounding - - pxor xmm7, xmm7 - movdqa zero, xmm7 -%endm - -%macro LOAD_VERT_8 1 - movq xmm0, [rsi + %1] ;0 - movq xmm1, [rsi + rax + %1] ;1 - movq xmm6, [rsi + rdx * 2 + %1] ;6 - lea rsi, [rsi + rax] - movq xmm7, [rsi + rdx * 2 + %1] ;7 - movq xmm2, [rsi + rax + %1] ;2 - movq xmm3, [rsi + rax * 2 + %1] ;3 - movq xmm4, [rsi + rdx + %1] ;4 - movq xmm5, [rsi + rax * 4 + %1] ;5 -%endm - -%macro APPLY_FILTER_8 2 - punpcklbw xmm0, zero - punpcklbw xmm1, zero - punpcklbw xmm6, zero - punpcklbw xmm7, zero - punpcklbw xmm2, zero - punpcklbw xmm5, zero - punpcklbw xmm3, zero - punpcklbw xmm4, zero - - pmullw xmm0, k0 - pmullw xmm1, k1 - pmullw xmm6, k6 - pmullw xmm7, k7 - pmullw xmm2, k2 - pmullw xmm5, k5 - pmullw xmm3, k3 - pmullw xmm4, k4 - - paddsw xmm0, xmm1 - paddsw xmm0, xmm6 - paddsw xmm0, xmm7 - paddsw xmm0, xmm2 - paddsw xmm0, xmm5 - paddsw xmm0, xmm3 - paddsw xmm0, xmm4 - - paddsw xmm0, krd ;rounding - psraw xmm0, 7 ;shift - packuswb xmm0, xmm0 ;pack back to byte -%if %1 - movq xmm1, [rdi + %2] - pavgb xmm0, xmm1 -%endif - movq [rdi + %2], xmm0 -%endm - -SECTION .text - -;void aom_filter_block1d4_v8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -globalsym(aom_filter_block1d4_v8_sse2) -sym(aom_filter_block1d4_v8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 6 - %define k0k1 [rsp + 16 * 0] - %define k2k3 [rsp + 16 * 1] - %define k5k4 [rsp + 16 * 2] - %define k6k7 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define zero [rsp + 16 * 5] - - GET_FILTERS_4 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movd xmm0, [rsi] ;load src: row 0 - movd xmm1, [rsi + rax] ;1 - movd xmm6, [rsi + rdx * 2] ;6 - lea rsi, [rsi + rax] - movd xmm7, [rsi + rdx * 2] ;7 - movd xmm2, [rsi + rax] ;2 - movd xmm3, [rsi + rax * 2] ;3 - movd xmm4, [rsi + rdx] ;4 - movd xmm5, [rsi + rax * 4] ;5 - - APPLY_FILTER_4 0 - - lea rdi, [rdi + rbx] - dec rcx - jnz .loop - - add rsp, 16 * 6 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void aom_filter_block1d8_v8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -globalsym(aom_filter_block1d8_v8_sse2) -sym(aom_filter_block1d8_v8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - LOAD_VERT_8 0 - APPLY_FILTER_8 0, 0 - - lea rdi, [rdi + rbx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void aom_filter_block1d16_v8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -globalsym(aom_filter_block1d16_v8_sse2) -sym(aom_filter_block1d16_v8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - LOAD_VERT_8 0 - APPLY_FILTER_8 0, 0 - sub rsi, rax - - LOAD_VERT_8 8 - APPLY_FILTER_8 0, 8 - add rdi, rbx - - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void aom_filter_block1d4_h8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -globalsym(aom_filter_block1d4_h8_sse2) -sym(aom_filter_block1d4_h8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 6 - %define k0k1 [rsp + 16 * 0] - %define k2k3 [rsp + 16 * 1] - %define k5k4 [rsp + 16 * 2] - %define k6k7 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define zero [rsp + 16 * 5] - - GET_FILTERS_4 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm3, xmm0 - movdqa xmm5, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm3, 3 - psrldq xmm5, 5 - psrldq xmm4, 4 - - APPLY_FILTER_4 0 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 6 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void aom_filter_block1d8_h8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -globalsym(aom_filter_block1d8_h8_sse2) -sym(aom_filter_block1d8_h8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 0, 0 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void aom_filter_block1d16_h8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -globalsym(aom_filter_block1d16_h8_sse2) -sym(aom_filter_block1d16_h8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 0, 0 - - movdqu xmm0, [rsi + 5] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 0, 8 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm deleted file mode 100644 index 90dd55a4be..0000000000 --- a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm +++ /dev/null @@ -1,295 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - -%include "aom_ports/x86_abi_support.asm" - -%macro GET_PARAM_4 0 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm3, [rdx] ;load filters - pshuflw xmm4, xmm3, 11111111b ;k3 - psrldq xmm3, 8 - pshuflw xmm3, xmm3, 0b ;k4 - punpcklqdq xmm4, xmm3 ;k3k4 - - movq xmm3, rcx ;rounding - pshufd xmm3, xmm3, 0 - - pxor xmm2, xmm2 - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height -%endm - -%macro APPLY_FILTER_4 1 - - punpckldq xmm0, xmm1 ;two row in one register - punpcklbw xmm0, xmm2 ;unpack to word - pmullw xmm0, xmm4 ;multiply the filter factors - - movdqa xmm1, xmm0 - psrldq xmm1, 8 - paddsw xmm0, xmm1 - - paddsw xmm0, xmm3 ;rounding - psraw xmm0, 7 ;shift - packuswb xmm0, xmm0 ;pack to byte - -%if %1 - movd xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - - movd [rdi], xmm0 - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx -%endm - -%macro GET_PARAM 0 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm7, [rdx] ;load filters - - pshuflw xmm6, xmm7, 11111111b ;k3 - pshufhw xmm7, xmm7, 0b ;k4 - punpcklwd xmm6, xmm6 - punpckhwd xmm7, xmm7 - - movq xmm4, rcx ;rounding - pshufd xmm4, xmm4, 0 - - pxor xmm5, xmm5 - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height -%endm - -%macro APPLY_FILTER_8 1 - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - - pmullw xmm0, xmm6 - pmullw xmm1, xmm7 - paddsw xmm0, xmm1 - paddsw xmm0, xmm4 ;rounding - psraw xmm0, 7 ;shift - packuswb xmm0, xmm0 ;pack back to byte -%if %1 - movq xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movq [rdi], xmm0 ;store the result - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx -%endm - -%macro APPLY_FILTER_16 1 - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - punpckhbw xmm2, xmm5 - punpckhbw xmm3, xmm5 - - pmullw xmm0, xmm6 - pmullw xmm1, xmm7 - pmullw xmm2, xmm6 - pmullw xmm3, xmm7 - - paddsw xmm0, xmm1 - paddsw xmm2, xmm3 - - paddsw xmm0, xmm4 ;rounding - paddsw xmm2, xmm4 - psraw xmm0, 7 ;shift - psraw xmm2, 7 - packuswb xmm0, xmm2 ;pack back to byte -%if %1 - movdqu xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movdqu [rdi], xmm0 ;store the result - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx -%endm - -SECTION .text - -globalsym(aom_filter_block1d4_v2_sse2) -sym(aom_filter_block1d4_v2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movd xmm0, [rsi] ;load src - movd xmm1, [rsi + rax] - - APPLY_FILTER_4 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -globalsym(aom_filter_block1d8_v2_sse2) -sym(aom_filter_block1d8_v2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movq xmm0, [rsi] ;0 - movq xmm1, [rsi + rax] ;1 - - APPLY_FILTER_8 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -globalsym(aom_filter_block1d16_v2_sse2) -sym(aom_filter_block1d16_v2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;0 - movdqu xmm1, [rsi + rax] ;1 - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - - APPLY_FILTER_16 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -globalsym(aom_filter_block1d4_h2_sse2) -sym(aom_filter_block1d4_h2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_4 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -globalsym(aom_filter_block1d8_h2_sse2) -sym(aom_filter_block1d8_h2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_8 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -globalsym(aom_filter_block1d16_h2_sse2) -sym(aom_filter_block1d16_h2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqu xmm1, [rsi + 1] - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - - APPLY_FILTER_16 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff --git a/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c b/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c index 9ab9143eee..0b552b704b 100644 --- a/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c +++ b/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c @@ -133,7 +133,7 @@ unsigned int aom_avg_8x8_sse2(const uint8_t *s, int p) { return (avg + 32) >> 6; } -void calc_avg_8x8_dual_sse2(const uint8_t *s, int p, int *avg) { +static void calc_avg_8x8_dual_sse2(const uint8_t *s, int p, int *avg) { __m128i sum0, sum1, s0, s1, s2, s3, u0; u0 = _mm_setzero_si128(); s0 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s)), u0); diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h index 7ee8ba330e..e1db3b950c 100644 --- a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h +++ b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h @@ -30,6 +30,7 @@ #define SUB_EPI16 _mm_sub_epi16 #endif +#if defined(FDCT4x4_2D_HELPER) static void FDCT4x4_2D_HELPER(const int16_t *input, int stride, __m128i *in0, __m128i *in1) { // Constants @@ -185,7 +186,9 @@ static void FDCT4x4_2D_HELPER(const int16_t *input, int stride, __m128i *in0, } } } +#endif // defined(FDCT4x4_2D_HELPER) +#if defined(FDCT4x4_2D) void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) { // This 2D transform implements 4 vertical 1D transforms followed // by 4 horizontal 1D transforms. The multiplies and adds are as given @@ -205,13 +208,16 @@ void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) { storeu_output(&in0, output + 0 * 4); storeu_output(&in1, output + 2 * 4); } +#endif // defined(FDCT4x4_2D) +#if defined(FDCT4x4_2D_LP) void FDCT4x4_2D_LP(const int16_t *input, int16_t *output, int stride) { __m128i in0, in1; FDCT4x4_2D_HELPER(input, stride, &in0, &in1); _mm_storeu_si128((__m128i *)(output + 0 * 4), in0); _mm_storeu_si128((__m128i *)(output + 2 * 4), in1); } +#endif // defined(FDCT4x4_2D_LP) #if CONFIG_INTERNAL_STATS void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) { diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c index b4ff91d856..21e9e8b282 100644 --- a/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c +++ b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c @@ -618,9 +618,9 @@ static uint32_t aom_highbd_var_filter_block2d_bil_avx2( return (var > 0) ? var : 0; } -void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - uint32_t *sse, int *sum) { +static void highbd_calc8x8var_avx2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum) { __m256i v_sum_d = _mm256_setzero_si256(); __m256i v_sse_d = _mm256_setzero_si256(); for (int i = 0; i < 8; i += 2) { @@ -653,9 +653,9 @@ void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride, *sse = _mm_extract_epi32(v_d, 1); } -void aom_highbd_calc16x16var_avx2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - uint32_t *sse, int *sum) { +static void highbd_calc16x16var_avx2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum) { __m256i v_sum_d = _mm256_setzero_si256(); __m256i v_sse_d = _mm256_setzero_si256(); const __m256i one = _mm256_set1_epi16(1); @@ -703,19 +703,19 @@ static void highbd_10_variance_avx2(const uint16_t *src, int src_stride, *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); } -#define VAR_FN(w, h, block_size, shift) \ - uint32_t aom_highbd_10_variance##w##x##h##_avx2( \ - const uint8_t *src8, int src_stride, const uint8_t *ref8, \ - int ref_stride, uint32_t *sse) { \ - int sum; \ - int64_t var; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - highbd_10_variance_avx2( \ - src, src_stride, ref, ref_stride, w, h, sse, &sum, \ - aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ - return (var >= 0) ? (uint32_t)var : 0; \ +#define VAR_FN(w, h, block_size, shift) \ + uint32_t aom_highbd_10_variance##w##x##h##_avx2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_10_variance_avx2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + highbd_calc##block_size##x##block_size##var_avx2, \ + block_size); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + return (var >= 0) ? (uint32_t)var : 0; \ } VAR_FN(128, 128, 16, 14) @@ -741,6 +741,17 @@ VAR_FN(8, 32, 8, 8) #undef VAR_FN +unsigned int aom_highbd_10_mse16x16_avx2(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, + unsigned int *sse) { + int sum; + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + highbd_10_variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, + highbd_calc16x16var_avx2, 16); + return *sse; +} + #define SSE2_HEIGHT(H) \ uint32_t aom_highbd_10_sub_pixel_variance8x##H##_sse2( \ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \ @@ -749,7 +760,7 @@ VAR_FN(8, 32, 8, 8) SSE2_HEIGHT(8) SSE2_HEIGHT(16) -#undef SSE2_Height +#undef SSE2_HEIGHT #define HIGHBD_SUBPIX_VAR(W, H) \ uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_avx2( \ @@ -782,8 +793,8 @@ HIGHBD_SUBPIX_VAR(8, 8) #undef HIGHBD_SUBPIX_VAR -uint64_t aom_mse_4xh_16bit_highbd_avx2(uint16_t *dst, int dstride, - uint16_t *src, int sstride, int h) { +static uint64_t mse_4xh_16bit_highbd_avx2(uint16_t *dst, int dstride, + uint16_t *src, int sstride, int h) { uint64_t sum = 0; __m128i reg0_4x16, reg1_4x16, reg2_4x16, reg3_4x16; __m256i src0_8x16, src1_8x16, src_16x16; @@ -840,8 +851,8 @@ uint64_t aom_mse_4xh_16bit_highbd_avx2(uint16_t *dst, int dstride, return sum; } -uint64_t aom_mse_8xh_16bit_highbd_avx2(uint16_t *dst, int dstride, - uint16_t *src, int sstride, int h) { +static uint64_t mse_8xh_16bit_highbd_avx2(uint16_t *dst, int dstride, + uint16_t *src, int sstride, int h) { uint64_t sum = 0; __m256i src0_8x16, src1_8x16, src_16x16; __m256i dst0_8x16, dst1_8x16, dst_16x16; @@ -897,8 +908,8 @@ uint64_t aom_mse_wxh_16bit_highbd_avx2(uint16_t *dst, int dstride, assert((w == 8 || w == 4) && (h == 8 || h == 4) && "w=8/4 and h=8/4 must satisfy"); switch (w) { - case 4: return aom_mse_4xh_16bit_highbd_avx2(dst, dstride, src, sstride, h); - case 8: return aom_mse_8xh_16bit_highbd_avx2(dst, dstride, src, sstride, h); + case 4: return mse_4xh_16bit_highbd_avx2(dst, dstride, src, sstride, h); + case 8: return mse_8xh_16bit_highbd_avx2(dst, dstride, src, sstride, h); default: assert(0 && "unsupported width"); return -1; } } diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c index e897aab645..2fc2e1c0dd 100644 --- a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c +++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c @@ -637,8 +637,8 @@ void aom_highbd_dist_wtd_comp_avg_pred_sse2( } } -uint64_t aom_mse_4xh_16bit_highbd_sse2(uint16_t *dst, int dstride, - uint16_t *src, int sstride, int h) { +static uint64_t mse_4xh_16bit_highbd_sse2(uint16_t *dst, int dstride, + uint16_t *src, int sstride, int h) { uint64_t sum = 0; __m128i reg0_4x16, reg1_4x16; __m128i src_8x16; @@ -682,8 +682,8 @@ uint64_t aom_mse_4xh_16bit_highbd_sse2(uint16_t *dst, int dstride, return sum; } -uint64_t aom_mse_8xh_16bit_highbd_sse2(uint16_t *dst, int dstride, - uint16_t *src, int sstride, int h) { +static uint64_t mse_8xh_16bit_highbd_sse2(uint16_t *dst, int dstride, + uint16_t *src, int sstride, int h) { uint64_t sum = 0; __m128i src_8x16; __m128i dst_8x16; @@ -728,8 +728,8 @@ uint64_t aom_mse_wxh_16bit_highbd_sse2(uint16_t *dst, int dstride, assert((w == 8 || w == 4) && (h == 8 || h == 4) && "w=8/4 and h=8/4 must satisfy"); switch (w) { - case 4: return aom_mse_4xh_16bit_highbd_sse2(dst, dstride, src, sstride, h); - case 8: return aom_mse_8xh_16bit_highbd_sse2(dst, dstride, src, sstride, h); + case 4: return mse_4xh_16bit_highbd_sse2(dst, dstride, src, sstride, h); + case 8: return mse_8xh_16bit_highbd_sse2(dst, dstride, src, sstride, h); default: assert(0 && "unsupported width"); return -1; } } diff --git a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c index fd48260c6f..869f880bda 100644 --- a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c +++ b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c @@ -940,10 +940,10 @@ static AOM_FORCE_INLINE __m128i cvtepu16_epi32(__m128i x) { return _mm_unpacklo_epi16((x), _mm_setzero_si128()); } -void smooth_predictor_wxh(uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, - const uint8_t *LIBAOM_RESTRICT top_row, - const uint8_t *LIBAOM_RESTRICT left_column, int width, - int height) { +static void smooth_predictor_wxh(uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride, + const uint8_t *LIBAOM_RESTRICT top_row, + const uint8_t *LIBAOM_RESTRICT left_column, + int width, int height) { const uint8_t *const sm_weights_h = smooth_weights + height - 4; const uint8_t *const sm_weights_w = smooth_weights + width - 4; const __m128i zero = _mm_setzero_si128(); diff --git a/third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c index 799ce9ef44..d96a9dd23d 100644 --- a/third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c +++ b/third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c @@ -103,11 +103,12 @@ static INLINE void masked_sadx4d_ssse3(const uint8_t *src_ptr, int src_stride, pred = _mm_packus_epi16(pred_l, pred_r); \ res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src)); -void aom_masked_sad8xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_array[4], int a_stride, - const uint8_t *b_ptr, int b_stride, - const uint8_t *m_ptr, int m_stride, int height, - int inv_mask, unsigned sad_array[4]) { +static void masked_sad8xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_array[4], int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height, int inv_mask, + unsigned sad_array[4]) { const uint8_t *ref0 = ref_array[0]; const uint8_t *ref1 = ref_array[1]; const uint8_t *ref2 = ref_array[2]; @@ -164,11 +165,12 @@ void aom_masked_sad8xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, pred = _mm_packus_epi16(pred, _mm_setzero_si128()); \ res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src)); -void aom_masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, - const uint8_t *ref_array[4], int a_stride, - const uint8_t *b_ptr, int b_stride, - const uint8_t *m_ptr, int m_stride, int height, - int inv_mask, unsigned sad_array[4]) { +static void masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_array[4], int a_stride, + const uint8_t *b_ptr, int b_stride, + const uint8_t *m_ptr, int m_stride, + int height, int inv_mask, + unsigned sad_array[4]) { const uint8_t *ref0 = ref_array[0]; const uint8_t *ref1 = ref_array[1]; const uint8_t *ref2 = ref_array[2]; @@ -224,22 +226,22 @@ void aom_masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride, msk_stride, m, n, inv_mask, sad_array); \ } -#define MASKSAD8XN_SSSE3(n) \ - void aom_masked_sad8x##n##x4d_ssse3( \ - const uint8_t *src, int src_stride, const uint8_t *ref[4], \ - int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ - int msk_stride, int inv_mask, unsigned sad_array[4]) { \ - aom_masked_sad8xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, \ - 8, msk, msk_stride, n, inv_mask, sad_array); \ +#define MASKSAD8XN_SSSE3(n) \ + void aom_masked_sad8x##n##x4d_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref[4], \ + int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ + int msk_stride, int inv_mask, unsigned sad_array[4]) { \ + masked_sad8xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, 8, \ + msk, msk_stride, n, inv_mask, sad_array); \ } -#define MASKSAD4XN_SSSE3(n) \ - void aom_masked_sad4x##n##x4d_ssse3( \ - const uint8_t *src, int src_stride, const uint8_t *ref[4], \ - int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ - int msk_stride, int inv_mask, unsigned sad_array[4]) { \ - aom_masked_sad4xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, \ - 4, msk, msk_stride, n, inv_mask, sad_array); \ +#define MASKSAD4XN_SSSE3(n) \ + void aom_masked_sad4x##n##x4d_ssse3( \ + const uint8_t *src, int src_stride, const uint8_t *ref[4], \ + int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \ + int msk_stride, int inv_mask, unsigned sad_array[4]) { \ + masked_sad4xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, 4, \ + msk, msk_stride, n, inv_mask, sad_array); \ } MASKSADMXN_SSSE3(128, 128) diff --git a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm b/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm deleted file mode 100644 index d1d8373456..0000000000 --- a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm +++ /dev/null @@ -1,1470 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -pw_8: times 8 dw 8 -bilin_filter_m_sse2: times 8 dw 16 - times 8 dw 0 - times 8 dw 14 - times 8 dw 2 - times 8 dw 12 - times 8 dw 4 - times 8 dw 10 - times 8 dw 6 - times 16 dw 8 - times 8 dw 6 - times 8 dw 10 - times 8 dw 4 - times 8 dw 12 - times 8 dw 2 - times 8 dw 14 - -bilin_filter_m_ssse3: times 8 db 16, 0 - times 8 db 14, 2 - times 8 db 12, 4 - times 8 db 10, 6 - times 16 db 8 - times 8 db 6, 10 - times 8 db 4, 12 - times 8 db 2, 14 - -SECTION .text - -; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, -; int x_offset, int y_offset, -; const uint8_t *dst, ptrdiff_t dst_stride, -; int height, unsigned int *sse); -; -; This function returns the SE and stores SSE in the given pointer. - -%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse - psubw %3, %4 - psubw %1, %2 - paddw %5, %3 - pmaddwd %3, %3 - paddw %5, %1 - pmaddwd %1, %1 - paddd %6, %3 - paddd %6, %1 -%endmacro - -%macro STORE_AND_RET 1 -%if %1 > 4 - ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit - ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg. - ; We have to sign-extend it before adding the words within the register - ; and outputing to a dword. - pcmpgtw m5, m6 ; mask for 0 > x - movhlps m3, m7 - punpcklwd m4, m6, m5 - punpckhwd m6, m5 ; sign-extend m6 word->dword - paddd m7, m3 - paddd m6, m4 - pshufd m3, m7, 0x1 - movhlps m4, m6 - paddd m7, m3 - paddd m6, m4 - mov r1, ssem ; r1 = unsigned int *sse - pshufd m4, m6, 0x1 - movd [r1], m7 ; store sse - paddd m6, m4 - movd raxd, m6 ; store sum as return value -%else ; 4xh - pshuflw m4, m6, 0xe - pshuflw m3, m7, 0xe - paddw m6, m4 - paddd m7, m3 - pcmpgtw m5, m6 ; mask for 0 > x - mov r1, ssem ; r1 = unsigned int *sse - punpcklwd m6, m5 ; sign-extend m6 word->dword - movd [r1], m7 ; store sse - pshuflw m4, m6, 0xe - paddd m6, m4 - movd raxd, m6 ; store sum as return value -%endif - RET -%endmacro - -%macro INC_SRC_BY_SRC_STRIDE 0 -%if AOM_ARCH_X86=1 && CONFIG_PIC=1 - add srcq, src_stridemp -%else - add srcq, src_strideq -%endif -%endmacro - -%macro SUBPEL_VARIANCE 1-2 0 ; W -%if cpuflag(ssse3) -%define bilin_filter_m bilin_filter_m_ssse3 -%define filter_idx_shift 4 -%else -%define bilin_filter_m bilin_filter_m_sse2 -%define filter_idx_shift 5 -%endif -; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses -; 11, not 13, if the registers are ordered correctly. May make a minor speed -; difference on Win64 - -%if AOM_ARCH_X86_64 - %if %2 == 1 ; avg - cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, \ - sec, sec_stride, height, sse - %define sec_str sec_strideq - %else - cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, \ - height, sse - %endif - %define block_height heightd - %define bilin_filter sseq -%else - %if CONFIG_PIC=1 - %if %2 == 1 ; avg - cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, \ - sec, sec_stride, height, sse - %define block_height dword heightm - %define sec_str sec_stridemp - %else - cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, \ - height, sse - %define block_height heightd - %endif - - ; reuse argument stack space - %define g_bilin_filterm x_offsetm - %define g_pw_8m y_offsetm - - ;Store bilin_filter and pw_8 location in stack - %if GET_GOT_DEFINED == 1 - GET_GOT eax - add esp, 4 ; restore esp - %endif - - lea ecx, [GLOBAL(bilin_filter_m)] - mov g_bilin_filterm, ecx - - lea ecx, [GLOBAL(pw_8)] - mov g_pw_8m, ecx - - LOAD_IF_USED 0, 1 ; load eax, ecx back - %else - %if %2 == 1 ; avg - cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, sec, sec_stride, \ - height, sse - %define block_height dword heightm - %define sec_str sec_stridemp - %else - cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, \ - height, sse - %define block_height heightd - %endif - %define bilin_filter bilin_filter_m - %endif -%endif - -%if %1 == 4 - %define movx movd -%else - %define movx movh -%endif - - ASSERT %1 <= 16 ; m6 overflows if w > 16 - pxor m6, m6 ; sum - pxor m7, m7 ; sse - ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we - ; could perhaps use it for something more productive then - pxor m5, m5 ; dedicated zero register -%if %1 < 16 - sar block_height, 1 -%if %2 == 1 ; avg - shl sec_str, 1 -%endif -%endif - - ; FIXME(rbultje) replace by jumptable? - test x_offsetd, x_offsetd - jnz .x_nonzero - ; x_offset == 0 - test y_offsetd, y_offsetd - jnz .x_zero_y_nonzero - - ; x_offset == 0 && y_offset == 0 -.x_zero_y_zero_loop: -%if %1 == 16 - movu m0, [srcq] - mova m1, [dstq] -%if %2 == 1 ; avg - pavgb m0, [secq] - punpckhbw m3, m1, m5 - punpcklbw m1, m5 -%endif - punpckhbw m2, m0, m5 - punpcklbw m0, m5 - -%if %2 == 0 ; !avg - punpckhbw m3, m1, m5 - punpcklbw m1, m5 -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - - add srcq, src_strideq - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] -%if %2 == 1 ; avg -%if %1 > 4 - movhps m0, [srcq+src_strideq] -%else ; 4xh - movx m1, [srcq+src_strideq] - punpckldq m0, m1 -%endif -%else ; !avg - movx m2, [srcq+src_strideq] -%endif - - movx m1, [dstq] - movx m3, [dstq+dst_strideq] - -%if %2 == 1 ; avg -%if %1 > 4 - pavgb m0, [secq] -%else - movh m2, [secq] - pavgb m0, m2 -%endif - punpcklbw m3, m5 - punpcklbw m1, m5 -%if %1 > 4 - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else ; 4xh - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%else ; !avg - punpcklbw m0, m5 - punpcklbw m2, m5 - punpcklbw m3, m5 - punpcklbw m1, m5 -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_zero_y_zero_loop - STORE_AND_RET %1 - -.x_zero_y_nonzero: - cmp y_offsetd, 4 - jne .x_zero_y_nonhalf - - ; x_offset == 0 && y_offset == 0.5 -.x_zero_y_half_loop: -%if %1 == 16 - movu m0, [srcq] - movu m4, [srcq+src_strideq] - mova m1, [dstq] - pavgb m0, m4 - punpckhbw m3, m1, m5 -%if %2 == 1 ; avg - pavgb m0, [secq] -%endif - punpcklbw m1, m5 - punpckhbw m2, m0, m5 - punpcklbw m0, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - - add srcq, src_strideq - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] - movx m2, [srcq+src_strideq] -%if %2 == 1 ; avg -%if %1 > 4 - movhps m2, [srcq+src_strideq*2] -%else ; 4xh - movx m1, [srcq+src_strideq*2] - punpckldq m2, m1 -%endif - movx m1, [dstq] -%if %1 > 4 - movlhps m0, m2 -%else ; 4xh - punpckldq m0, m2 -%endif - movx m3, [dstq+dst_strideq] - pavgb m0, m2 - punpcklbw m1, m5 -%if %1 > 4 - pavgb m0, [secq] - punpcklbw m3, m5 - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else ; 4xh - movh m4, [secq] - pavgb m0, m4 - punpcklbw m3, m5 - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%else ; !avg - movx m4, [srcq+src_strideq*2] - movx m1, [dstq] - pavgb m0, m2 - movx m3, [dstq+dst_strideq] - pavgb m2, m4 - punpcklbw m0, m5 - punpcklbw m2, m5 - punpcklbw m3, m5 - punpcklbw m1, m5 -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_zero_y_half_loop - STORE_AND_RET %1 - -.x_zero_y_nonhalf: - ; x_offset == 0 && y_offset == bilin interpolation -%if AOM_ARCH_X86_64 - lea bilin_filter, [GLOBAL(bilin_filter_m)] -%endif - shl y_offsetd, filter_idx_shift -%if AOM_ARCH_X86_64 && %1 > 4 - mova m8, [bilin_filter+y_offsetq] -%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 - mova m9, [bilin_filter+y_offsetq+16] -%endif - mova m10, [GLOBAL(pw_8)] -%define filter_y_a m8 -%define filter_y_b m9 -%define filter_rnd m10 -%else ; x86-32 or mmx -%if AOM_ARCH_X86=1 && CONFIG_PIC=1 -; x_offset == 0, reuse x_offset reg -%define tempq x_offsetq - add y_offsetq, g_bilin_filterm -%define filter_y_a [y_offsetq] -%define filter_y_b [y_offsetq+16] - mov tempq, g_pw_8m -%define filter_rnd [tempq] -%else - add y_offsetq, bilin_filter -%define filter_y_a [y_offsetq] -%define filter_y_b [y_offsetq+16] -%define filter_rnd [GLOBAL(pw_8)] -%endif -%endif - -.x_zero_y_other_loop: -%if %1 == 16 - movu m0, [srcq] - movu m4, [srcq+src_strideq] - mova m1, [dstq] -%if cpuflag(ssse3) - punpckhbw m2, m0, m4 - punpcklbw m0, m4 - pmaddubsw m2, filter_y_a - pmaddubsw m0, filter_y_a - paddw m2, filter_rnd - paddw m0, filter_rnd -%else - punpckhbw m2, m0, m5 - punpckhbw m3, m4, m5 - punpcklbw m0, m5 - punpcklbw m4, m5 - ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can - ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of - ; instructions is the same (5), but it is 1 mul instead of 2, so might be - ; slightly faster because of pmullw latency. It would also cut our rodata - ; tables in half for this function, and save 1-2 registers on x86-64. - pmullw m2, filter_y_a - pmullw m3, filter_y_b - paddw m2, filter_rnd - pmullw m0, filter_y_a - pmullw m4, filter_y_b - paddw m0, filter_rnd - paddw m2, m3 - paddw m0, m4 -%endif - psraw m2, 4 - psraw m0, 4 -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline - packuswb m0, m2 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%endif - punpckhbw m3, m1, m5 - punpcklbw m1, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - - add srcq, src_strideq - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] - movx m2, [srcq+src_strideq] - movx m4, [srcq+src_strideq*2] - movx m3, [dstq+dst_strideq] -%if cpuflag(ssse3) - movx m1, [dstq] - punpcklbw m0, m2 - punpcklbw m2, m4 - pmaddubsw m0, filter_y_a - pmaddubsw m2, filter_y_a - punpcklbw m3, m5 - paddw m2, filter_rnd - paddw m0, filter_rnd -%else - punpcklbw m0, m5 - punpcklbw m2, m5 - punpcklbw m4, m5 - pmullw m0, filter_y_a - pmullw m1, m2, filter_y_b - punpcklbw m3, m5 - paddw m0, filter_rnd - pmullw m2, filter_y_a - pmullw m4, filter_y_b - paddw m0, m1 - paddw m2, filter_rnd - movx m1, [dstq] - paddw m2, m4 -%endif - psraw m0, 4 - psraw m2, 4 -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline -%if %1 == 4 - movlhps m0, m2 -%endif - packuswb m0, m2 -%if %1 > 4 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else ; 4xh - movh m2, [secq] - pavgb m0, m2 - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%endif - punpcklbw m1, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_zero_y_other_loop -%undef filter_y_a -%undef filter_y_b -%undef filter_rnd - STORE_AND_RET %1 - -.x_nonzero: - cmp x_offsetd, 4 - jne .x_nonhalf - ; x_offset == 0.5 - test y_offsetd, y_offsetd - jnz .x_half_y_nonzero - - ; x_offset == 0.5 && y_offset == 0 -.x_half_y_zero_loop: -%if %1 == 16 - movu m0, [srcq] - movu m4, [srcq+1] - mova m1, [dstq] - pavgb m0, m4 - punpckhbw m3, m1, m5 -%if %2 == 1 ; avg - pavgb m0, [secq] -%endif - punpcklbw m1, m5 - punpckhbw m2, m0, m5 - punpcklbw m0, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - - add srcq, src_strideq - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] - movx m4, [srcq+1] -%if %2 == 1 ; avg -%if %1 > 4 - movhps m0, [srcq+src_strideq] - movhps m4, [srcq+src_strideq+1] -%else ; 4xh - movx m1, [srcq+src_strideq] - punpckldq m0, m1 - movx m2, [srcq+src_strideq+1] - punpckldq m4, m2 -%endif - movx m1, [dstq] - movx m3, [dstq+dst_strideq] - pavgb m0, m4 - punpcklbw m3, m5 -%if %1 > 4 - pavgb m0, [secq] - punpcklbw m1, m5 - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else ; 4xh - movh m2, [secq] - pavgb m0, m2 - punpcklbw m1, m5 - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%else ; !avg - movx m2, [srcq+src_strideq] - movx m1, [dstq] - pavgb m0, m4 - movx m4, [srcq+src_strideq+1] - movx m3, [dstq+dst_strideq] - pavgb m2, m4 - punpcklbw m0, m5 - punpcklbw m2, m5 - punpcklbw m3, m5 - punpcklbw m1, m5 -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_half_y_zero_loop - STORE_AND_RET %1 - -.x_half_y_nonzero: - cmp y_offsetd, 4 - jne .x_half_y_nonhalf - - ; x_offset == 0.5 && y_offset == 0.5 -%if %1 == 16 - movu m0, [srcq] - movu m3, [srcq+1] - add srcq, src_strideq - pavgb m0, m3 -.x_half_y_half_loop: - movu m4, [srcq] - movu m3, [srcq+1] - mova m1, [dstq] - pavgb m4, m3 - punpckhbw m3, m1, m5 - pavgb m0, m4 -%if %2 == 1 ; avg - punpcklbw m1, m5 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else - punpckhbw m2, m0, m5 - punpcklbw m0, m5 - punpcklbw m1, m5 -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - mova m0, m4 - - add srcq, src_strideq - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] - movx m3, [srcq+1] - add srcq, src_strideq - pavgb m0, m3 -.x_half_y_half_loop: - movx m2, [srcq] - movx m3, [srcq+1] -%if %2 == 1 ; avg -%if %1 > 4 - movhps m2, [srcq+src_strideq] - movhps m3, [srcq+src_strideq+1] -%else - movx m1, [srcq+src_strideq] - punpckldq m2, m1 - movx m1, [srcq+src_strideq+1] - punpckldq m3, m1 -%endif - pavgb m2, m3 -%if %1 > 4 - movlhps m0, m2 - movhlps m4, m2 -%else ; 4xh - punpckldq m0, m2 - pshuflw m4, m2, 0xe -%endif - movx m1, [dstq] - pavgb m0, m2 - movx m3, [dstq+dst_strideq] -%if %1 > 4 - pavgb m0, [secq] -%else - movh m2, [secq] - pavgb m0, m2 -%endif - punpcklbw m3, m5 - punpcklbw m1, m5 -%if %1 > 4 - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%else ; !avg - movx m4, [srcq+src_strideq] - movx m1, [srcq+src_strideq+1] - pavgb m2, m3 - pavgb m4, m1 - pavgb m0, m2 - pavgb m2, m4 - movx m1, [dstq] - movx m3, [dstq+dst_strideq] - punpcklbw m0, m5 - punpcklbw m2, m5 - punpcklbw m3, m5 - punpcklbw m1, m5 -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - mova m0, m4 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_half_y_half_loop - STORE_AND_RET %1 - -.x_half_y_nonhalf: - ; x_offset == 0.5 && y_offset == bilin interpolation -%if AOM_ARCH_X86_64 - lea bilin_filter, [GLOBAL(bilin_filter_m)] -%endif - shl y_offsetd, filter_idx_shift -%if AOM_ARCH_X86_64 && %1 > 4 - mova m8, [bilin_filter+y_offsetq] -%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 - mova m9, [bilin_filter+y_offsetq+16] -%endif - mova m10, [GLOBAL(pw_8)] -%define filter_y_a m8 -%define filter_y_b m9 -%define filter_rnd m10 -%else ;x86_32 -%if AOM_ARCH_X86=1 && CONFIG_PIC=1 -; x_offset == 0.5. We can reuse x_offset reg -%define tempq x_offsetq - add y_offsetq, g_bilin_filterm -%define filter_y_a [y_offsetq] -%define filter_y_b [y_offsetq+16] - mov tempq, g_pw_8m -%define filter_rnd [tempq] -%else - add y_offsetq, bilin_filter -%define filter_y_a [y_offsetq] -%define filter_y_b [y_offsetq+16] -%define filter_rnd [GLOBAL(pw_8)] -%endif -%endif - -%if %1 == 16 - movu m0, [srcq] - movu m3, [srcq+1] - add srcq, src_strideq - pavgb m0, m3 -.x_half_y_other_loop: - movu m4, [srcq] - movu m2, [srcq+1] - mova m1, [dstq] - pavgb m4, m2 -%if cpuflag(ssse3) - punpckhbw m2, m0, m4 - punpcklbw m0, m4 - pmaddubsw m2, filter_y_a - pmaddubsw m0, filter_y_a - paddw m2, filter_rnd - paddw m0, filter_rnd - psraw m2, 4 -%else - punpckhbw m2, m0, m5 - punpckhbw m3, m4, m5 - pmullw m2, filter_y_a - pmullw m3, filter_y_b - paddw m2, filter_rnd - punpcklbw m0, m5 - paddw m2, m3 - punpcklbw m3, m4, m5 - pmullw m0, filter_y_a - pmullw m3, filter_y_b - paddw m0, filter_rnd - psraw m2, 4 - paddw m0, m3 -%endif - punpckhbw m3, m1, m5 - psraw m0, 4 -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline - packuswb m0, m2 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%endif - punpcklbw m1, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - mova m0, m4 - - add srcq, src_strideq - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] - movx m3, [srcq+1] - add srcq, src_strideq - pavgb m0, m3 -%if notcpuflag(ssse3) - punpcklbw m0, m5 -%endif -.x_half_y_other_loop: - movx m2, [srcq] - movx m1, [srcq+1] - movx m4, [srcq+src_strideq] - movx m3, [srcq+src_strideq+1] - pavgb m2, m1 - pavgb m4, m3 - movx m3, [dstq+dst_strideq] -%if cpuflag(ssse3) - movx m1, [dstq] - punpcklbw m0, m2 - punpcklbw m2, m4 - pmaddubsw m0, filter_y_a - pmaddubsw m2, filter_y_a - punpcklbw m3, m5 - paddw m0, filter_rnd - paddw m2, filter_rnd -%else - punpcklbw m2, m5 - punpcklbw m4, m5 - pmullw m0, filter_y_a - pmullw m1, m2, filter_y_b - punpcklbw m3, m5 - paddw m0, filter_rnd - pmullw m2, filter_y_a - paddw m0, m1 - pmullw m1, m4, filter_y_b - paddw m2, filter_rnd - paddw m2, m1 - movx m1, [dstq] -%endif - psraw m0, 4 - psraw m2, 4 -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline -%if %1 == 4 - movlhps m0, m2 -%endif - packuswb m0, m2 -%if %1 > 4 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else - movh m2, [secq] - pavgb m0, m2 - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%endif - punpcklbw m1, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - mova m0, m4 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_half_y_other_loop -%undef filter_y_a -%undef filter_y_b -%undef filter_rnd - STORE_AND_RET %1 - -.x_nonhalf: - test y_offsetd, y_offsetd - jnz .x_nonhalf_y_nonzero - - ; x_offset == bilin interpolation && y_offset == 0 -%if AOM_ARCH_X86_64 - lea bilin_filter, [GLOBAL(bilin_filter_m)] -%endif - shl x_offsetd, filter_idx_shift -%if AOM_ARCH_X86_64 && %1 > 4 - mova m8, [bilin_filter+x_offsetq] -%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 - mova m9, [bilin_filter+x_offsetq+16] -%endif - mova m10, [GLOBAL(pw_8)] -%define filter_x_a m8 -%define filter_x_b m9 -%define filter_rnd m10 -%else ; x86-32 -%if AOM_ARCH_X86=1 && CONFIG_PIC=1 -;y_offset == 0. We can reuse y_offset reg. -%define tempq y_offsetq - add x_offsetq, g_bilin_filterm -%define filter_x_a [x_offsetq] -%define filter_x_b [x_offsetq+16] - mov tempq, g_pw_8m -%define filter_rnd [tempq] -%else - add x_offsetq, bilin_filter -%define filter_x_a [x_offsetq] -%define filter_x_b [x_offsetq+16] -%define filter_rnd [GLOBAL(pw_8)] -%endif -%endif - -.x_other_y_zero_loop: -%if %1 == 16 - movu m0, [srcq] - movu m4, [srcq+1] - mova m1, [dstq] -%if cpuflag(ssse3) - punpckhbw m2, m0, m4 - punpcklbw m0, m4 - pmaddubsw m2, filter_x_a - pmaddubsw m0, filter_x_a - paddw m2, filter_rnd - paddw m0, filter_rnd -%else - punpckhbw m2, m0, m5 - punpckhbw m3, m4, m5 - punpcklbw m0, m5 - punpcklbw m4, m5 - pmullw m2, filter_x_a - pmullw m3, filter_x_b - paddw m2, filter_rnd - pmullw m0, filter_x_a - pmullw m4, filter_x_b - paddw m0, filter_rnd - paddw m2, m3 - paddw m0, m4 -%endif - psraw m2, 4 - psraw m0, 4 -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline - packuswb m0, m2 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%endif - punpckhbw m3, m1, m5 - punpcklbw m1, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - - add srcq, src_strideq - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] - movx m1, [srcq+1] - movx m2, [srcq+src_strideq] - movx m4, [srcq+src_strideq+1] - movx m3, [dstq+dst_strideq] -%if cpuflag(ssse3) - punpcklbw m0, m1 - movx m1, [dstq] - punpcklbw m2, m4 - pmaddubsw m0, filter_x_a - pmaddubsw m2, filter_x_a - punpcklbw m3, m5 - paddw m0, filter_rnd - paddw m2, filter_rnd -%else - punpcklbw m0, m5 - punpcklbw m1, m5 - punpcklbw m2, m5 - punpcklbw m4, m5 - pmullw m0, filter_x_a - pmullw m1, filter_x_b - punpcklbw m3, m5 - paddw m0, filter_rnd - pmullw m2, filter_x_a - pmullw m4, filter_x_b - paddw m0, m1 - paddw m2, filter_rnd - movx m1, [dstq] - paddw m2, m4 -%endif - psraw m0, 4 - psraw m2, 4 -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline -%if %1 == 4 - movlhps m0, m2 -%endif - packuswb m0, m2 -%if %1 > 4 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else - movh m2, [secq] - pavgb m0, m2 - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%endif - punpcklbw m1, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_other_y_zero_loop -%undef filter_x_a -%undef filter_x_b -%undef filter_rnd - STORE_AND_RET %1 - -.x_nonhalf_y_nonzero: - cmp y_offsetd, 4 - jne .x_nonhalf_y_nonhalf - - ; x_offset == bilin interpolation && y_offset == 0.5 -%if AOM_ARCH_X86_64 - lea bilin_filter, [GLOBAL(bilin_filter_m)] -%endif - shl x_offsetd, filter_idx_shift -%if AOM_ARCH_X86_64 && %1 > 4 - mova m8, [bilin_filter+x_offsetq] -%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 - mova m9, [bilin_filter+x_offsetq+16] -%endif - mova m10, [GLOBAL(pw_8)] -%define filter_x_a m8 -%define filter_x_b m9 -%define filter_rnd m10 -%else ; x86-32 -%if AOM_ARCH_X86=1 && CONFIG_PIC=1 -; y_offset == 0.5. We can reuse y_offset reg. -%define tempq y_offsetq - add x_offsetq, g_bilin_filterm -%define filter_x_a [x_offsetq] -%define filter_x_b [x_offsetq+16] - mov tempq, g_pw_8m -%define filter_rnd [tempq] -%else - add x_offsetq, bilin_filter -%define filter_x_a [x_offsetq] -%define filter_x_b [x_offsetq+16] -%define filter_rnd [GLOBAL(pw_8)] -%endif -%endif - -%if %1 == 16 - movu m0, [srcq] - movu m1, [srcq+1] -%if cpuflag(ssse3) - punpckhbw m2, m0, m1 - punpcklbw m0, m1 - pmaddubsw m2, filter_x_a - pmaddubsw m0, filter_x_a - paddw m2, filter_rnd - paddw m0, filter_rnd -%else - punpckhbw m2, m0, m5 - punpckhbw m3, m1, m5 - punpcklbw m0, m5 - punpcklbw m1, m5 - pmullw m0, filter_x_a - pmullw m1, filter_x_b - paddw m0, filter_rnd - pmullw m2, filter_x_a - pmullw m3, filter_x_b - paddw m2, filter_rnd - paddw m0, m1 - paddw m2, m3 -%endif - psraw m0, 4 - psraw m2, 4 - add srcq, src_strideq - packuswb m0, m2 -.x_other_y_half_loop: - movu m4, [srcq] - movu m3, [srcq+1] -%if cpuflag(ssse3) - mova m1, [dstq] - punpckhbw m2, m4, m3 - punpcklbw m4, m3 - pmaddubsw m2, filter_x_a - pmaddubsw m4, filter_x_a - paddw m2, filter_rnd - paddw m4, filter_rnd - psraw m2, 4 - psraw m4, 4 - packuswb m4, m2 - pavgb m0, m4 - punpckhbw m3, m1, m5 - punpcklbw m1, m5 -%else - punpckhbw m2, m4, m5 - punpckhbw m1, m3, m5 - punpcklbw m4, m5 - punpcklbw m3, m5 - pmullw m4, filter_x_a - pmullw m3, filter_x_b - paddw m4, filter_rnd - pmullw m2, filter_x_a - pmullw m1, filter_x_b - paddw m2, filter_rnd - paddw m4, m3 - paddw m2, m1 - mova m1, [dstq] - psraw m4, 4 - psraw m2, 4 - punpckhbw m3, m1, m5 - ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we - ; have a 1-register shortage to be able to store the backup of the bilin - ; filtered second line as words as cache for the next line. Packing into - ; a byte costs 1 pack and 2 unpacks, but saves a register. - packuswb m4, m2 - punpcklbw m1, m5 - pavgb m0, m4 -%endif -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline - pavgb m0, [secq] -%endif - punpckhbw m2, m0, m5 - punpcklbw m0, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - mova m0, m4 - - add srcq, src_strideq - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] - movx m1, [srcq+1] -%if cpuflag(ssse3) - punpcklbw m0, m1 - pmaddubsw m0, filter_x_a - paddw m0, filter_rnd -%else - punpcklbw m0, m5 - punpcklbw m1, m5 - pmullw m0, filter_x_a - pmullw m1, filter_x_b - paddw m0, filter_rnd - paddw m0, m1 -%endif - add srcq, src_strideq - psraw m0, 4 -.x_other_y_half_loop: - movx m2, [srcq] - movx m1, [srcq+1] - movx m4, [srcq+src_strideq] - movx m3, [srcq+src_strideq+1] -%if cpuflag(ssse3) - punpcklbw m2, m1 - punpcklbw m4, m3 - pmaddubsw m2, filter_x_a - pmaddubsw m4, filter_x_a - movx m1, [dstq] - movx m3, [dstq+dst_strideq] - paddw m2, filter_rnd - paddw m4, filter_rnd -%else - punpcklbw m2, m5 - punpcklbw m1, m5 - punpcklbw m4, m5 - punpcklbw m3, m5 - pmullw m2, filter_x_a - pmullw m1, filter_x_b - paddw m2, filter_rnd - pmullw m4, filter_x_a - pmullw m3, filter_x_b - paddw m4, filter_rnd - paddw m2, m1 - movx m1, [dstq] - paddw m4, m3 - movx m3, [dstq+dst_strideq] -%endif - psraw m2, 4 - psraw m4, 4 - pavgw m0, m2 - pavgw m2, m4 -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline - also consider going to bytes here -%if %1 == 4 - movlhps m0, m2 -%endif - packuswb m0, m2 -%if %1 > 4 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else - movh m2, [secq] - pavgb m0, m2 - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%endif - punpcklbw m3, m5 - punpcklbw m1, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - mova m0, m4 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_other_y_half_loop -%undef filter_x_a -%undef filter_x_b -%undef filter_rnd - STORE_AND_RET %1 - -.x_nonhalf_y_nonhalf: -%if AOM_ARCH_X86_64 - lea bilin_filter, [GLOBAL(bilin_filter_m)] -%endif - shl x_offsetd, filter_idx_shift - shl y_offsetd, filter_idx_shift -%if AOM_ARCH_X86_64 && %1 > 4 - mova m8, [bilin_filter+x_offsetq] -%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 - mova m9, [bilin_filter+x_offsetq+16] -%endif - mova m10, [bilin_filter+y_offsetq] -%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 - mova m11, [bilin_filter+y_offsetq+16] -%endif - mova m12, [GLOBAL(pw_8)] -%define filter_x_a m8 -%define filter_x_b m9 -%define filter_y_a m10 -%define filter_y_b m11 -%define filter_rnd m12 -%else ; x86-32 -%if AOM_ARCH_X86=1 && CONFIG_PIC=1 -; In this case, there is NO unused register. Used src_stride register. Later, -; src_stride has to be loaded from stack when it is needed. -%define tempq src_strideq - mov tempq, g_bilin_filterm - add x_offsetq, tempq - add y_offsetq, tempq -%define filter_x_a [x_offsetq] -%define filter_x_b [x_offsetq+16] -%define filter_y_a [y_offsetq] -%define filter_y_b [y_offsetq+16] - - mov tempq, g_pw_8m -%define filter_rnd [tempq] -%else - add x_offsetq, bilin_filter - add y_offsetq, bilin_filter -%define filter_x_a [x_offsetq] -%define filter_x_b [x_offsetq+16] -%define filter_y_a [y_offsetq] -%define filter_y_b [y_offsetq+16] -%define filter_rnd [GLOBAL(pw_8)] -%endif -%endif - - ; x_offset == bilin interpolation && y_offset == bilin interpolation -%if %1 == 16 - movu m0, [srcq] - movu m1, [srcq+1] -%if cpuflag(ssse3) - punpckhbw m2, m0, m1 - punpcklbw m0, m1 - pmaddubsw m2, filter_x_a - pmaddubsw m0, filter_x_a - paddw m2, filter_rnd - paddw m0, filter_rnd -%else - punpckhbw m2, m0, m5 - punpckhbw m3, m1, m5 - punpcklbw m0, m5 - punpcklbw m1, m5 - pmullw m0, filter_x_a - pmullw m1, filter_x_b - paddw m0, filter_rnd - pmullw m2, filter_x_a - pmullw m3, filter_x_b - paddw m2, filter_rnd - paddw m0, m1 - paddw m2, m3 -%endif - psraw m0, 4 - psraw m2, 4 - - INC_SRC_BY_SRC_STRIDE - - packuswb m0, m2 -.x_other_y_other_loop: -%if cpuflag(ssse3) - movu m4, [srcq] - movu m3, [srcq+1] - mova m1, [dstq] - punpckhbw m2, m4, m3 - punpcklbw m4, m3 - pmaddubsw m2, filter_x_a - pmaddubsw m4, filter_x_a - punpckhbw m3, m1, m5 - paddw m2, filter_rnd - paddw m4, filter_rnd - psraw m2, 4 - psraw m4, 4 - packuswb m4, m2 - punpckhbw m2, m0, m4 - punpcklbw m0, m4 - pmaddubsw m2, filter_y_a - pmaddubsw m0, filter_y_a - punpcklbw m1, m5 - paddw m2, filter_rnd - paddw m0, filter_rnd - psraw m2, 4 - psraw m0, 4 -%else - movu m3, [srcq] - movu m4, [srcq+1] - punpckhbw m1, m3, m5 - punpckhbw m2, m4, m5 - punpcklbw m3, m5 - punpcklbw m4, m5 - pmullw m3, filter_x_a - pmullw m4, filter_x_b - paddw m3, filter_rnd - pmullw m1, filter_x_a - pmullw m2, filter_x_b - paddw m1, filter_rnd - paddw m3, m4 - paddw m1, m2 - psraw m3, 4 - psraw m1, 4 - packuswb m4, m3, m1 - punpckhbw m2, m0, m5 - punpcklbw m0, m5 - pmullw m2, filter_y_a - pmullw m1, filter_y_b - paddw m2, filter_rnd - pmullw m0, filter_y_a - pmullw m3, filter_y_b - paddw m2, m1 - mova m1, [dstq] - paddw m0, filter_rnd - psraw m2, 4 - paddw m0, m3 - punpckhbw m3, m1, m5 - psraw m0, 4 - punpcklbw m1, m5 -%endif -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline - packuswb m0, m2 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - mova m0, m4 - - INC_SRC_BY_SRC_STRIDE - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] - movx m1, [srcq+1] -%if cpuflag(ssse3) - punpcklbw m0, m1 - pmaddubsw m0, filter_x_a - paddw m0, filter_rnd -%else - punpcklbw m0, m5 - punpcklbw m1, m5 - pmullw m0, filter_x_a - pmullw m1, filter_x_b - paddw m0, filter_rnd - paddw m0, m1 -%endif - psraw m0, 4 -%if cpuflag(ssse3) - packuswb m0, m0 -%endif - - INC_SRC_BY_SRC_STRIDE - -.x_other_y_other_loop: - movx m2, [srcq] - movx m1, [srcq+1] - - INC_SRC_BY_SRC_STRIDE - movx m4, [srcq] - movx m3, [srcq+1] - -%if cpuflag(ssse3) - punpcklbw m2, m1 - punpcklbw m4, m3 - pmaddubsw m2, filter_x_a - pmaddubsw m4, filter_x_a - movx m3, [dstq+dst_strideq] - movx m1, [dstq] - paddw m2, filter_rnd - paddw m4, filter_rnd - psraw m2, 4 - psraw m4, 4 - packuswb m2, m2 - packuswb m4, m4 - punpcklbw m0, m2 - punpcklbw m2, m4 - pmaddubsw m0, filter_y_a - pmaddubsw m2, filter_y_a - punpcklbw m3, m5 - paddw m0, filter_rnd - paddw m2, filter_rnd - psraw m0, 4 - psraw m2, 4 - punpcklbw m1, m5 -%else - punpcklbw m2, m5 - punpcklbw m1, m5 - punpcklbw m4, m5 - punpcklbw m3, m5 - pmullw m2, filter_x_a - pmullw m1, filter_x_b - paddw m2, filter_rnd - pmullw m4, filter_x_a - pmullw m3, filter_x_b - paddw m4, filter_rnd - paddw m2, m1 - paddw m4, m3 - psraw m2, 4 - psraw m4, 4 - pmullw m0, filter_y_a - pmullw m3, m2, filter_y_b - paddw m0, filter_rnd - pmullw m2, filter_y_a - pmullw m1, m4, filter_y_b - paddw m2, filter_rnd - paddw m0, m3 - movx m3, [dstq+dst_strideq] - paddw m2, m1 - movx m1, [dstq] - psraw m0, 4 - psraw m2, 4 - punpcklbw m3, m5 - punpcklbw m1, m5 -%endif -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline -%if %1 == 4 - movlhps m0, m2 -%endif - packuswb m0, m2 -%if %1 > 4 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else - movh m2, [secq] - pavgb m0, m2 - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - mova m0, m4 - - INC_SRC_BY_SRC_STRIDE - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_other_y_other_loop -%undef filter_x_a -%undef filter_x_b -%undef filter_y_a -%undef filter_y_b -%undef filter_rnd -%undef movx - STORE_AND_RET %1 -%endmacro - -; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical -; between the ssse3 and non-ssse3 version. It may make sense to merge their -; code in the sense that the ssse3 version would jump to the appropriate -; location in the sse/2 version, rather than duplicating that code in the -; binary. - -INIT_XMM sse2 -SUBPEL_VARIANCE 4 -SUBPEL_VARIANCE 8 -SUBPEL_VARIANCE 16 - -INIT_XMM ssse3 -SUBPEL_VARIANCE 4 -SUBPEL_VARIANCE 8 -SUBPEL_VARIANCE 16 - -INIT_XMM sse2 -SUBPEL_VARIANCE 4, 1 -SUBPEL_VARIANCE 8, 1 -SUBPEL_VARIANCE 16, 1 - -INIT_XMM ssse3 -SUBPEL_VARIANCE 4, 1 -SUBPEL_VARIANCE 8, 1 -SUBPEL_VARIANCE 16, 1 diff --git a/third_party/aom/aom_dsp/x86/subpel_variance_ssse3.asm b/third_party/aom/aom_dsp/x86/subpel_variance_ssse3.asm new file mode 100644 index 0000000000..f424ce01dd --- /dev/null +++ b/third_party/aom/aom_dsp/x86/subpel_variance_ssse3.asm @@ -0,0 +1,1442 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_8: times 8 dw 8 + +bilin_filter_m_ssse3: times 8 db 16, 0 + times 8 db 14, 2 + times 8 db 12, 4 + times 8 db 10, 6 + times 16 db 8 + times 8 db 6, 10 + times 8 db 4, 12 + times 8 db 2, 14 + +SECTION .text + +; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, +; int x_offset, int y_offset, +; const uint8_t *dst, ptrdiff_t dst_stride, +; int height, unsigned int *sse); +; +; This function returns the SE and stores SSE in the given pointer. + +%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse + psubw %3, %4 + psubw %1, %2 + paddw %5, %3 + pmaddwd %3, %3 + paddw %5, %1 + pmaddwd %1, %1 + paddd %6, %3 + paddd %6, %1 +%endmacro + +%macro STORE_AND_RET 1 +%if %1 > 4 + ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit + ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg. + ; We have to sign-extend it before adding the words within the register + ; and outputing to a dword. + pcmpgtw m5, m6 ; mask for 0 > x + movhlps m3, m7 + punpcklwd m4, m6, m5 + punpckhwd m6, m5 ; sign-extend m6 word->dword + paddd m7, m3 + paddd m6, m4 + pshufd m3, m7, 0x1 + movhlps m4, m6 + paddd m7, m3 + paddd m6, m4 + mov r1, ssem ; r1 = unsigned int *sse + pshufd m4, m6, 0x1 + movd [r1], m7 ; store sse + paddd m6, m4 + movd raxd, m6 ; store sum as return value +%else ; 4xh + pshuflw m4, m6, 0xe + pshuflw m3, m7, 0xe + paddw m6, m4 + paddd m7, m3 + pcmpgtw m5, m6 ; mask for 0 > x + mov r1, ssem ; r1 = unsigned int *sse + punpcklwd m6, m5 ; sign-extend m6 word->dword + movd [r1], m7 ; store sse + pshuflw m4, m6, 0xe + paddd m6, m4 + movd raxd, m6 ; store sum as return value +%endif + RET +%endmacro + +%macro INC_SRC_BY_SRC_STRIDE 0 +%if AOM_ARCH_X86=1 && CONFIG_PIC=1 + add srcq, src_stridemp +%else + add srcq, src_strideq +%endif +%endmacro + +%macro SUBPEL_VARIANCE 1-2 0 ; W +%if cpuflag(ssse3) +%define bilin_filter_m bilin_filter_m_ssse3 +%define filter_idx_shift 4 +%endif +; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses +; 11, not 13, if the registers are ordered correctly. May make a minor speed +; difference on Win64 + +%if AOM_ARCH_X86_64 + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + sec, sec_stride, height, sse + %define sec_str sec_strideq + %else + cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + height, sse + %endif + %define block_height heightd + %define bilin_filter sseq +%else + %if CONFIG_PIC=1 + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + sec, sec_stride, height, sse + %define block_height dword heightm + %define sec_str sec_stridemp + %else + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + height, sse + %define block_height heightd + %endif + + ; reuse argument stack space + %define g_bilin_filterm x_offsetm + %define g_pw_8m y_offsetm + + ;Store bilin_filter and pw_8 location in stack + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif + + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx + + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back + %else + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, sec, sec_stride, \ + height, sse + %define block_height dword heightm + %define sec_str sec_stridemp + %else + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + height, sse + %define block_height heightd + %endif + %define bilin_filter bilin_filter_m + %endif +%endif + +%if %1 == 4 + %define movx movd +%else + %define movx movh +%endif + + ASSERT %1 <= 16 ; m6 overflows if w > 16 + pxor m6, m6 ; sum + pxor m7, m7 ; sse + ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we + ; could perhaps use it for something more productive then + pxor m5, m5 ; dedicated zero register +%if %1 < 16 + sar block_height, 1 +%if %2 == 1 ; avg + shl sec_str, 1 +%endif +%endif + + ; FIXME(rbultje) replace by jumptable? + test x_offsetd, x_offsetd + jnz .x_nonzero + ; x_offset == 0 + test y_offsetd, y_offsetd + jnz .x_zero_y_nonzero + + ; x_offset == 0 && y_offset == 0 +.x_zero_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + mova m1, [dstq] +%if %2 == 1 ; avg + pavgb m0, [secq] + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%endif + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + +%if %2 == 0 ; !avg + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m0, [srcq+src_strideq] +%else ; 4xh + movx m1, [srcq+src_strideq] + punpckldq m0, m1 +%endif +%else ; !avg + movx m2, [srcq+src_strideq] +%endif + + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + +%if %2 == 1 ; avg +%if %1 > 4 + pavgb m0, [secq] +%else + movh m2, [secq] + pavgb m0, m2 +%endif + punpcklbw m3, m5 + punpcklbw m1, m5 +%if %1 > 4 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_zero_y_zero_loop + STORE_AND_RET %1 + +.x_zero_y_nonzero: + cmp y_offsetd, 4 + jne .x_zero_y_nonhalf + + ; x_offset == 0 && y_offset == 0.5 +.x_zero_y_half_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+src_strideq] + mova m1, [dstq] + pavgb m0, m4 + punpckhbw m3, m1, m5 +%if %2 == 1 ; avg + pavgb m0, [secq] +%endif + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m2, [srcq+src_strideq] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m2, [srcq+src_strideq*2] +%else ; 4xh + movx m1, [srcq+src_strideq*2] + punpckldq m2, m1 +%endif + movx m1, [dstq] +%if %1 > 4 + movlhps m0, m2 +%else ; 4xh + punpckldq m0, m2 +%endif + movx m3, [dstq+dst_strideq] + pavgb m0, m2 + punpcklbw m1, m5 +%if %1 > 4 + pavgb m0, [secq] + punpcklbw m3, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + movh m4, [secq] + pavgb m0, m4 + punpcklbw m3, m5 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + movx m4, [srcq+src_strideq*2] + movx m1, [dstq] + pavgb m0, m2 + movx m3, [dstq+dst_strideq] + pavgb m2, m4 + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_zero_y_half_loop + STORE_AND_RET %1 + +.x_zero_y_nonhalf: + ; x_offset == 0 && y_offset == bilin interpolation +%if AOM_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl y_offsetd, filter_idx_shift +%if AOM_ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+y_offsetq+16] +%endif + mova m10, [GLOBAL(pw_8)] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ; x86-32 or mmx +%if AOM_ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0, reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +.x_zero_y_other_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+src_strideq] + mova m1, [dstq] +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + punpcklbw m0, m5 + punpcklbw m4, m5 + ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can + ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of + ; instructions is the same (5), but it is 1 mul instead of 2, so might be + ; slightly faster because of pmullw latency. It would also cut our rodata + ; tables in half for this function, and save 1-2 registers on x86-64. + pmullw m2, filter_y_a + pmullw m3, filter_y_b + paddw m2, filter_rnd + pmullw m0, filter_y_a + pmullw m4, filter_y_b + paddw m0, filter_rnd + paddw m2, m3 + paddw m0, m4 +%endif + psraw m2, 4 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m2, [srcq+src_strideq] + movx m4, [srcq+src_strideq*2] + movx m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + movx m1, [dstq] + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_y_a + pmullw m1, m2, filter_y_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_y_a + pmullw m4, filter_y_b + paddw m0, m1 + paddw m2, filter_rnd + movx m1, [dstq] + paddw m2, m4 +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_zero_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonzero: + cmp x_offsetd, 4 + jne .x_nonhalf + ; x_offset == 0.5 + test y_offsetd, y_offsetd + jnz .x_half_y_nonzero + + ; x_offset == 0.5 && y_offset == 0 +.x_half_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+1] + mova m1, [dstq] + pavgb m0, m4 + punpckhbw m3, m1, m5 +%if %2 == 1 ; avg + pavgb m0, [secq] +%endif + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m4, [srcq+1] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m0, [srcq+src_strideq] + movhps m4, [srcq+src_strideq+1] +%else ; 4xh + movx m1, [srcq+src_strideq] + punpckldq m0, m1 + movx m2, [srcq+src_strideq+1] + punpckldq m4, m2 +%endif + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + pavgb m0, m4 + punpcklbw m3, m5 +%if %1 > 4 + pavgb m0, [secq] + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + movh m2, [secq] + pavgb m0, m2 + punpcklbw m1, m5 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + movx m2, [srcq+src_strideq] + movx m1, [dstq] + pavgb m0, m4 + movx m4, [srcq+src_strideq+1] + movx m3, [dstq+dst_strideq] + pavgb m2, m4 + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_half_y_zero_loop + STORE_AND_RET %1 + +.x_half_y_nonzero: + cmp y_offsetd, 4 + jne .x_half_y_nonhalf + + ; x_offset == 0.5 && y_offset == 0.5 +%if %1 == 16 + movu m0, [srcq] + movu m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_half_loop: + movu m4, [srcq] + movu m3, [srcq+1] + mova m1, [dstq] + pavgb m4, m3 + punpckhbw m3, m1, m5 + pavgb m0, m4 +%if %2 == 1 ; avg + punpcklbw m1, m5 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_half_loop: + movx m2, [srcq] + movx m3, [srcq+1] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m2, [srcq+src_strideq] + movhps m3, [srcq+src_strideq+1] +%else + movx m1, [srcq+src_strideq] + punpckldq m2, m1 + movx m1, [srcq+src_strideq+1] + punpckldq m3, m1 +%endif + pavgb m2, m3 +%if %1 > 4 + movlhps m0, m2 + movhlps m4, m2 +%else ; 4xh + punpckldq m0, m2 + pshuflw m4, m2, 0xe +%endif + movx m1, [dstq] + pavgb m0, m2 + movx m3, [dstq+dst_strideq] +%if %1 > 4 + pavgb m0, [secq] +%else + movh m2, [secq] + pavgb m0, m2 +%endif + punpcklbw m3, m5 + punpcklbw m1, m5 +%if %1 > 4 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + movx m4, [srcq+src_strideq] + movx m1, [srcq+src_strideq+1] + pavgb m2, m3 + pavgb m4, m1 + pavgb m0, m2 + pavgb m2, m4 + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_half_y_half_loop + STORE_AND_RET %1 + +.x_half_y_nonhalf: + ; x_offset == 0.5 && y_offset == bilin interpolation +%if AOM_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl y_offsetd, filter_idx_shift +%if AOM_ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+y_offsetq+16] +%endif + mova m10, [GLOBAL(pw_8)] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ;x86_32 +%if AOM_ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0.5. We can reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_other_loop: + movu m4, [srcq] + movu m2, [srcq+1] + mova m1, [dstq] + pavgb m4, m2 +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + paddw m2, filter_rnd + paddw m0, filter_rnd + psraw m2, 4 +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + pmullw m2, filter_y_a + pmullw m3, filter_y_b + paddw m2, filter_rnd + punpcklbw m0, m5 + paddw m2, m3 + punpcklbw m3, m4, m5 + pmullw m0, filter_y_a + pmullw m3, filter_y_b + paddw m0, filter_rnd + psraw m2, 4 + paddw m0, m3 +%endif + punpckhbw m3, m1, m5 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +%if notcpuflag(ssse3) + punpcklbw m0, m5 +%endif +.x_half_y_other_loop: + movx m2, [srcq] + movx m1, [srcq+1] + movx m4, [srcq+src_strideq] + movx m3, [srcq+src_strideq+1] + pavgb m2, m1 + pavgb m4, m3 + movx m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + movx m1, [dstq] + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd +%else + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_y_a + pmullw m1, m2, filter_y_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_y_a + paddw m0, m1 + pmullw m1, m4, filter_y_b + paddw m2, filter_rnd + paddw m2, m1 + movx m1, [dstq] +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_half_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonhalf: + test y_offsetd, y_offsetd + jnz .x_nonhalf_y_nonzero + + ; x_offset == bilin interpolation && y_offset == 0 +%if AOM_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift +%if AOM_ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if AOM_ARCH_X86=1 && CONFIG_PIC=1 +;y_offset == 0. We can reuse y_offset reg. +%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +.x_other_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+1] + mova m1, [dstq] +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + punpcklbw m0, m5 + punpcklbw m4, m5 + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + pmullw m0, filter_x_a + pmullw m4, filter_x_b + paddw m0, filter_rnd + paddw m2, m3 + paddw m0, m4 +%endif + psraw m2, 4 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m1, [srcq+1] + movx m2, [srcq+src_strideq] + movx m4, [srcq+src_strideq+1] + movx m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + punpcklbw m0, m1 + movx m1, [dstq] + punpcklbw m2, m4 + pmaddubsw m0, filter_x_a + pmaddubsw m2, filter_x_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m0, m1 + paddw m2, filter_rnd + movx m1, [dstq] + paddw m2, m4 +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_other_y_zero_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonhalf_y_nonzero: + cmp y_offsetd, 4 + jne .x_nonhalf_y_nonhalf + + ; x_offset == bilin interpolation && y_offset == 0.5 +%if AOM_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift +%if AOM_ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if AOM_ARCH_X86=1 && CONFIG_PIC=1 +; y_offset == 0.5. We can reuse y_offset reg. +%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+1] +%if cpuflag(ssse3) + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m1, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + paddw m0, m1 + paddw m2, m3 +%endif + psraw m0, 4 + psraw m2, 4 + add srcq, src_strideq + packuswb m0, m2 +.x_other_y_half_loop: + movu m4, [srcq] + movu m3, [srcq+1] +%if cpuflag(ssse3) + mova m1, [dstq] + punpckhbw m2, m4, m3 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m4, m2 + pavgb m0, m4 + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%else + punpckhbw m2, m4, m5 + punpckhbw m1, m3, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + paddw m4, m3 + paddw m2, m1 + mova m1, [dstq] + psraw m4, 4 + psraw m2, 4 + punpckhbw m3, m1, m5 + ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we + ; have a 1-register shortage to be able to store the backup of the bilin + ; filtered second line as words as cache for the next line. Packing into + ; a byte costs 1 pack and 2 unpacks, but saves a register. + packuswb m4, m2 + punpcklbw m1, m5 + pavgb m0, m4 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + pavgb m0, [secq] +%endif + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m1, [srcq+1] +%if cpuflag(ssse3) + punpcklbw m0, m1 + pmaddubsw m0, filter_x_a + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + paddw m0, m1 +%endif + add srcq, src_strideq + psraw m0, 4 +.x_other_y_half_loop: + movx m2, [srcq] + movx m1, [srcq+1] + movx m4, [srcq+src_strideq] + movx m3, [srcq+src_strideq+1] +%if cpuflag(ssse3) + punpcklbw m2, m1 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + paddw m2, filter_rnd + paddw m4, filter_rnd +%else + punpcklbw m2, m5 + punpcklbw m1, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + paddw m2, m1 + movx m1, [dstq] + paddw m4, m3 + movx m3, [dstq+dst_strideq] +%endif + psraw m2, 4 + psraw m4, 4 + pavgw m0, m2 + pavgw m2, m4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline - also consider going to bytes here +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m3, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_other_y_half_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonhalf_y_nonhalf: +%if AOM_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift + shl y_offsetd, filter_idx_shift +%if AOM_ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m11, [bilin_filter+y_offsetq+16] +%endif + mova m12, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_y_a m10 +%define filter_y_b m11 +%define filter_rnd m12 +%else ; x86-32 +%if AOM_ARCH_X86=1 && CONFIG_PIC=1 +; In this case, there is NO unused register. Used src_stride register. Later, +; src_stride has to be loaded from stack when it is needed. +%define tempq src_strideq + mov tempq, g_bilin_filterm + add x_offsetq, tempq + add y_offsetq, tempq +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter + add y_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + + ; x_offset == bilin interpolation && y_offset == bilin interpolation +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+1] +%if cpuflag(ssse3) + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m1, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + paddw m0, m1 + paddw m2, m3 +%endif + psraw m0, 4 + psraw m2, 4 + + INC_SRC_BY_SRC_STRIDE + + packuswb m0, m2 +.x_other_y_other_loop: +%if cpuflag(ssse3) + movu m4, [srcq] + movu m3, [srcq+1] + mova m1, [dstq] + punpckhbw m2, m4, m3 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + punpckhbw m3, m1, m5 + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m4, m2 + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + punpcklbw m1, m5 + paddw m2, filter_rnd + paddw m0, filter_rnd + psraw m2, 4 + psraw m0, 4 +%else + movu m3, [srcq] + movu m4, [srcq+1] + punpckhbw m1, m3, m5 + punpckhbw m2, m4, m5 + punpcklbw m3, m5 + punpcklbw m4, m5 + pmullw m3, filter_x_a + pmullw m4, filter_x_b + paddw m3, filter_rnd + pmullw m1, filter_x_a + pmullw m2, filter_x_b + paddw m1, filter_rnd + paddw m3, m4 + paddw m1, m2 + psraw m3, 4 + psraw m1, 4 + packuswb m4, m3, m1 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + pmullw m2, filter_y_a + pmullw m1, filter_y_b + paddw m2, filter_rnd + pmullw m0, filter_y_a + pmullw m3, filter_y_b + paddw m2, m1 + mova m1, [dstq] + paddw m0, filter_rnd + psraw m2, 4 + paddw m0, m3 + punpckhbw m3, m1, m5 + psraw m0, 4 + punpcklbw m1, m5 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + INC_SRC_BY_SRC_STRIDE + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m1, [srcq+1] +%if cpuflag(ssse3) + punpcklbw m0, m1 + pmaddubsw m0, filter_x_a + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + paddw m0, m1 +%endif + psraw m0, 4 +%if cpuflag(ssse3) + packuswb m0, m0 +%endif + + INC_SRC_BY_SRC_STRIDE + +.x_other_y_other_loop: + movx m2, [srcq] + movx m1, [srcq+1] + + INC_SRC_BY_SRC_STRIDE + movx m4, [srcq] + movx m3, [srcq+1] + +%if cpuflag(ssse3) + punpcklbw m2, m1 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + movx m3, [dstq+dst_strideq] + movx m1, [dstq] + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m2, m2 + packuswb m4, m4 + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd + psraw m0, 4 + psraw m2, 4 + punpcklbw m1, m5 +%else + punpcklbw m2, m5 + punpcklbw m1, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + paddw m2, m1 + paddw m4, m3 + psraw m2, 4 + psraw m4, 4 + pmullw m0, filter_y_a + pmullw m3, m2, filter_y_b + paddw m0, filter_rnd + pmullw m2, filter_y_a + pmullw m1, m4, filter_y_b + paddw m2, filter_rnd + paddw m0, m3 + movx m3, [dstq+dst_strideq] + paddw m2, m1 + movx m1, [dstq] + psraw m0, 4 + psraw m2, 4 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + INC_SRC_BY_SRC_STRIDE + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_other_y_other_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd +%undef movx + STORE_AND_RET %1 +%endmacro + +; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical +; between the ssse3 and non-ssse3 version. It may make sense to merge their +; code in the sense that the ssse3 version would jump to the appropriate +; location in the sse/2 version, rather than duplicating that code in the +; binary. + +INIT_XMM ssse3 +SUBPEL_VARIANCE 4 +SUBPEL_VARIANCE 8 +SUBPEL_VARIANCE 16 + +INIT_XMM ssse3 +SUBPEL_VARIANCE 4, 1 +SUBPEL_VARIANCE 8, 1 +SUBPEL_VARIANCE 16, 1 diff --git a/third_party/aom/aom_dsp/x86/synonyms.h b/third_party/aom/aom_dsp/x86/synonyms.h index 6744ec51d0..74318de2e5 100644 --- a/third_party/aom/aom_dsp/x86/synonyms.h +++ b/third_party/aom/aom_dsp/x86/synonyms.h @@ -46,6 +46,25 @@ static INLINE __m128i xx_loadu_128(const void *a) { return _mm_loadu_si128((const __m128i *)a); } + +// _mm_loadu_si64 has been introduced in GCC 9, reimplement the function +// manually on older compilers. +#if !defined(__clang__) && __GNUC_MAJOR__ < 9 +static INLINE __m128i xx_loadu_2x64(const void *hi, const void *lo) { + __m64 hi_, lo_; + memcpy(&hi_, hi, sizeof(hi_)); + memcpy(&lo_, lo, sizeof(lo_)); + return _mm_set_epi64(hi_, lo_); +} +#else +// Load 64 bits from each of hi and low, and pack into an SSE register +// Since directly loading as `int64_t`s and using _mm_set_epi64 may violate +// the strict aliasing rule, this takes a different approach +static INLINE __m128i xx_loadu_2x64(const void *hi, const void *lo) { + return _mm_unpacklo_epi64(_mm_loadu_si64(lo), _mm_loadu_si64(hi)); +} +#endif + static INLINE void xx_storel_32(void *const a, const __m128i v) { const int val = _mm_cvtsi128_si32(v); memcpy(a, &val, sizeof(val)); diff --git a/third_party/aom/aom_dsp/x86/synonyms_avx2.h b/third_party/aom/aom_dsp/x86/synonyms_avx2.h index b729e5f410..7548d4d4f4 100644 --- a/third_party/aom/aom_dsp/x86/synonyms_avx2.h +++ b/third_party/aom/aom_dsp/x86/synonyms_avx2.h @@ -43,6 +43,16 @@ static INLINE void yy_storeu_256(void *const a, const __m256i v) { _mm256_storeu_si256((__m256i *)a, v); } +// Fill an AVX register using an interleaved pair of values, ie. set the +// 16 channels to {a, b} repeated 8 times, using the same channel ordering +// as when a register is stored to / loaded from memory. +// +// This is useful for rearranging filter kernels for use with the _mm_madd_epi16 +// instruction +static INLINE __m256i yy_set2_epi16(int16_t a, int16_t b) { + return _mm256_setr_epi16(a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b); +} + // The _mm256_set1_epi64x() intrinsic is undefined for some Visual Studio // compilers. The following function is equivalent to _mm256_set1_epi64x() // acting on a 32-bit integer. @@ -61,11 +71,26 @@ static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) { return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); } +#define GCC_VERSION (__GNUC__ * 10000 \ + + __GNUC_MINOR__ * 100 \ + + __GNUC_PATCHLEVEL__) + +// _mm256_loadu2_m128i has been introduced in GCC 10.1 +#if !defined(__clang__) && GCC_VERSION < 101000 +static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) { + __m128i mhi = _mm_loadu_si128((const __m128i *)(hi)); + __m128i mlo = _mm_loadu_si128((const __m128i *)(lo)); + return _mm256_set_m128i(mhi, mlo); +} +#else static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) { __m128i mhi = _mm_loadu_si128((const __m128i *)(hi)); __m128i mlo = _mm_loadu_si128((const __m128i *)(lo)); return yy_set_m128i(mhi, mlo); } +#endif + +#undef GCC_VERSION static INLINE void yy_storeu2_128(void *hi, void *lo, const __m256i a) { _mm_storeu_si128((__m128i *)hi, _mm256_extracti128_si256(a, 1)); diff --git a/third_party/aom/aom_dsp/x86/variance_avx2.c b/third_party/aom/aom_dsp/x86/variance_avx2.c index 046d6f10f8..0f872fc392 100644 --- a/third_party/aom/aom_dsp/x86/variance_avx2.c +++ b/third_party/aom/aom_dsp/x86/variance_avx2.c @@ -518,8 +518,8 @@ void aom_highbd_comp_mask_pred_avx2(uint8_t *comp_pred8, const uint8_t *pred8, } } -uint64_t aom_mse_4xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, - int sstride, int h) { +static uint64_t mse_4xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, + int sstride, int h) { uint64_t sum = 0; __m128i dst0_4x8, dst1_4x8, dst2_4x8, dst3_4x8, dst_16x8; __m128i src0_4x16, src1_4x16, src2_4x16, src3_4x16; @@ -575,8 +575,9 @@ uint64_t aom_mse_4xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, // In src buffer, each 4x4 block in a 32x32 filter block is stored sequentially. // Hence src_blk_stride is same as block width. Whereas dst buffer is a frame // buffer, thus dstride is a frame level stride. -uint64_t aom_mse_4xh_quad_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, - int src_blk_stride, int h) { +static uint64_t mse_4xh_quad_16bit_avx2(uint8_t *dst, int dstride, + uint16_t *src, int src_blk_stride, + int h) { uint64_t sum = 0; __m128i dst0_16x8, dst1_16x8, dst2_16x8, dst3_16x8; __m256i dst0_16x16, dst1_16x16, dst2_16x16, dst3_16x16; @@ -665,8 +666,8 @@ uint64_t aom_mse_4xh_quad_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, return sum; } -uint64_t aom_mse_8xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, - int sstride, int h) { +static uint64_t mse_8xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, + int sstride, int h) { uint64_t sum = 0; __m128i dst0_8x8, dst1_8x8, dst3_16x8; __m256i src0_8x16, src1_8x16, src_16x16, dst_16x16; @@ -715,8 +716,9 @@ uint64_t aom_mse_8xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, // In src buffer, each 8x8 block in a 64x64 filter block is stored sequentially. // Hence src_blk_stride is same as block width. Whereas dst buffer is a frame // buffer, thus dstride is a frame level stride. -uint64_t aom_mse_8xh_dual_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, - int src_blk_stride, int h) { +static uint64_t mse_8xh_dual_16bit_avx2(uint8_t *dst, int dstride, + uint16_t *src, int src_blk_stride, + int h) { uint64_t sum = 0; __m128i dst0_16x8, dst1_16x8; __m256i dst0_16x16, dst1_16x16; @@ -780,8 +782,8 @@ uint64_t aom_mse_wxh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, assert((w == 8 || w == 4) && (h == 8 || h == 4) && "w=8/4 and h=8/4 must be satisfied"); switch (w) { - case 4: return aom_mse_4xh_16bit_avx2(dst, dstride, src, sstride, h); - case 8: return aom_mse_8xh_16bit_avx2(dst, dstride, src, sstride, h); + case 4: return mse_4xh_16bit_avx2(dst, dstride, src, sstride, h); + case 8: return mse_8xh_16bit_avx2(dst, dstride, src, sstride, h); default: assert(0 && "unsupported width"); return -1; } } @@ -795,8 +797,8 @@ uint64_t aom_mse_16xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, assert((w == 8 || w == 4) && (h == 8 || h == 4) && "w=8/4 and h=8/4 must be satisfied"); switch (w) { - case 4: return aom_mse_4xh_quad_16bit_avx2(dst, dstride, src, w * h, h); - case 8: return aom_mse_8xh_dual_16bit_avx2(dst, dstride, src, w * h, h); + case 4: return mse_4xh_quad_16bit_avx2(dst, dstride, src, w * h, h); + case 8: return mse_8xh_dual_16bit_avx2(dst, dstride, src, w * h, h); default: assert(0 && "unsupported width"); return -1; } } diff --git a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c index 9e9e70ea01..57a1cee781 100644 --- a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c +++ b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c @@ -648,7 +648,7 @@ MAKE_SUB_PIXEL_VAR_16XH(4, 2) #endif #define MAKE_SUB_PIXEL_AVG_VAR_32XH(height, log2height) \ - int aom_sub_pixel_avg_variance32x##height##_imp_avx2( \ + static int sub_pixel_avg_variance32x##height##_imp_avx2( \ const uint8_t *src, int src_stride, int x_offset, int y_offset, \ const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, \ unsigned int *sse) { \ @@ -876,7 +876,7 @@ MAKE_SUB_PIXEL_VAR_16XH(4, 2) const uint8_t *src, int src_stride, int x_offset, int y_offset, \ const uint8_t *dst, int dst_stride, unsigned int *sse, \ const uint8_t *sec_ptr) { \ - const int sum = aom_sub_pixel_avg_variance32x##height##_imp_avx2( \ + const int sum = sub_pixel_avg_variance32x##height##_imp_avx2( \ src, src_stride, x_offset, y_offset, dst, dst_stride, sec_ptr, 32, \ sse); \ return *sse - (unsigned int)(((int64_t)sum * sum) >> (5 + log2height)); \ @@ -899,7 +899,7 @@ MAKE_SUB_PIXEL_AVG_VAR_32XH(16, 4) const uint8_t *sec_ptr = sec; \ for (int j = 0; j < (h / hf); ++j) { \ unsigned int sse2; \ - const int se2 = aom_sub_pixel_avg_variance##wf##x##hf##_imp_avx2( \ + const int se2 = sub_pixel_avg_variance##wf##x##hf##_imp_avx2( \ src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \ sec_ptr, w, &sse2); \ dst_ptr += hf * dst_stride; \ diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c index faec9cf73d..81b30072a5 100644 --- a/third_party/aom/aom_dsp/x86/variance_sse2.c +++ b/third_party/aom/aom_dsp/x86/variance_sse2.c @@ -415,7 +415,6 @@ unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride, DECL(8, opt); \ DECL(16, opt) -DECLS(sse2); DECLS(ssse3); #undef DECLS #undef DECL @@ -492,7 +491,6 @@ DECLS(ssse3); FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)) #endif -FNS(sse2) FNS(ssse3) #undef FNS @@ -510,7 +508,6 @@ FNS(ssse3) DECL(8, opt); \ DECL(16, opt) -DECLS(sse2); DECLS(ssse3); #undef DECL #undef DECLS @@ -591,7 +588,6 @@ DECLS(ssse3); FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)) #endif -FNS(sse2) FNS(ssse3) #undef FNS @@ -710,8 +706,8 @@ void aom_highbd_comp_mask_pred_sse2(uint8_t *comp_pred8, const uint8_t *pred8, } } -uint64_t aom_mse_4xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src, - int sstride, int h) { +static uint64_t mse_4xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src, + int sstride, int h) { uint64_t sum = 0; __m128i dst0_8x8, dst1_8x8, dst_16x8; __m128i src0_16x4, src1_16x4, src_16x8; @@ -744,8 +740,8 @@ uint64_t aom_mse_4xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src, return sum; } -uint64_t aom_mse_8xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src, - int sstride, int h) { +static uint64_t mse_8xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src, + int sstride, int h) { uint64_t sum = 0; __m128i dst_8x8, dst_16x8; __m128i src_16x8; @@ -781,8 +777,8 @@ uint64_t aom_mse_wxh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src, assert((w == 8 || w == 4) && (h == 8 || h == 4) && "w=8/4 and h=8/4 must satisfy"); switch (w) { - case 4: return aom_mse_4xh_16bit_sse2(dst, dstride, src, sstride, h); - case 8: return aom_mse_8xh_16bit_sse2(dst, dstride, src, sstride, h); + case 4: return mse_4xh_16bit_sse2(dst, dstride, src, sstride, h); + case 8: return mse_8xh_16bit_sse2(dst, dstride, src, sstride, h); default: assert(0 && "unsupported width"); return -1; } } -- cgit v1.2.3