author    Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-19 00:47:55 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-19 00:47:55 +0000
commit    26a029d407be480d791972afb5975cf62c9360a6 (patch)
tree      f435a8308119effd964b339f76abb83a57c29483 /third_party/aom/aom_dsp/x86
parent    Initial commit. (diff)
Adding upstream version 124.0.1. (upstream/124.0.1)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/aom/aom_dsp/x86')
-rw-r--r-- third_party/aom/aom_dsp/x86/adaptive_quantize_avx2.c 244
-rw-r--r-- third_party/aom/aom_dsp/x86/adaptive_quantize_sse2.c 633
-rw-r--r-- third_party/aom/aom_dsp/x86/aom_asm_stubs.c 95
-rw-r--r-- third_party/aom/aom_dsp/x86/aom_convolve_copy_avx2.c 256
-rw-r--r-- third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.c 308
-rw-r--r-- third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm 613
-rw-r--r-- third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm 367
-rw-r--r-- third_party/aom/aom_dsp/x86/aom_quantize_avx.c 282
-rw-r--r-- third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c 1441
-rw-r--r-- third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c 569
-rw-r--r-- third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c 847
-rw-r--r-- third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm 615
-rw-r--r-- third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm 870
-rw-r--r-- third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm 295
-rw-r--r-- third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm 267
-rw-r--r-- third_party/aom/aom_dsp/x86/avg_intrin_avx2.c 897
-rw-r--r-- third_party/aom/aom_dsp/x86/avg_intrin_sse2.c 700
-rw-r--r-- third_party/aom/aom_dsp/x86/avg_intrin_sse4.c 59
-rw-r--r-- third_party/aom/aom_dsp/x86/bitdepth_conversion_avx2.h 32
-rw-r--r-- third_party/aom/aom_dsp/x86/bitdepth_conversion_sse2.h 49
-rw-r--r-- third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c 36
-rw-r--r-- third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c 1374
-rw-r--r-- third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c 1560
-rw-r--r-- third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c 285
-rw-r--r-- third_party/aom/aom_dsp/x86/blend_mask_sse4.h 237
-rw-r--r-- third_party/aom/aom_dsp/x86/blend_sse4.h 191
-rw-r--r-- third_party/aom/aom_dsp/x86/blk_sse_sum_avx2.c 185
-rw-r--r-- third_party/aom/aom_dsp/x86/blk_sse_sum_sse2.c 138
-rw-r--r-- third_party/aom/aom_dsp/x86/common_avx2.h 147
-rw-r--r-- third_party/aom/aom_dsp/x86/convolve.h 204
-rw-r--r-- third_party/aom/aom_dsp/x86/convolve_avx2.h 922
-rw-r--r-- third_party/aom/aom_dsp/x86/convolve_common_intrin.h 102
-rw-r--r-- third_party/aom/aom_dsp/x86/convolve_sse2.h 122
-rw-r--r-- third_party/aom/aom_dsp/x86/convolve_sse4_1.h 53
-rw-r--r-- third_party/aom/aom_dsp/x86/convolve_ssse3.h 50
-rw-r--r-- third_party/aom/aom_dsp/x86/fft_avx2.c 74
-rw-r--r-- third_party/aom/aom_dsp/x86/fft_sse2.c 173
-rw-r--r-- third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h 529
-rw-r--r-- third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c 39
-rw-r--r-- third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h 160
-rw-r--r-- third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm 379
-rw-r--r-- third_party/aom/aom_dsp/x86/highbd_adaptive_quantize_avx2.c 456
-rw-r--r-- third_party/aom/aom_dsp/x86/highbd_adaptive_quantize_sse2.c 732
-rw-r--r-- third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c 1248
-rw-r--r-- third_party/aom/aom_dsp/x86/highbd_convolve_sse2.c 351
-rw-r--r-- third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c 439
-rw-r--r-- third_party/aom/aom_dsp/x86/highbd_intrapred_asm_sse2.asm 259
-rw-r--r-- third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c 984
-rw-r--r-- third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c 66
-rw-r--r-- third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c 1698
-rw-r--r-- third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c 294
-rw-r--r-- third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c 208
-rw-r--r-- third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm 344
-rw-r--r-- third_party/aom/aom_dsp/x86/highbd_sad_avx2.c 720
-rw-r--r-- third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm 524
-rw-r--r-- third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm 1024
-rw-r--r-- third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c 266
-rw-r--r-- third_party/aom/aom_dsp/x86/highbd_variance_avx2.c 904
-rw-r--r-- third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm 318
-rw-r--r-- third_party/aom/aom_dsp/x86/highbd_variance_sse2.c 735
-rw-r--r-- third_party/aom/aom_dsp/x86/highbd_variance_sse4.c 216
-rw-r--r-- third_party/aom/aom_dsp/x86/intrapred_asm_sse2.asm 608
-rw-r--r-- third_party/aom/aom_dsp/x86/intrapred_avx2.c 4707
-rw-r--r-- third_party/aom/aom_dsp/x86/intrapred_sse2.c 1411
-rw-r--r-- third_party/aom/aom_dsp/x86/intrapred_sse4.c 1307
-rw-r--r-- third_party/aom/aom_dsp/x86/intrapred_ssse3.c 2997
-rw-r--r-- third_party/aom/aom_dsp/x86/intrapred_utils.h 205
-rw-r--r-- third_party/aom/aom_dsp/x86/intrapred_x86.h 38
-rw-r--r-- third_party/aom/aom_dsp/x86/inv_wht_sse2.asm 107
-rw-r--r-- third_party/aom/aom_dsp/x86/jnt_sad_sse2.c 238
-rw-r--r-- third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c 161
-rw-r--r-- third_party/aom/aom_dsp/x86/loopfilter_avx2.c 1016
-rw-r--r-- third_party/aom/aom_dsp/x86/loopfilter_sse2.c 2973
-rw-r--r-- third_party/aom/aom_dsp/x86/lpf_common_sse2.h 721
-rw-r--r-- third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c 266
-rw-r--r-- third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c 389
-rw-r--r-- third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c 400
-rw-r--r-- third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h 33
-rw-r--r-- third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c 1067
-rw-r--r-- third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h 92
-rw-r--r-- third_party/aom/aom_dsp/x86/mem_sse2.h 167
-rw-r--r-- third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h 58
-rw-r--r-- third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h 54
-rw-r--r-- third_party/aom/aom_dsp/x86/obmc_sad_avx2.c 271
-rw-r--r-- third_party/aom/aom_dsp/x86/obmc_sad_sse4.c 269
-rw-r--r-- third_party/aom/aom_dsp/x86/obmc_variance_avx2.c 191
-rw-r--r-- third_party/aom/aom_dsp/x86/obmc_variance_sse4.c 382
-rw-r--r-- third_party/aom/aom_dsp/x86/quantize_avx2.c 274
-rw-r--r-- third_party/aom/aom_dsp/x86/quantize_sse2.c 125
-rw-r--r-- third_party/aom/aom_dsp/x86/quantize_ssse3.c 192
-rw-r--r-- third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm 302
-rw-r--r-- third_party/aom/aom_dsp/x86/quantize_x86.h 202
-rw-r--r-- third_party/aom/aom_dsp/x86/sad4d_avx2.c 326
-rw-r--r-- third_party/aom/aom_dsp/x86/sad4d_sse2.asm 437
-rw-r--r-- third_party/aom/aom_dsp/x86/sad_avx2.c 219
-rw-r--r-- third_party/aom/aom_dsp/x86/sad_impl_avx2.c 181
-rw-r--r-- third_party/aom/aom_dsp/x86/sad_sse2.asm 432
-rw-r--r-- third_party/aom/aom_dsp/x86/sse_avx2.c 389
-rw-r--r-- third_party/aom/aom_dsp/x86/sse_sse4.c 355
-rw-r--r-- third_party/aom/aom_dsp/x86/ssim_sse2_x86_64.asm 222
-rw-r--r-- third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm 1470
-rw-r--r-- third_party/aom/aom_dsp/x86/subtract_avx2.c 109
-rw-r--r-- third_party/aom/aom_dsp/x86/subtract_sse2.asm 147
-rw-r--r-- third_party/aom/aom_dsp/x86/sum_squares_avx2.c 326
-rw-r--r-- third_party/aom/aom_dsp/x86/sum_squares_sse2.c 478
-rw-r--r-- third_party/aom/aom_dsp/x86/sum_squares_sse2.h 28
-rw-r--r-- third_party/aom/aom_dsp/x86/synonyms.h 134
-rw-r--r-- third_party/aom/aom_dsp/x86/synonyms_avx2.h 79
-rw-r--r-- third_party/aom/aom_dsp/x86/transpose_sse2.h 424
-rw-r--r-- third_party/aom/aom_dsp/x86/txfm_common_avx2.h 357
-rw-r--r-- third_party/aom/aom_dsp/x86/txfm_common_sse2.h 33
-rw-r--r-- third_party/aom/aom_dsp/x86/variance_avx2.c 961
-rw-r--r-- third_party/aom/aom_dsp/x86/variance_impl_avx2.c 924
-rw-r--r-- third_party/aom/aom_dsp/x86/variance_impl_ssse3.c 129
-rw-r--r-- third_party/aom/aom_dsp/x86/variance_sse2.c 802
115 files changed, 58370 insertions, 0 deletions
diff --git a/third_party/aom/aom_dsp/x86/adaptive_quantize_avx2.c b/third_party/aom/aom_dsp/x86/adaptive_quantize_avx2.c
new file mode 100644
index 0000000000..b3dede75d5
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/adaptive_quantize_avx2.c
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/quantize.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+static INLINE void load_b_values_avx2(const int16_t *zbin_ptr, __m256i *zbin,
+ const int16_t *round_ptr, __m256i *round,
+ const int16_t *quant_ptr, __m256i *quant,
+ const int16_t *dequant_ptr,
+ __m256i *dequant,
+ const int16_t *shift_ptr,
+ __m256i *shift) {
+ *zbin = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)zbin_ptr));
+ *zbin = _mm256_permute4x64_epi64(*zbin, 0x54);
+ *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1));
+ *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr));
+ *round = _mm256_permute4x64_epi64(*round, 0x54);
+ *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr));
+ *quant = _mm256_permute4x64_epi64(*quant, 0x54);
+ *dequant =
+ _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr));
+ *dequant = _mm256_permute4x64_epi64(*dequant, 0x54);
+ *shift = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)shift_ptr));
+ *shift = _mm256_permute4x64_epi64(*shift, 0x54);
+}
+
+static INLINE __m256i load_coefficients_avx2(const tran_low_t *coeff_ptr) {
+ const __m256i coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr));
+ const __m256i coeff2 = _mm256_load_si256((__m256i *)(coeff_ptr + 8));
+ return _mm256_packs_epi32(coeff1, coeff2);
+}
+
+static INLINE void update_mask1_avx2(__m256i *cmp_mask,
+ const int16_t *iscan_ptr, int *is_found,
+ __m256i *mask) {
+ __m256i temp_mask = _mm256_setzero_si256();
+ if (_mm256_movemask_epi8(*cmp_mask)) {
+ __m256i iscan = _mm256_loadu_si256((const __m256i *)(iscan_ptr));
+ temp_mask = _mm256_and_si256(*cmp_mask, iscan);
+ *is_found = 1;
+ }
+ *mask = _mm256_max_epi16(temp_mask, *mask);
+}
+
+static INLINE void update_mask0_avx2(__m256i *qcoeff, __m256i *threshold,
+ const int16_t *iscan_ptr, int *is_found,
+ __m256i *mask) {
+ __m256i zero = _mm256_setzero_si256();
+ __m256i coeff[2], cmp_mask0, cmp_mask1;
+ coeff[0] = _mm256_unpacklo_epi16(*qcoeff, zero);
+ coeff[1] = _mm256_unpackhi_epi16(*qcoeff, zero);
+ coeff[0] = _mm256_slli_epi32(coeff[0], AOM_QM_BITS);
+ cmp_mask0 = _mm256_cmpgt_epi32(coeff[0], threshold[0]);
+ coeff[1] = _mm256_slli_epi32(coeff[1], AOM_QM_BITS);
+ cmp_mask1 = _mm256_cmpgt_epi32(coeff[1], threshold[1]);
+ cmp_mask0 =
+ _mm256_permute4x64_epi64(_mm256_packs_epi32(cmp_mask0, cmp_mask1), 0xd8);
+ update_mask1_avx2(&cmp_mask0, iscan_ptr, is_found, mask);
+}
+
+static INLINE void calculate_qcoeff_avx2(__m256i *coeff, const __m256i *round,
+ const __m256i *quant,
+ const __m256i *shift) {
+ __m256i tmp, qcoeff;
+ qcoeff = _mm256_adds_epi16(*coeff, *round);
+ tmp = _mm256_mulhi_epi16(qcoeff, *quant);
+ qcoeff = _mm256_add_epi16(tmp, qcoeff);
+ *coeff = _mm256_mulhi_epi16(qcoeff, *shift);
+}
+
+static INLINE __m256i calculate_dqcoeff_avx2(__m256i qcoeff, __m256i dequant) {
+ return _mm256_mullo_epi16(qcoeff, dequant);
+}
+
+static INLINE void store_coefficients_avx2(__m256i coeff_vals,
+ tran_low_t *coeff_ptr) {
+ __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15);
+ __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign);
+ __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign);
+ _mm256_store_si256((__m256i *)(coeff_ptr), coeff_vals_lo);
+ _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi);
+}
+
+void aom_quantize_b_adaptive_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int index = 16;
+ int non_zero_count = 0;
+ int non_zero_count_prescan_add_zero = 0;
+ int is_found0 = 0, is_found1 = 0;
+ int eob = -1;
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i zbin, round, quant, dequant, shift;
+ __m256i coeff, qcoeff;
+ __m256i cmp_mask, mask0 = zero, mask1 = zero;
+ __m128i temp_mask0, temp_mask1;
+ int prescan_add[2];
+ int thresh[2];
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ for (int i = 0; i < 2; ++i) {
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+ thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1;
+ }
+ __m256i threshold[2];
+ threshold[0] = _mm256_set1_epi32(thresh[0]);
+ threshold[1] = _mm256_set1_epi32(thresh[1]);
+ threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe);
+
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;
+#endif
+
+ // Setup global values.
+ load_b_values_avx2(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+ dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+ // Do DC and first 15 AC.
+ coeff = load_coefficients_avx2(coeff_ptr);
+ qcoeff = _mm256_abs_epi16(coeff);
+ update_mask0_avx2(&qcoeff, threshold, iscan, &is_found0, &mask0);
+ __m256i temp0 = _mm256_cmpgt_epi16(qcoeff, zbin);
+ zbin = _mm256_unpackhi_epi64(zbin, zbin);
+ cmp_mask = _mm256_permute4x64_epi64(temp0, 0xd8);
+ update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1);
+ threshold[0] = threshold[1];
+ if (_mm256_movemask_epi8(cmp_mask) == 0) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr), zero);
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero);
+ round = _mm256_unpackhi_epi64(round, round);
+ quant = _mm256_unpackhi_epi64(quant, quant);
+ shift = _mm256_unpackhi_epi64(shift, shift);
+ dequant = _mm256_unpackhi_epi64(dequant, dequant);
+ } else {
+ calculate_qcoeff_avx2(&qcoeff, &round, &quant, &shift);
+ round = _mm256_unpackhi_epi64(round, round);
+ quant = _mm256_unpackhi_epi64(quant, quant);
+ shift = _mm256_unpackhi_epi64(shift, shift);
+ // Reinsert signs
+ qcoeff = _mm256_sign_epi16(qcoeff, coeff);
+ // Mask out zbin threshold coeffs
+ qcoeff = _mm256_and_si256(qcoeff, temp0);
+ store_coefficients_avx2(qcoeff, qcoeff_ptr);
+ coeff = calculate_dqcoeff_avx2(qcoeff, dequant);
+ dequant = _mm256_unpackhi_epi64(dequant, dequant);
+ store_coefficients_avx2(coeff, dqcoeff_ptr);
+ }
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff = load_coefficients_avx2(coeff_ptr + index);
+ qcoeff = _mm256_abs_epi16(coeff);
+ update_mask0_avx2(&qcoeff, threshold, iscan + index, &is_found0, &mask0);
+ temp0 = _mm256_cmpgt_epi16(qcoeff, zbin);
+ cmp_mask = _mm256_permute4x64_epi64(temp0, 0xd8);
+ update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1);
+ if (_mm256_movemask_epi8(cmp_mask) == 0) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero);
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero);
+ index += 16;
+ continue;
+ }
+ calculate_qcoeff_avx2(&qcoeff, &round, &quant, &shift);
+ qcoeff = _mm256_sign_epi16(qcoeff, coeff);
+ qcoeff = _mm256_and_si256(qcoeff, temp0);
+ store_coefficients_avx2(qcoeff, qcoeff_ptr + index);
+ coeff = calculate_dqcoeff_avx2(qcoeff, dequant);
+ store_coefficients_avx2(coeff, dqcoeff_ptr + index);
+ index += 16;
+ }
+ if (is_found0) {
+ temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0),
+ _mm256_extracti128_si256(mask0, 1));
+ non_zero_count = calculate_non_zero_count(temp_mask0);
+ }
+ if (is_found1) {
+ temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1),
+ _mm256_extracti128_si256(mask1, 1));
+ non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1);
+ }
+
+ for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+ const int rc = scan[i];
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+
+ for (int i = non_zero_count - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ if (qcoeff_ptr[rc]) {
+ eob = i;
+ break;
+ }
+ }
+
+ *eob_ptr = eob + 1;
+#if SKIP_EOB_FACTOR_ADJUST
+ // TODO(Aniket): Experiment the following loop with intrinsic by combining
+ // with the quantization loop above
+ for (int i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int qcoeff0 = qcoeff_ptr[rc];
+ if (qcoeff0) {
+ first = i;
+ break;
+ }
+ }
+ if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
+ const int rc = scan[(*eob_ptr - 1)];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const int coeff0 = coeff_ptr[rc] * wt;
+ const int coeff_sign = AOMSIGN(coeff0);
+ const int abs_coeff = (coeff0 ^ coeff_sign) - coeff_sign;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (abs_coeff <
+ (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ *eob_ptr = 0;
+ }
+ }
+ }
+#endif
+}
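
Note: calculate_qcoeff_avx2() above is the 16-lane SIMD form of the quantize_b arithmetic: add the rounding term (saturating), take the high 16 bits of the product with quant, add that back to the rounded value, then take the high 16 bits of the product with the quant-shift factor. A one-lane scalar sketch, written here only for illustration (it widens the intermediate add to 32 bits instead of wrapping like _mm256_add_epi16, and the helper name is hypothetical):

#include <stdint.h>

/* Scalar sketch of one lane of calculate_qcoeff_avx2(); illustrative only. */
static int16_t quantize_lane(int16_t abs_coeff, int16_t round, int16_t quant,
                             int16_t shift) {
  int32_t tmp = abs_coeff + round;              /* _mm256_adds_epi16 */
  if (tmp > INT16_MAX) tmp = INT16_MAX;         /* keep the saturation */
  const int32_t hi = (tmp * quant) >> 16;       /* _mm256_mulhi_epi16 */
  return (int16_t)(((hi + tmp) * shift) >> 16); /* add back, _mm256_mulhi_epi16 */
}

The caller then reapplies the sign with _mm256_sign_epi16() and zeroes every lane that did not pass the zbin comparison via the _mm256_and_si256() with temp0.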
diff --git a/third_party/aom/aom_dsp/x86/adaptive_quantize_sse2.c b/third_party/aom/aom_dsp/x86/adaptive_quantize_sse2.c
new file mode 100644
index 0000000000..503b9b4682
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/adaptive_quantize_sse2.c
@@ -0,0 +1,633 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/quantize.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+void aom_quantize_b_adaptive_sse2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int index = 16;
+ int non_zero_count = 0;
+ int non_zero_count_prescan_add_zero = 0;
+ int is_found0 = 0, is_found1 = 0;
+ int eob = -1;
+ const __m128i zero = _mm_setzero_si128();
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i all_zero;
+ __m128i mask0 = zero, mask1 = zero;
+
+ int prescan_add[2];
+ int thresh[4];
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ for (int i = 0; i < 2; ++i) {
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+ thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1;
+ }
+ thresh[2] = thresh[3] = thresh[1];
+ __m128i threshold[2];
+ threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
+ threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
+
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;
+#endif
+ // Setup global values.
+ load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+ dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_coefficients(coeff_ptr);
+ coeff1 = load_coefficients(coeff_ptr + 8);
+
+ // Poor man's abs().
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1);
+
+ threshold[0] = threshold[1];
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ // Reinsert signs
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr);
+ store_coefficients(qcoeff1, qcoeff_ptr + 8);
+
+ coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+ store_coefficients(coeff0, dqcoeff_ptr);
+ store_coefficients(coeff1, dqcoeff_ptr + 8);
+ }
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = load_coefficients(coeff_ptr + index);
+ coeff1 = load_coefficients(coeff_ptr + index + 8);
+
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0,
+ &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
+ index += 16;
+ continue;
+ }
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr + index);
+ store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
+
+ coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+ coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+ store_coefficients(coeff0, dqcoeff_ptr + index);
+ store_coefficients(coeff1, dqcoeff_ptr + index + 8);
+
+ index += 16;
+ }
+ if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
+ if (is_found1)
+ non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
+
+ for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+ const int rc = scan[i];
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+
+ for (int i = non_zero_count - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ if (qcoeff_ptr[rc]) {
+ eob = i;
+ break;
+ }
+ }
+
+ *eob_ptr = eob + 1;
+#if SKIP_EOB_FACTOR_ADJUST
+ // TODO(Aniket): Experiment the following loop with intrinsic by combining
+ // with the quantization loop above
+ for (int i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int qcoeff = qcoeff_ptr[rc];
+ if (qcoeff) {
+ first = i;
+ break;
+ }
+ }
+ if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
+ const int rc = scan[(*eob_ptr - 1)];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const int coeff = coeff_ptr[rc] * wt;
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (abs_coeff <
+ (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ *eob_ptr = 0;
+ }
+ }
+ }
+#endif
+}
+
+void aom_quantize_b_32x32_adaptive_sse2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int index = 16;
+ const int log_scale = 1;
+ int non_zero_count = 0;
+ int non_zero_count_prescan_add_zero = 0;
+ int is_found0 = 0, is_found1 = 0;
+ int eob = -1;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i log_scale_vec = _mm_set1_epi16(log_scale);
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i all_zero;
+ __m128i mask0 = zero, mask1 = zero;
+
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+ int prescan_add[2];
+ int thresh[4];
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ for (int i = 0; i < 2; ++i) {
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+ thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
+ }
+ thresh[2] = thresh[3] = thresh[1];
+ __m128i threshold[2];
+ threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
+ threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
+
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;
+#endif
+ // Setup global values.
+ zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+
+ // Shift with rounding.
+ zbin = _mm_add_epi16(zbin, log_scale_vec);
+ round = _mm_add_epi16(round, log_scale_vec);
+ zbin = _mm_srli_epi16(zbin, log_scale);
+ round = _mm_srli_epi16(round, log_scale);
+ zbin = _mm_sub_epi16(zbin, one);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_coefficients(coeff_ptr);
+ coeff1 = load_coefficients(coeff_ptr + 8);
+
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1);
+
+ threshold[0] = threshold[1];
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
+
+ // Reinsert signs
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr);
+ store_coefficients(qcoeff1, qcoeff_ptr + 8);
+
+ calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr,
+ &log_scale);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
+ dqcoeff_ptr + 8, &log_scale);
+ }
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = load_coefficients(coeff_ptr + index);
+ coeff1 = load_coefficients(coeff_ptr + index + 8);
+
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0,
+ &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
+ index += 16;
+ continue;
+ }
+ calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
+ calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
+
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr + index);
+ store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
+
+ calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero,
+ dqcoeff_ptr + index, &log_scale);
+ calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
+ dqcoeff_ptr + index + 8, &log_scale);
+ index += 16;
+ }
+ if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
+ if (is_found1)
+ non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
+
+ for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+ const int rc = scan[i];
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+
+ for (int i = non_zero_count - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ if (qcoeff_ptr[rc]) {
+ eob = i;
+ break;
+ }
+ }
+
+ *eob_ptr = eob + 1;
+#if SKIP_EOB_FACTOR_ADJUST
+ // TODO(Aniket): Experiment the following loop with intrinsic by combining
+ // with the quantization loop above
+ for (int i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int qcoeff = qcoeff_ptr[rc];
+ if (qcoeff) {
+ first = i;
+ break;
+ }
+ }
+ if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
+ const int rc = scan[(*eob_ptr - 1)];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const int coeff = coeff_ptr[rc] * wt;
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ *eob_ptr = 0;
+ }
+ }
+ }
+#endif
+}
+
+void aom_quantize_b_64x64_adaptive_sse2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int index = 16;
+ const int log_scale = 2;
+ int non_zero_count = 0;
+ int non_zero_count_prescan_add_zero = 0;
+ int is_found0 = 0, is_found1 = 0;
+ int eob = -1;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i log_scale_vec = _mm_set1_epi16(log_scale);
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i all_zero;
+ __m128i mask0 = zero, mask1 = zero;
+
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+ int prescan_add[2];
+ int thresh[4];
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ for (int i = 0; i < 2; ++i) {
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+ thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
+ }
+ thresh[2] = thresh[3] = thresh[1];
+ __m128i threshold[2];
+ threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
+ threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
+
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;
+#endif
+ // Setup global values.
+ zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+
+ // Shift with rounding.
+ zbin = _mm_add_epi16(zbin, log_scale_vec);
+ round = _mm_add_epi16(round, log_scale_vec);
+ zbin = _mm_srli_epi16(zbin, log_scale);
+ round = _mm_srli_epi16(round, log_scale);
+ zbin = _mm_sub_epi16(zbin, one);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_coefficients(coeff_ptr);
+ coeff1 = load_coefficients(coeff_ptr + 8);
+
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1);
+
+ threshold[0] = threshold[1];
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
+
+ // Reinsert signs
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr);
+ store_coefficients(qcoeff1, qcoeff_ptr + 8);
+
+ calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr,
+ &log_scale);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
+ dqcoeff_ptr + 8, &log_scale);
+ }
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = load_coefficients(coeff_ptr + index);
+ coeff1 = load_coefficients(coeff_ptr + index + 8);
+
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0,
+ &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
+ index += 16;
+ continue;
+ }
+ calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
+ calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
+
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr + index);
+ store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
+
+ calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero,
+ dqcoeff_ptr + index, &log_scale);
+ calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
+ dqcoeff_ptr + index + 8, &log_scale);
+ index += 16;
+ }
+ if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
+ if (is_found1)
+ non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
+
+ for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+ const int rc = scan[i];
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+
+ for (int i = non_zero_count - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ if (qcoeff_ptr[rc]) {
+ eob = i;
+ break;
+ }
+ }
+
+ *eob_ptr = eob + 1;
+#if SKIP_EOB_FACTOR_ADJUST
+ // TODO(Aniket): Experiment the following loop with intrinsic by combining
+ // with the quantization loop above
+ for (int i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int qcoeff = qcoeff_ptr[rc];
+ if (qcoeff) {
+ first = i;
+ break;
+ }
+ }
+ if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
+ const int rc = scan[(*eob_ptr - 1)];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const int coeff = coeff_ptr[rc] * wt;
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ *eob_ptr = 0;
+ }
+ }
+ }
+#endif
+}
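
The "Poor man's abs()" idiom used throughout this file is the identity abs(x) = (x ^ s) - s with s = x >> 15 (arithmetic shift, as _mm_srai_epi16 performs), so s is -1 for negative x and 0 otherwise; applying the same transform again with the saved sign mask restores the original sign, which is how the code reinserts signs after quantizing the magnitudes. A scalar illustration, using hypothetical helper names and assuming the same behavior as invert_sign_sse2():

#include <stdint.h>

/* All ones for negative x, all zeros otherwise (arithmetic shift assumed). */
static int16_t sign_mask16(int16_t x) { return (int16_t)(x >> 15); }

/* (x ^ s) - s: negates x when s is -1, returns x unchanged when s is 0. */
static int16_t apply_sign16(int16_t x, int16_t s) {
  return (int16_t)((x ^ s) - s);
}

/* abs(x)       == apply_sign16(x, sign_mask16(x))
 * restore sign == apply_sign16(abs_value, saved_sign_mask) */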
diff --git a/third_party/aom/aom_dsp/x86/aom_asm_stubs.c b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
new file mode 100644
index 0000000000..b08ec2546b
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_asm_stubs.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/convolve.h"
+
+#if HAVE_SSE2
+filter8_1dfunction aom_filter_block1d16_v8_sse2;
+filter8_1dfunction aom_filter_block1d16_h8_sse2;
+filter8_1dfunction aom_filter_block1d8_v8_sse2;
+filter8_1dfunction aom_filter_block1d8_h8_sse2;
+filter8_1dfunction aom_filter_block1d4_v8_sse2;
+filter8_1dfunction aom_filter_block1d4_h8_sse2;
+filter8_1dfunction aom_filter_block1d16_v4_sse2;
+filter8_1dfunction aom_filter_block1d16_h4_sse2;
+
+filter8_1dfunction aom_filter_block1d8_h4_sse2;
+filter8_1dfunction aom_filter_block1d8_v4_sse2;
+filter8_1dfunction aom_filter_block1d4_h4_sse2;
+filter8_1dfunction aom_filter_block1d4_v4_sse2;
+
+filter8_1dfunction aom_filter_block1d16_v2_sse2;
+filter8_1dfunction aom_filter_block1d16_h2_sse2;
+filter8_1dfunction aom_filter_block1d8_v2_sse2;
+filter8_1dfunction aom_filter_block1d8_h2_sse2;
+filter8_1dfunction aom_filter_block1d4_v2_sse2;
+filter8_1dfunction aom_filter_block1d4_h2_sse2;
+
+// void aom_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2)
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2)
+
+#if CONFIG_AV1_HIGHBITDEPTH
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_v8_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_h8_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_v8_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_h8_sse2;
+
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_v4_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_h4_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_v4_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_h4_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_v4_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_h4_sse2;
+
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_v2_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d16_h2_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_v2_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d8_h2_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_v2_sse2;
+highbd_filter8_1dfunction aom_highbd_filter_block1d4_h2_sse2;
+
+// void aom_highbd_convolve8_horiz_sse2(const uint8_t *src,
+// ptrdiff_t src_stride,
+// uint8_t *dst,
+// ptrdiff_t dst_stride,
+// const int16_t *filter_x,
+// int x_step_q4,
+// const int16_t *filter_y,
+// int y_step_q4,
+// int w, int h, int bd);
+// void aom_highbd_convolve8_vert_sse2(const uint8_t *src,
+// ptrdiff_t src_stride,
+// uint8_t *dst,
+// ptrdiff_t dst_stride,
+// const int16_t *filter_x,
+// int x_step_q4,
+// const int16_t *filter_y,
+// int y_step_q4,
+// int w, int h, int bd);
+HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2)
+HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2)
+#endif
+#endif // HAVE_SSE2
diff --git a/third_party/aom/aom_dsp/x86/aom_convolve_copy_avx2.c b/third_party/aom/aom_dsp/x86/aom_convolve_copy_avx2.c
new file mode 100644
index 0000000000..a1043828fe
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_convolve_copy_avx2.c
@@ -0,0 +1,256 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
+ __m256i s[4];
+ s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
+ s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
+ s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 32));
+ s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 32));
+ _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]);
+ _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]);
+ _mm256_storeu_si256((__m256i *)(dst + 2 * 32), s[2]);
+ _mm256_storeu_si256((__m256i *)(dst + 3 * 32), s[3]);
+}
+
+void aom_convolve_copy_avx2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
+ if (w >= 16) {
+ assert(!((intptr_t)dst % 16));
+ assert(!(dst_stride % 16));
+ }
+
+ if (w == 2) {
+ do {
+ memmove(dst, src, 2 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ memmove(dst, src, 2 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 4) {
+ do {
+ memmove(dst, src, 4 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ memmove(dst, src, 4 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 8) {
+ do {
+ __m128i s[2];
+ s[0] = _mm_loadl_epi64((__m128i *)src);
+ src += src_stride;
+ s[1] = _mm_loadl_epi64((__m128i *)src);
+ src += src_stride;
+ _mm_storel_epi64((__m128i *)dst, s[0]);
+ dst += dst_stride;
+ _mm_storel_epi64((__m128i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 16) {
+ do {
+ __m128i s[2];
+ s[0] = _mm_loadu_si128((__m128i *)src);
+ src += src_stride;
+ s[1] = _mm_loadu_si128((__m128i *)src);
+ src += src_stride;
+ _mm_store_si128((__m128i *)dst, s[0]);
+ dst += dst_stride;
+ _mm_store_si128((__m128i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 32) {
+ do {
+ __m256i s[2];
+ s[0] = _mm256_loadu_si256((__m256i *)src);
+ src += src_stride;
+ s[1] = _mm256_loadu_si256((__m256i *)src);
+ src += src_stride;
+ _mm256_storeu_si256((__m256i *)dst, s[0]);
+ dst += dst_stride;
+ _mm256_storeu_si256((__m256i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 64) {
+ do {
+ __m256i s[4];
+ s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
+ s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
+ src += src_stride;
+ s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
+ s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
+ src += src_stride;
+ _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]);
+ _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]);
+ dst += dst_stride;
+ _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[2]);
+ _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[3]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else {
+ do {
+ copy_128(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ copy_128(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+
+static INLINE void highbd_copy_64(const uint16_t *src, uint16_t *dst) {
+ __m256i s[4];
+ s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
+ s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
+ s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
+ s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
+ _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
+ _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
+ _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
+ _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
+}
+
+static INLINE void highbd_copy_128(const uint16_t *src, uint16_t *dst) {
+ __m256i s[8];
+ s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
+ s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
+ s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
+ s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
+ s[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
+ s[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 16));
+ s[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 16));
+ s[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 16));
+
+ _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
+ _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
+ _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
+ _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
+ _mm256_storeu_si256((__m256i *)(dst + 4 * 16), s[4]);
+ _mm256_storeu_si256((__m256i *)(dst + 5 * 16), s[5]);
+ _mm256_storeu_si256((__m256i *)(dst + 6 * 16), s[6]);
+ _mm256_storeu_si256((__m256i *)(dst + 7 * 16), s[7]);
+}
+
+void aom_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride, int w,
+ int h) {
+ if (w >= 16) {
+ assert(!((intptr_t)dst % 16));
+ assert(!(dst_stride % 16));
+ }
+
+ if (w == 2) {
+ do {
+ memmove(dst, src, 2 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ memmove(dst, src, 2 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 4) {
+ do {
+ __m128i s[2];
+ s[0] = _mm_loadl_epi64((__m128i *)src);
+ src += src_stride;
+ s[1] = _mm_loadl_epi64((__m128i *)src);
+ src += src_stride;
+ _mm_storel_epi64((__m128i *)dst, s[0]);
+ dst += dst_stride;
+ _mm_storel_epi64((__m128i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 8) {
+ do {
+ __m128i s[2];
+ s[0] = _mm_loadu_si128((__m128i *)src);
+ src += src_stride;
+ s[1] = _mm_loadu_si128((__m128i *)src);
+ src += src_stride;
+ _mm_store_si128((__m128i *)dst, s[0]);
+ dst += dst_stride;
+ _mm_store_si128((__m128i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 16) {
+ do {
+ __m256i s[2];
+ s[0] = _mm256_loadu_si256((__m256i *)src);
+ src += src_stride;
+ s[1] = _mm256_loadu_si256((__m256i *)src);
+ src += src_stride;
+ _mm256_storeu_si256((__m256i *)dst, s[0]);
+ dst += dst_stride;
+ _mm256_storeu_si256((__m256i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 32) {
+ do {
+ __m256i s[4];
+ s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
+ s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
+ src += src_stride;
+ s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
+ s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
+ src += src_stride;
+ _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
+ _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
+ dst += dst_stride;
+ _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[2]);
+ _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[3]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 64) {
+ do {
+ highbd_copy_64(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ highbd_copy_64(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else {
+ assert(w == 128);
+ do {
+ highbd_copy_128(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ highbd_copy_128(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ }
+}
+
+#endif // CONFIG_AV1_HIGHBITDEPTH
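
The copy kernels above process two rows per loop iteration and, for widths of 16 and up, assert that the destination pointer and stride are 16-byte aligned before using the 128-bit aligned-store paths. A minimal scalar equivalent, given only to make that contract explicit (it is not the project's C reference implementation):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Copy an h-row block of w bytes, one row at a time. The SIMD paths above
 * additionally assume h is even, and for w >= 16 that dst and dst_stride are
 * 16-byte aligned. */
static void convolve_copy_scalar(const uint8_t *src, ptrdiff_t src_stride,
                                 uint8_t *dst, ptrdiff_t dst_stride, int w,
                                 int h) {
  do {
    memcpy(dst, src, (size_t)w);
    src += src_stride;
    dst += dst_stride;
  } while (--h);
}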
diff --git a/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.c b/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.c
new file mode 100644
index 0000000000..e78845e97c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_convolve_copy_sse2.c
@@ -0,0 +1,308 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void copy_128(const uint8_t *src, uint8_t *dst) {
+ __m128i s[8];
+ s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+ s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+ s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
+ s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
+ s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 16));
+ s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 16));
+ s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 16));
+ s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 16));
+ _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
+ _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
+ _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]);
+ _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]);
+ _mm_store_si128((__m128i *)(dst + 4 * 16), s[4]);
+ _mm_store_si128((__m128i *)(dst + 5 * 16), s[5]);
+ _mm_store_si128((__m128i *)(dst + 6 * 16), s[6]);
+ _mm_store_si128((__m128i *)(dst + 7 * 16), s[7]);
+}
+
+void aom_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
+ if (w >= 16) {
+ assert(!((intptr_t)dst % 16));
+ assert(!(dst_stride % 16));
+ }
+
+ if (w == 2) {
+ do {
+ memmove(dst, src, 2 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ memmove(dst, src, 2 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 4) {
+ do {
+ memmove(dst, src, 4 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ memmove(dst, src, 4 * sizeof(*src));
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 8) {
+ do {
+ __m128i s[2];
+ s[0] = _mm_loadl_epi64((__m128i *)src);
+ src += src_stride;
+ s[1] = _mm_loadl_epi64((__m128i *)src);
+ src += src_stride;
+ _mm_storel_epi64((__m128i *)dst, s[0]);
+ dst += dst_stride;
+ _mm_storel_epi64((__m128i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 16) {
+ do {
+ __m128i s[2];
+ s[0] = _mm_loadu_si128((__m128i *)src);
+ src += src_stride;
+ s[1] = _mm_loadu_si128((__m128i *)src);
+ src += src_stride;
+ _mm_store_si128((__m128i *)dst, s[0]);
+ dst += dst_stride;
+ _mm_store_si128((__m128i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 32) {
+ do {
+ __m128i s[4];
+ s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+ s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+ src += src_stride;
+ s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+ s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+ src += src_stride;
+ _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
+ _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
+ dst += dst_stride;
+ _mm_store_si128((__m128i *)(dst + 0 * 16), s[2]);
+ _mm_store_si128((__m128i *)(dst + 1 * 16), s[3]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 64) {
+ do {
+ __m128i s[8];
+ s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+ s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+ s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
+ s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
+ src += src_stride;
+ s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
+ s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
+ s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
+ s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
+ src += src_stride;
+ _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
+ _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
+ _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]);
+ _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]);
+ dst += dst_stride;
+ _mm_store_si128((__m128i *)(dst + 0 * 16), s[4]);
+ _mm_store_si128((__m128i *)(dst + 1 * 16), s[5]);
+ _mm_store_si128((__m128i *)(dst + 2 * 16), s[6]);
+ _mm_store_si128((__m128i *)(dst + 3 * 16), s[7]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else {
+ do {
+ copy_128(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ copy_128(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ }
+}
+
+static INLINE void highbd_copy_64(const uint16_t *src, uint16_t *dst) {
+ __m128i s[8];
+ s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+ s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+ s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
+ s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
+ s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 8));
+ s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 8));
+ s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 8));
+ s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 8));
+ _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
+ _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
+ _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
+ _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
+ _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]);
+ _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]);
+ _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]);
+ _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]);
+}
+
+static INLINE void highbd_copy_128(const uint16_t *src, uint16_t *dst) {
+ __m128i s[16];
+ s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+ s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+ s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
+ s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
+ s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 8));
+ s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 8));
+ s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 8));
+ s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 8));
+ s[8] = _mm_loadu_si128((__m128i *)(src + 8 * 8));
+ s[9] = _mm_loadu_si128((__m128i *)(src + 9 * 8));
+ s[10] = _mm_loadu_si128((__m128i *)(src + 10 * 8));
+ s[11] = _mm_loadu_si128((__m128i *)(src + 11 * 8));
+ s[12] = _mm_loadu_si128((__m128i *)(src + 12 * 8));
+ s[13] = _mm_loadu_si128((__m128i *)(src + 13 * 8));
+ s[14] = _mm_loadu_si128((__m128i *)(src + 14 * 8));
+ s[15] = _mm_loadu_si128((__m128i *)(src + 15 * 8));
+ _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
+ _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
+ _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
+ _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
+ _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]);
+ _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]);
+ _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]);
+ _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]);
+ _mm_store_si128((__m128i *)(dst + 8 * 8), s[8]);
+ _mm_store_si128((__m128i *)(dst + 9 * 8), s[9]);
+ _mm_store_si128((__m128i *)(dst + 10 * 8), s[10]);
+ _mm_store_si128((__m128i *)(dst + 11 * 8), s[11]);
+ _mm_store_si128((__m128i *)(dst + 12 * 8), s[12]);
+ _mm_store_si128((__m128i *)(dst + 13 * 8), s[13]);
+ _mm_store_si128((__m128i *)(dst + 14 * 8), s[14]);
+ _mm_store_si128((__m128i *)(dst + 15 * 8), s[15]);
+}
+
+void aom_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride,
+ uint16_t *dst, ptrdiff_t dst_stride, int w,
+ int h) {
+ if (w >= 16) {
+ assert(!((intptr_t)dst % 16));
+ assert(!(dst_stride % 16));
+ }
+
+ if (w == 2) {
+ do {
+ __m128i s = _mm_loadl_epi64((__m128i *)src);
+ *(int *)dst = _mm_cvtsi128_si32(s);
+ src += src_stride;
+ dst += dst_stride;
+ s = _mm_loadl_epi64((__m128i *)src);
+ *(int *)dst = _mm_cvtsi128_si32(s);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 4) {
+ do {
+ __m128i s[2];
+ s[0] = _mm_loadl_epi64((__m128i *)src);
+ src += src_stride;
+ s[1] = _mm_loadl_epi64((__m128i *)src);
+ src += src_stride;
+ _mm_storel_epi64((__m128i *)dst, s[0]);
+ dst += dst_stride;
+ _mm_storel_epi64((__m128i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 8) {
+ do {
+ __m128i s[2];
+ s[0] = _mm_loadu_si128((__m128i *)src);
+ src += src_stride;
+ s[1] = _mm_loadu_si128((__m128i *)src);
+ src += src_stride;
+ _mm_store_si128((__m128i *)dst, s[0]);
+ dst += dst_stride;
+ _mm_store_si128((__m128i *)dst, s[1]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 16) {
+ do {
+ __m128i s[4];
+ s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+ s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+ src += src_stride;
+ s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+ s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+ src += src_stride;
+ _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
+ _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
+ dst += dst_stride;
+ _mm_store_si128((__m128i *)(dst + 0 * 8), s[2]);
+ _mm_store_si128((__m128i *)(dst + 1 * 8), s[3]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 32) {
+ do {
+ __m128i s[8];
+ s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+ s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+ s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
+ s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
+ src += src_stride;
+ s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
+ s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
+ s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
+ s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
+ src += src_stride;
+ _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
+ _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
+ _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
+ _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
+ dst += dst_stride;
+ _mm_store_si128((__m128i *)(dst + 0 * 8), s[4]);
+ _mm_store_si128((__m128i *)(dst + 1 * 8), s[5]);
+ _mm_store_si128((__m128i *)(dst + 2 * 8), s[6]);
+ _mm_store_si128((__m128i *)(dst + 3 * 8), s[7]);
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else if (w == 64) {
+ do {
+ highbd_copy_64(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ highbd_copy_64(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ } else {
+ do {
+ highbd_copy_128(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ highbd_copy_128(src, dst);
+ src += src_stride;
+ dst += dst_stride;
+ h -= 2;
+ } while (h);
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
new file mode 100644
index 0000000000..d392225906
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm
@@ -0,0 +1,613 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+%include "aom_ports/x86_abi_support.asm"
+
+;Note: tap3 and tap4 have to be applied and added after other taps to avoid
+;overflow.
+
+%macro HIGH_GET_FILTERS_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rcx, 0x00000040
+
+ movdqa xmm7, [rdx] ;load filters
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ psrldq xmm7, 8
+ pshuflw xmm4, xmm7, 0b ;k4
+ pshuflw xmm5, xmm7, 01010101b ;k5
+ pshuflw xmm6, xmm7, 10101010b ;k6
+ pshuflw xmm7, xmm7, 11111111b ;k7
+
+ punpcklwd xmm0, xmm6
+ punpcklwd xmm2, xmm5
+ punpcklwd xmm3, xmm4
+ punpcklwd xmm1, xmm7
+
+ movdqa k0k6, xmm0
+ movdqa k2k5, xmm2
+ movdqa k3k4, xmm3
+ movdqa k1k7, xmm1
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0
+ movdqa krd, xmm6
+
+ ;Compute max and min values of a pixel
+ mov rdx, 0x00010001
+ movsxd rcx, DWORD PTR arg(6) ;bps
+ movq xmm0, rdx
+ movq xmm1, rcx
+ pshufd xmm0, xmm0, 0b
+ movdqa xmm2, xmm0
+ psllw xmm0, xmm1
+ psubw xmm0, xmm2
+ pxor xmm1, xmm1
+ movdqa max, xmm0 ;max value (for clamping)
+ movdqa min, xmm1 ;min value (for clamping)
+
+%endm
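+
+; Note: the clamp values stored above reduce to max = (1 << bps) - 1 and
+; min = 0. Every word starts as 0x0001, is shifted left by bps (arg(6)) and
+; then has 1 subtracted, e.g. bps = 10 clamps results to [0, 1023].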
+
+%macro HIGH_APPLY_FILTER_4 1
+    punpcklwd   xmm0, xmm6                  ;two rows in one register
+ punpcklwd xmm1, xmm7
+ punpcklwd xmm2, xmm5
+ punpcklwd xmm3, xmm4
+
+ pmaddwd xmm0, k0k6 ;multiply the filter factors
+ pmaddwd xmm1, k1k7
+ pmaddwd xmm2, k2k5
+ pmaddwd xmm3, k3k4
+
+ paddd xmm0, xmm1 ;sum
+ paddd xmm0, xmm2
+ paddd xmm0, xmm3
+
+ paddd xmm0, krd ;rounding
+ psrad xmm0, 7 ;shift
+ packssdw xmm0, xmm0 ;pack to word
+
+ ;clamp the values
+ pminsw xmm0, max
+ pmaxsw xmm0, min
+
+%if %1
+ movq xmm1, [rdi]
+ pavgw xmm0, xmm1
+%endif
+ movq [rdi], xmm0
+%endm
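+
+; Net effect of HIGH_APPLY_FILTER_4 (krd above is 64, loaded from rcx = 0x40):
+;   out[i] = clamp((k0*p0 + k1*p1 + ... + k7*p7 + 64) >> 7, 0, max)
+; where p0..p7 are the eight source samples the caller loaded into xmm0..xmm7.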
+
+%macro HIGH_GET_FILTERS 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x00000040
+
+ movdqa xmm7, [rdx] ;load filters
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ pshufhw xmm4, xmm7, 0b ;k4
+ pshufhw xmm5, xmm7, 01010101b ;k5
+ pshufhw xmm6, xmm7, 10101010b ;k6
+ pshufhw xmm7, xmm7, 11111111b ;k7
+ punpcklqdq xmm2, xmm2
+ punpcklqdq xmm3, xmm3
+ punpcklwd xmm0, xmm1
+ punpckhwd xmm6, xmm7
+ punpckhwd xmm2, xmm5
+ punpckhwd xmm3, xmm4
+
+ movdqa k0k1, xmm0 ;store filter factors on stack
+ movdqa k6k7, xmm6
+ movdqa k2k5, xmm2
+ movdqa k3k4, xmm3
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0
+ movdqa krd, xmm6 ;rounding
+
+ ;Compute max and min values of a pixel
+ mov rdx, 0x00010001
+ movsxd rcx, DWORD PTR arg(6) ;bps
+ movq xmm0, rdx
+ movq xmm1, rcx
+ pshufd xmm0, xmm0, 0b
+ movdqa xmm2, xmm0
+ psllw xmm0, xmm1
+ psubw xmm0, xmm2
+ pxor xmm1, xmm1
+ movdqa max, xmm0 ;max value (for clamping)
+ movdqa min, xmm1 ;min value (for clamping)
+%endm
+
+%macro LOAD_VERT_8 1
+ movdqu xmm0, [rsi + %1] ;0
+ movdqu xmm1, [rsi + rax + %1] ;1
+ movdqu xmm6, [rsi + rdx * 2 + %1] ;6
+ lea rsi, [rsi + rax]
+ movdqu xmm7, [rsi + rdx * 2 + %1] ;7
+ movdqu xmm2, [rsi + rax + %1] ;2
+ movdqu xmm3, [rsi + rax * 2 + %1] ;3
+ movdqu xmm4, [rsi + rdx + %1] ;4
+ movdqu xmm5, [rsi + rax * 4 + %1] ;5
+%endm
+
+%macro HIGH_APPLY_FILTER_8 2
+ movdqu temp, xmm4
+ movdqa xmm4, xmm0
+ punpcklwd xmm0, xmm1
+ punpckhwd xmm4, xmm1
+ movdqa xmm1, xmm6
+ punpcklwd xmm6, xmm7
+ punpckhwd xmm1, xmm7
+ movdqa xmm7, xmm2
+ punpcklwd xmm2, xmm5
+ punpckhwd xmm7, xmm5
+
+ movdqu xmm5, temp
+ movdqu temp, xmm4
+ movdqa xmm4, xmm3
+ punpcklwd xmm3, xmm5
+ punpckhwd xmm4, xmm5
+ movdqu xmm5, temp
+
+ pmaddwd xmm0, k0k1
+ pmaddwd xmm5, k0k1
+ pmaddwd xmm6, k6k7
+ pmaddwd xmm1, k6k7
+ pmaddwd xmm2, k2k5
+ pmaddwd xmm7, k2k5
+ pmaddwd xmm3, k3k4
+ pmaddwd xmm4, k3k4
+
+ paddd xmm0, xmm6
+ paddd xmm0, xmm2
+ paddd xmm0, xmm3
+ paddd xmm5, xmm1
+ paddd xmm5, xmm7
+ paddd xmm5, xmm4
+
+ paddd xmm0, krd ;rounding
+ paddd xmm5, krd
+ psrad xmm0, 7 ;shift
+ psrad xmm5, 7
+ packssdw xmm0, xmm5 ;pack back to word
+
+ ;clamp the values
+ pminsw xmm0, max
+ pmaxsw xmm0, min
+
+%if %1
+ movdqu xmm1, [rdi + %2]
+ pavgw xmm0, xmm1
+%endif
+ movdqu [rdi + %2], xmm0
+%endm
+
+SECTION .text
+
+;void aom_highbd_filter_block1d4_v8_sse2
+;(
+;    unsigned short *src_ptr,
+;    unsigned int src_pitch,
+;    unsigned short *output_ptr,
+;    unsigned int out_pitch,
+;    unsigned int output_height,
+;    short *filter,
+;    unsigned int bd
+;)
+globalsym(aom_highbd_filter_block1d4_v8_sse2)
+sym(aom_highbd_filter_block1d4_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 7
+ %define k0k6 [rsp + 16 * 0]
+ %define k2k5 [rsp + 16 * 1]
+ %define k3k4 [rsp + 16 * 2]
+ %define k1k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define max [rsp + 16 * 5]
+ %define min [rsp + 16 * 6]
+
+ HIGH_GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movq xmm0, [rsi] ;load src: row 0
+ movq xmm1, [rsi + rax] ;1
+ movq xmm6, [rsi + rdx * 2] ;6
+ lea rsi, [rsi + rax]
+ movq xmm7, [rsi + rdx * 2] ;7
+ movq xmm2, [rsi + rax] ;2
+ movq xmm3, [rsi + rax * 2] ;3
+ movq xmm4, [rsi + rdx] ;4
+ movq xmm5, [rsi + rax * 4] ;5
+
+ HIGH_APPLY_FILTER_4 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 7
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_highbd_filter_block1d8_v8_sse2
+;(
+;    unsigned short *src_ptr,
+;    unsigned int src_pitch,
+;    unsigned short *output_ptr,
+;    unsigned int out_pitch,
+;    unsigned int output_height,
+;    short *filter,
+;    unsigned int bd
+;)
+globalsym(aom_highbd_filter_block1d8_v8_sse2)
+sym(aom_highbd_filter_block1d8_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ HIGH_APPLY_FILTER_8 0, 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_highbd_filter_block1d16_v8_sse2
+;(
+;    unsigned short *src_ptr,
+;    unsigned int src_pitch,
+;    unsigned short *output_ptr,
+;    unsigned int out_pitch,
+;    unsigned int output_height,
+;    short *filter,
+;    unsigned int bd
+;)
+globalsym(aom_highbd_filter_block1d16_v8_sse2)
+sym(aom_highbd_filter_block1d16_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rbx, [rbx + rbx]
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ HIGH_APPLY_FILTER_8 0, 0
+ sub rsi, rax
+
+ LOAD_VERT_8 16
+ HIGH_APPLY_FILTER_8 0, 16
+ add rdi, rbx
+
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_highbd_filter_block1d4_h8_sse2
+;(
+;    unsigned short *src_ptr,
+;    unsigned int src_pixels_per_line,
+;    unsigned short *output_ptr,
+;    unsigned int output_pitch,
+;    unsigned int output_height,
+;    short *filter,
+;    unsigned int bd
+;)
+globalsym(aom_highbd_filter_block1d4_h8_sse2)
+sym(aom_highbd_filter_block1d4_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 7
+ %define k0k6 [rsp + 16 * 0]
+ %define k2k5 [rsp + 16 * 1]
+ %define k3k4 [rsp + 16 * 2]
+ %define k1k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define max [rsp + 16 * 5]
+ %define min [rsp + 16 * 6]
+
+ HIGH_GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm4, [rsi + 2]
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm4
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm4
+
+ psrldq xmm1, 2
+ psrldq xmm6, 4
+ psrldq xmm7, 6
+ psrldq xmm2, 4
+ psrldq xmm3, 6
+ psrldq xmm5, 2
+
+ HIGH_APPLY_FILTER_4 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 7
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_highbd_filter_block1d8_h8_sse2
+;(
+;    unsigned short *src_ptr,
+;    unsigned int src_pixels_per_line,
+;    unsigned short *output_ptr,
+;    unsigned int output_pitch,
+;    unsigned int output_height,
+;    short *filter,
+;    unsigned int bd
+;)
+globalsym(aom_highbd_filter_block1d8_h8_sse2)
+sym(aom_highbd_filter_block1d8_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm1, [rsi - 4]
+ movdqu xmm2, [rsi - 2]
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi + 2]
+ movdqu xmm5, [rsi + 4]
+ movdqu xmm6, [rsi + 6]
+ movdqu xmm7, [rsi + 8]
+
+ HIGH_APPLY_FILTER_8 0, 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_highbd_filter_block1d16_h8_sse2
+;(
+;    unsigned short *src_ptr,
+;    unsigned int src_pixels_per_line,
+;    unsigned short *output_ptr,
+;    unsigned int output_pitch,
+;    unsigned int output_height,
+;    short *filter,
+;    unsigned int bd
+;)
+globalsym(aom_highbd_filter_block1d16_h8_sse2)
+sym(aom_highbd_filter_block1d16_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 8
+ %define k0k1 [rsp + 16 * 0]
+ %define k6k7 [rsp + 16 * 1]
+ %define k2k5 [rsp + 16 * 2]
+ %define k3k4 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define temp [rsp + 16 * 5]
+ %define max [rsp + 16 * 6]
+ %define min [rsp + 16 * 7]
+
+ HIGH_GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ lea rax, [rax + rax] ;bytes per line
+ lea rdx, [rdx + rdx]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 6] ;load src
+ movdqu xmm1, [rsi - 4]
+ movdqu xmm2, [rsi - 2]
+ movdqu xmm3, [rsi]
+ movdqu xmm4, [rsi + 2]
+ movdqu xmm5, [rsi + 4]
+ movdqu xmm6, [rsi + 6]
+ movdqu xmm7, [rsi + 8]
+
+ HIGH_APPLY_FILTER_8 0, 0
+
+ movdqu xmm0, [rsi + 10] ;load src
+ movdqu xmm1, [rsi + 12]
+ movdqu xmm2, [rsi + 14]
+ movdqu xmm3, [rsi + 16]
+ movdqu xmm4, [rsi + 18]
+ movdqu xmm5, [rsi + 20]
+ movdqu xmm6, [rsi + 22]
+ movdqu xmm7, [rsi + 24]
+
+ HIGH_APPLY_FILTER_8 0, 16
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 8
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
new file mode 100644
index 0000000000..db4cad9bcb
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm
@@ -0,0 +1,367 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+%macro HIGH_GET_PARAM_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x00000040
+
+ movdqa xmm3, [rdx] ;load filters
+ pshuflw xmm4, xmm3, 11111111b ;k3
+ psrldq xmm3, 8
+ pshuflw xmm3, xmm3, 0b ;k4
+ punpcklwd xmm4, xmm3 ;k3k4
+
+ movq xmm3, rcx ;rounding
+ pshufd xmm3, xmm3, 0
+
+ mov rdx, 0x00010001
+ movsxd rcx, DWORD PTR arg(6) ;bps
+ movq xmm5, rdx
+ movq xmm2, rcx
+ pshufd xmm5, xmm5, 0b
+ movdqa xmm1, xmm5
+ psllw xmm5, xmm2
+ psubw xmm5, xmm1 ;max value (for clamping)
+ pxor xmm2, xmm2 ;min value (for clamping)
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro HIGH_APPLY_FILTER_4 1
+
+    punpcklwd   xmm0, xmm1                  ;two rows in one register
+ pmaddwd xmm0, xmm4 ;multiply the filter factors
+
+ paddd xmm0, xmm3 ;rounding
+ psrad xmm0, 7 ;shift
+ packssdw xmm0, xmm0 ;pack to word
+
+ ;clamp the values
+ pminsw xmm0, xmm5
+ pmaxsw xmm0, xmm2
+
+%if %1
+ movq xmm1, [rdi]
+ pavgw xmm0, xmm1
+%endif
+
+ movq [rdi], xmm0
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ dec rcx
+%endm
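+
+; HIGH_APPLY_FILTER_4 computes a bilinear blend with the two taps selected by
+; HIGH_GET_PARAM_4 (k3 and k4 of the 8-tap array):
+;   out[i] = clamp((k3*p0[i] + k4*p1[i] + 64) >> 7, 0, max)
+; with p0 in xmm0 and p1 in xmm1, both supplied by the caller.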
+
+%macro HIGH_GET_PARAM 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x00000040
+
+ movdqa xmm6, [rdx] ;load filters
+
+ pshuflw xmm7, xmm6, 11111111b ;k3
+ pshufhw xmm6, xmm6, 0b ;k4
+ psrldq xmm6, 8
+ punpcklwd xmm7, xmm6 ;k3k4k3k4k3k4k3k4
+
+ movq xmm4, rcx ;rounding
+ pshufd xmm4, xmm4, 0
+
+ mov rdx, 0x00010001
+ movsxd rcx, DWORD PTR arg(6) ;bps
+ movq xmm3, rdx
+ movq xmm5, rcx
+ pshufd xmm3, xmm3, 0b
+ movdqa xmm1, xmm3
+ psllw xmm3, xmm5
+ psubw xmm3, xmm1 ;max value (for clamping)
+ pxor xmm5, xmm5 ;min value (for clamping)
+
+ movdqa max, xmm3
+ movdqa min, xmm5
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro HIGH_APPLY_FILTER_8 1
+ movdqa xmm6, xmm0
+ punpckhwd xmm6, xmm1
+ punpcklwd xmm0, xmm1
+ pmaddwd xmm6, xmm7
+ pmaddwd xmm0, xmm7
+
+ paddd xmm6, xmm4 ;rounding
+ paddd xmm0, xmm4 ;rounding
+ psrad xmm6, 7 ;shift
+ psrad xmm0, 7 ;shift
+ packssdw xmm0, xmm6 ;pack back to word
+
+ ;clamp the values
+ pminsw xmm0, max
+ pmaxsw xmm0, min
+
+%if %1
+ movdqu xmm1, [rdi]
+ pavgw xmm0, xmm1
+%endif
+ movdqu [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ dec rcx
+%endm
+
+%macro HIGH_APPLY_FILTER_16 1
+ movdqa xmm5, xmm0
+ movdqa xmm6, xmm2
+ punpckhwd xmm5, xmm1
+ punpckhwd xmm6, xmm3
+ punpcklwd xmm0, xmm1
+ punpcklwd xmm2, xmm3
+
+ pmaddwd xmm5, xmm7
+ pmaddwd xmm6, xmm7
+ pmaddwd xmm0, xmm7
+ pmaddwd xmm2, xmm7
+
+ paddd xmm5, xmm4 ;rounding
+ paddd xmm6, xmm4
+ paddd xmm0, xmm4
+ paddd xmm2, xmm4
+
+ psrad xmm5, 7 ;shift
+ psrad xmm6, 7
+ psrad xmm0, 7
+ psrad xmm2, 7
+
+ packssdw xmm0, xmm5 ;pack back to word
+ packssdw xmm2, xmm6 ;pack back to word
+
+ ;clamp the values
+ pminsw xmm0, max
+ pmaxsw xmm0, min
+ pminsw xmm2, max
+ pmaxsw xmm2, min
+
+%if %1
+ movdqu xmm1, [rdi]
+ movdqu xmm3, [rdi + 16]
+ pavgw xmm0, xmm1
+ pavgw xmm2, xmm3
+%endif
+ movdqu [rdi], xmm0 ;store the result
+ movdqu [rdi + 16], xmm2 ;store the result
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ dec rcx
+%endm
+
+SECTION .text
+
+globalsym(aom_highbd_filter_block1d4_v2_sse2)
+sym(aom_highbd_filter_block1d4_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM_4
+.loop:
+ movq xmm0, [rsi] ;load src
+ movq xmm1, [rsi + 2*rax]
+
+ HIGH_APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_highbd_filter_block1d8_v2_sse2)
+sym(aom_highbd_filter_block1d8_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 8
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 2
+ %define max [rsp + 16 * 0]
+ %define min [rsp + 16 * 1]
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + 2*rax] ;1
+
+ HIGH_APPLY_FILTER_8 0
+ jnz .loop
+
+ add rsp, 16 * 2
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_highbd_filter_block1d16_v2_sse2)
+sym(aom_highbd_filter_block1d16_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 9
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 2
+ %define max [rsp + 16 * 0]
+ %define min [rsp + 16 * 1]
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm2, [rsi + 16]
+ movdqu xmm1, [rsi + 2*rax] ;1
+ movdqu xmm3, [rsi + 2*rax + 16]
+
+ HIGH_APPLY_FILTER_16 0
+ jnz .loop
+
+ add rsp, 16 * 2
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_highbd_filter_block1d4_h2_sse2)
+sym(aom_highbd_filter_block1d4_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ HIGH_GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 2
+
+ HIGH_APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_highbd_filter_block1d8_h2_sse2)
+sym(aom_highbd_filter_block1d8_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 8
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 2
+ %define max [rsp + 16 * 0]
+ %define min [rsp + 16 * 1]
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 2]
+
+ HIGH_APPLY_FILTER_8 0
+ jnz .loop
+
+ add rsp, 16 * 2
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_highbd_filter_block1d16_h2_sse2)
+sym(aom_highbd_filter_block1d16_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 7
+ SAVE_XMM 9
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 2
+ %define max [rsp + 16 * 0]
+ %define min [rsp + 16 * 1]
+
+ HIGH_GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 2]
+ movdqu xmm2, [rsi + 16]
+ movdqu xmm3, [rsi + 18]
+
+ HIGH_APPLY_FILTER_16 0
+ jnz .loop
+
+ add rsp, 16 * 2
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/aom_dsp/x86/aom_quantize_avx.c b/third_party/aom/aom_dsp/x86/aom_quantize_avx.c
new file mode 100644
index 0000000000..b2d6d4b76d
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_quantize_avx.c
@@ -0,0 +1,282 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/bitdepth_conversion_sse2.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant,
+ tran_low_t *dqcoeff) {
+ const __m128i low = _mm_mullo_epi16(qcoeff, dequant);
+ const __m128i high = _mm_mulhi_epi16(qcoeff, dequant);
+
+ const __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high);
+ const __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high);
+
+ _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0);
+ _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1);
+}
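+
+/* The mullo/mulhi pair above reconstructs the full signed 32-bit product of
+ * each 16-bit qcoeff and dequant lane, and unpacking the low/high halves
+ * interleaves them back into per-coefficient 32-bit values. A scalar sketch
+ * of the same computation, where q[] and dq[] stand for the eight 16-bit
+ * lanes (names illustrative only):
+ *
+ *   for (int i = 0; i < 8; ++i)
+ *     dqcoeff[i] = (int32_t)q[i] * dq[i];
+ */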
+
+void aom_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m256i big_zero = _mm256_setzero_si256();
+ int index;
+
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i all_zero;
+ __m128i eob = zero, eob0;
+
+ (void)scan;
+
+ *eob_ptr = 0;
+
+ load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+ dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_tran_low(coeff_ptr);
+ coeff1 = load_tran_low(coeff_ptr + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_test_all_zeros(all_zero, all_zero)) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero);
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero);
+
+ if (n_coeffs == 16) return;
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ // Reinsert signs
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+    // Zero out coefficients that did not exceed the zbin threshold.
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr);
+ store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+ calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
+
+ eob =
+ scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ }
+
+ // AC only loop.
+ for (index = 16; index < n_coeffs; index += 16) {
+ coeff0 = load_tran_low(coeff_ptr + index);
+ coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_test_all_zeros(all_zero, all_zero)) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero);
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero);
+ continue;
+ }
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr + index);
+ store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+ calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
+ calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
+
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
+ zero);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
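+
+/* The zbin/round/quant/dequant/shift vectors keep the DC entry in lane 0 and
+ * the AC entry in the remaining lanes (hence the "Switch DC to AC" step
+ * above), so _mm_unpackhi_epi64(x, x) rebroadcasts the AC half once the first
+ * block of 16 coefficients has been handled. Illustrative lane view for
+ * round:
+ *
+ *   round  = { rDC, rAC, rAC, rAC, rAC, rAC, rAC, rAC }   // as loaded
+ *   round' = { rAC, rAC, rAC, rAC, rAC, rAC, rAC, rAC }   // after unpackhi
+ */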
+
+void aom_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m256i big_zero = _mm256_setzero_si256();
+ int index;
+ const int log_scale = 1;
+
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i all_zero;
+ __m128i eob = zero, eob0;
+
+ (void)scan;
+
+  // Set up global values.
+  // The 32x32 transform halves zbin and round.
+ zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ // Shift with rounding.
+ zbin = _mm_add_epi16(zbin, one);
+ zbin = _mm_srli_epi16(zbin, 1);
+ // x86 has no "greater *or equal*" comparison. Subtract 1 from zbin so
+ // it is a strict "greater" comparison.
+ zbin = _mm_sub_epi16(zbin, one);
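+  // Net effect: abs(coeff) is later compared with a strict '>' against
+  //   ((zbin + 1) >> 1) - 1,
+  // which is equivalent to abs(coeff) >= (zbin + 1) >> 1, i.e. a
+  // greater-or-equal test against the halved (rounded-up) zbin.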
+
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ round = _mm_add_epi16(round, one);
+ round = _mm_srli_epi16(round, 1);
+
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_tran_low(coeff_ptr);
+ coeff1 = load_tran_low(coeff_ptr + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC.
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_test_all_zeros(all_zero, all_zero)) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr), big_zero);
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), big_zero);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
+
+ // Reinsert signs.
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+    // Zero out coefficients that did not exceed the zbin threshold.
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr);
+ store_tran_low(qcoeff1, qcoeff_ptr + 8);
+
+ calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero, dqcoeff_ptr,
+ &log_scale);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
+ dqcoeff_ptr + 8, &log_scale);
+
+ eob =
+ scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ }
+
+ // AC only loop.
+ for (index = 16; index < n_coeffs; index += 16) {
+ coeff0 = load_tran_low(coeff_ptr + index);
+ coeff1 = load_tran_low(coeff_ptr + index + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_test_all_zeros(all_zero, all_zero)) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), big_zero);
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), big_zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), big_zero);
+ continue;
+ }
+
+ calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
+ calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
+
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_tran_low(qcoeff0, qcoeff_ptr + index);
+ store_tran_low(qcoeff1, qcoeff_ptr + index + 8);
+
+ calculate_dqcoeff_and_store_log_scale(qcoeff0, dequant, zero,
+ dqcoeff_ptr + index, &log_scale);
+ calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
+ dqcoeff_ptr + index + 8, &log_scale);
+
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
+ zero);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
new file mode 100644
index 0000000000..22f2e696d3
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_avx2.c
@@ -0,0 +1,1441 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/convolve.h"
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_ports/mem.h"
+
+#if defined(__clang__)
+#if (__clang_major__ > 0 && __clang_major__ < 3) || \
+ (__clang_major__ == 3 && __clang_minor__ <= 3) || \
+ (defined(__APPLE__) && defined(__apple_build_version__) && \
+ ((__clang_major__ == 4 && __clang_minor__ <= 2) || \
+ (__clang_major__ == 5 && __clang_minor__ == 0)))
+#define MM256_BROADCASTSI128_SI256(x) \
+ _mm_broadcastsi128_si256((__m128i const *)&(x))
+#else // clang > 3.3, and not 5.0 on macosx.
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#endif // clang <= 3.3
+#elif defined(__GNUC__)
+#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ <= 6)
+#define MM256_BROADCASTSI128_SI256(x) \
+ _mm_broadcastsi128_si256((__m128i const *)&(x))
+#elif __GNUC__ == 4 && __GNUC_MINOR__ == 7
+#define MM256_BROADCASTSI128_SI256(x) _mm_broadcastsi128_si256(x)
+#else // gcc > 4.7
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#endif // gcc <= 4.6
+#else // !(gcc || clang)
+#define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
+#endif // __clang__
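+
+// Whichever branch is taken above, MM256_BROADCASTSI128_SI256(v) replicates a
+// 128-bit value v into both 128-bit lanes of a 256-bit register; the
+// conditionals only select the intrinsic spelling that old clang/gcc releases
+// exposed. For example (filtersReg here stands for the packed 128-bit filter
+// taps used by the kernels below):
+//   const __m256i taps = MM256_BROADCASTSI128_SI256(filtersReg);
+// leaves identical taps in both lanes.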
+
+static INLINE void xx_storeu2_epi32(const uint8_t *output_ptr,
+ const ptrdiff_t stride, const __m256i *a) {
+ *((int *)(output_ptr)) = _mm_cvtsi128_si32(_mm256_castsi256_si128(*a));
+ *((int *)(output_ptr + stride)) =
+ _mm_cvtsi128_si32(_mm256_extracti128_si256(*a, 1));
+}
+
+static INLINE __m256i xx_loadu2_epi64(const void *hi, const void *lo) {
+ __m256i a = _mm256_castsi128_si256(_mm_loadl_epi64((const __m128i *)(lo)));
+ a = _mm256_inserti128_si256(a, _mm_loadl_epi64((const __m128i *)(hi)), 1);
+ return a;
+}
+
+static INLINE void xx_storeu2_epi64(const uint8_t *output_ptr,
+ const ptrdiff_t stride, const __m256i *a) {
+ _mm_storel_epi64((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
+ _mm_storel_epi64((__m128i *)(output_ptr + stride),
+ _mm256_extractf128_si256(*a, 1));
+}
+
+static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) {
+ __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo)));
+ a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1);
+ return a;
+}
+
+static INLINE void xx_store2_mi128(const uint8_t *output_ptr,
+ const ptrdiff_t stride, const __m256i *a) {
+ _mm_store_si128((__m128i *)output_ptr, _mm256_castsi256_si128(*a));
+ _mm_store_si128((__m128i *)(output_ptr + stride),
+ _mm256_extractf128_si256(*a, 1));
+}
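+
+// These xx_loadu2_*/xx_store2_* helpers put two image rows into one 256-bit
+// register: row N in the low 128-bit lane and row N + 1 in the high lane.
+// That layout is why the kernels below consume and produce two output rows
+// per loop iteration. Illustrative use (in, out and stride are placeholder
+// names):
+//
+//   __m256i two_rows = xx_loadu2_mi128(in + stride, in);
+//   // ... filter both rows in parallel ...
+//   xx_store2_mi128(out, stride, &two_rows);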
+
+static void aom_filter_block1d4_h4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+  // convert the 16-bit (short) taps to 8-bit (byte) values and duplicate the
+  // same data in both halves of the 128-bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ firstFilters =
+ _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
+ filt1Reg = _mm256_load_si256((__m256i const *)(filt4_d4_global_avx2));
+
+  // multiply the size of the source and destination stride by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load the 2 strides of source
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+
+ srcRegFilt32b1_1 =
+ _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve result
+ srcRegFilt32b1_1 =
+ _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
+ output_ptr += dst_stride;
+ }
+
+  // if the output height is odd,
+  // process only the last row (4 bytes)
+ if (i > 0) {
+ __m128i srcReg1, srcRegFilt1_1;
+
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+ // filter the source buffer
+ srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt1_1 =
+ _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
+
+ srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
+ // shift by 6 bit each 16 bit
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve result
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+ // save 4 bytes
+ *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
+ }
+}
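+
+// Note on the rounding constants used above: the taps were pre-shifted right
+// by one bit (_mm_srai_epi16(filtersReg, 1)), most likely to keep the 16-bit
+// intermediate sums in range, so the final "+32, >> 6" step is the halved
+// equivalent of the usual "+64, >> 7" convolution rounding:
+//   ((sum / 2) + 32) >> 6  matches  (sum + 64) >> 7
+// up to the truncation introduced by halving odd taps. The same convention is
+// used by the other kernels in this file.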
+
+static void aom_filter_block1d4_h8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt1Reg, filt2Reg;
+ __m256i firstFilters, secondFilters;
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2;
+ __m256i srcReg32b1;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+  // convert the 16-bit (short) taps to 8-bit (byte) values and duplicate the
+  // same data in both halves of the 128-bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the first 32 bits
+ firstFilters = _mm256_shuffle_epi32(filtersReg32, 0);
+ // duplicate only the second 32 bits
+ secondFilters = _mm256_shuffle_epi32(filtersReg32, 0x55);
+
+ filt1Reg = _mm256_load_si256((__m256i const *)filt_d4_global_avx2);
+ filt2Reg = _mm256_load_si256((__m256i const *)(filt_d4_global_avx2 + 32));
+
+  // multiply the size of the source and destination stride by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load the 2 strides of source
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+
+ // filter the source buffer
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters);
+
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
+
+ srcRegFilt32b1_1 =
+ _mm256_hadds_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve result
+ srcRegFilt32b1_1 =
+ _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi32(output_ptr, output_pitch, &srcRegFilt32b1_1);
+ output_ptr += dst_stride;
+ }
+
+  // if the output height is odd,
+  // process only the last row (4 bytes)
+ if (i > 0) {
+ __m128i srcReg1, srcRegFilt1_1;
+ __m128i srcRegFilt2;
+
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+ // filter the source buffer
+ srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt1_1 =
+ _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
+
+ // filter the source buffer
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
+
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+ srcRegFilt1_1 = _mm_hadds_epi16(srcRegFilt1_1, _mm_setzero_si128());
+ // shift by 6 bit each 16 bit
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve result
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+ // save 4 bytes
+ *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt1_1);
+ }
+}
+
+static void aom_filter_block1d8_h4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt2Reg, filt3Reg;
+ __m256i secondFilters, thirdFilters;
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m256i srcReg32b1, filtersReg32;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+  // convert the 16-bit (short) taps to 8-bit (byte) values and duplicate the
+  // same data in both halves of the 128-bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+  // duplicate only the second 16 bits (third and fourth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+
+ filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+
+ // multiply the size of the source and destination stride by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load the 2 strides of source
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+
+ // shrink to 8 bit each 16 bits
+ srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b1_1);
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);
+ output_ptr += dst_stride;
+ }
+
+  // if the output height is odd,
+  // process only the last row (8 bytes)
+ if (i > 0) {
+ __m128i srcReg1, srcRegFilt1_1;
+ __m128i srcRegFilt2, srcRegFilt3;
+
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+ // filter the source buffer
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+ srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(secondFilters));
+ srcRegFilt3 =
+ _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt2, srcRegFilt3);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+ // shrink to 8 bit each 16 bits
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+ // save 8 bytes
+ _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
+ }
+}
+
+static void aom_filter_block1d8_h8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+ __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m256i srcReg32b1;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+  // convert the 16-bit (short) taps to 8-bit (byte) values and duplicate the
+  // same data in both halves of the 128-bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ const __m256i filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the first 16 bits (first and second byte)
+ // across 256 bit register
+ firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and fourth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits (seventh and eighth byte)
+ // across 256 bit register
+ forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
+
+ filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+  // multiply the size of the source and destination stride by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load the 2 strides of source
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve result
+ srcRegFilt32b1_1 =
+ _mm256_packus_epi16(srcRegFilt32b1_1, _mm256_setzero_si256());
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi64(output_ptr, output_pitch, &srcRegFilt32b1_1);
+ output_ptr += dst_stride;
+ }
+
+  // if the output height is odd,
+  // process only the last row (8 bytes)
+ if (i > 0) {
+ __m128i srcReg1, srcRegFilt1_1;
+ __m128i srcRegFilt2, srcRegFilt3;
+
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+ // filter the source buffer
+ srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1_1 =
+ _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+
+ // filter the source buffer
+ srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt3 =
+ _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+ // shrink to 8 bit each 16 bits, the first lane contain the first
+ // convolve result and the second lane contain the second convolve
+ // result
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, _mm_setzero_si128());
+
+ // save 8 bytes
+ _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1_1);
+ }
+}
+
+static void aom_filter_block1d16_h4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt2Reg, filt3Reg;
+ __m256i secondFilters, thirdFilters;
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m256i srcReg32b1, srcReg32b2, filtersReg32;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+  // convert the 16-bit (short) taps to 8-bit (byte) values and duplicate the
+  // same data in both halves of the 128-bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+  // duplicate only the second 16 bits (third and fourth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+
+ filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+
+ // multiply the size of the source and destination stride by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load the 2 strides of source
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+ // reading 2 strides of the next 16 bytes
+ // (part of it was being read by earlier read)
+ srcReg32b2 =
+ xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+ srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);
+
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve result
+ srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
+
+ src_ptr += src_stride;
+
+ xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
+ output_ptr += dst_stride;
+ }
+
+  // if the number of output rows is odd,
+  // process only 16 bytes for the last row
+ if (i > 0) {
+ __m256i srcReg1, srcReg12;
+ __m256i srcRegFilt2, srcRegFilt3, srcRegFilt1_1;
+
+ srcReg1 = _mm256_loadu_si256((const __m256i *)(src_ptr));
+ srcReg12 = _mm256_permute4x64_epi64(srcReg1, 0x94);
+
+ // filter the source buffer
+ srcRegFilt2 = _mm256_shuffle_epi8(srcReg12, filt2Reg);
+ srcRegFilt3 = _mm256_shuffle_epi8(srcReg12, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt2 = _mm256_maddubs_epi16(srcRegFilt2, secondFilters);
+ srcRegFilt3 = _mm256_maddubs_epi16(srcRegFilt3, thirdFilters);
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt2, srcRegFilt3);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt1_1 = _mm256_adds_epi16(srcRegFilt1_1, addFilterReg32);
+ srcRegFilt1_1 = _mm256_srai_epi16(srcRegFilt1_1, 6);
+
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve result
+ srcRegFilt1_1 = _mm256_packus_epi16(srcRegFilt1_1, srcRegFilt1_1);
+ srcRegFilt1_1 = _mm256_permute4x64_epi64(srcRegFilt1_1, 0x8);
+
+ // save 16 bytes
+ _mm_store_si128((__m128i *)output_ptr,
+ _mm256_castsi256_si128(srcRegFilt1_1));
+ }
+}
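Editorial note (illustrative, not from upstream libaom): the masks passed to _mm256_shuffle_epi8 above (0x0302, 0x0504, and so on) pick one adjacent pair of the packed 8-bit taps and repeat it across every 16-bit lane, so a single _mm256_maddubs_epi16 applies that tap pair to every pixel pair. A scalar picture of the broadcast, with a made-up function name:

#include <stdint.h>

/* Editorial sketch: replicate taps (2*pair, 2*pair + 1) of the eight packed
 * 8-bit coefficients into every byte pair of a 32-byte vector, which is what
 * the shuffle-with-pair-mask step above produces. */
static void ref_broadcast_tap_pair(const int8_t taps[8], int pair,
                                   int8_t out[32]) {
  for (int i = 0; i < 32; i += 2) {
    out[i] = taps[2 * pair];
    out[i + 1] = taps[2 * pair + 1];
  }
}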
+
+static void aom_filter_block1d16_h8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
+ __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+ __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m256i srcReg32b1, srcReg32b2, filtersReg32;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+ src_ptr -= 3;
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+  // convert the 16 bit (short) to 8 bit (byte) and duplicate the same data
+  // in both lanes of the 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the first 16 bits (first and second byte)
+ // across 256 bit register
+ firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and fourth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits (seventh and eighth byte)
+ // across 256 bit register
+ forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
+
+ filt1Reg = _mm256_load_si256((__m256i const *)filt_global_avx2);
+ filt2Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt3Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt4Reg = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+  // multiply the size of the source and destination stride by two
+ src_stride = src_pixels_per_line << 1;
+ dst_stride = output_pitch << 1;
+ for (i = output_height; i > 1; i -= 2) {
+ // load the 2 strides of source
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pixels_per_line, src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ __m256i sum23 = _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, sum23);
+
+ // reading 2 strides of the next 16 bytes
+ // (part of it was being read by earlier read)
+ srcReg32b2 =
+ xx_loadu2_mi128(src_ptr + src_pixels_per_line + 8, src_ptr + 8);
+
+ // filter the source buffer
+ srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
+ srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b2_1 = _mm256_adds_epi16(
+ srcRegFilt32b2_1, _mm256_adds_epi16(srcRegFilt32b3, srcRegFilt32b2));
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 6);
+ srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 6);
+
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve result
+ srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
+
+ src_ptr += src_stride;
+
+ xx_store2_mi128(output_ptr, output_pitch, &srcRegFilt32b1_1);
+ output_ptr += dst_stride;
+ }
+
+  // if the number of output rows is odd,
+  // process only 16 bytes for the last row
+ if (i > 0) {
+ __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
+ __m128i srcRegFilt2, srcRegFilt3;
+
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+
+ // filter the source buffer
+ srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1_1 =
+ _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
+
+ // add and saturate the results together
+ srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
+
+ // filter the source buffer
+ srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt3 =
+ _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
+
+ // reading the next 16 bytes
+ // (part of it was being read by earlier read)
+ srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
+
+ // filter the source buffer
+ srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt2_1 =
+ _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
+
+ // add and saturate the results together
+ srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
+
+ // filter the source buffer
+ srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg));
+ srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt3 =
+ _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
+ srcRegFilt2 =
+ _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt2_1 =
+ _mm_adds_epi16(srcRegFilt2_1, _mm_adds_epi16(srcRegFilt3, srcRegFilt2));
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt1_1 =
+ _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 6);
+
+ srcRegFilt2_1 =
+ _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 6);
+
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve result
+ srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
+
+ // save 16 bytes
+ _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
+ }
+}
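Editorial note: per output pixel, the kernel above computes an 8-tap horizontal convolution with the halved taps, rounds by 32, shifts right by 6, and clamps to 8 bits, writing two rows per loop iteration plus an odd tail row. The plain-C model below is an editorial sketch (the name is hypothetical, and the SIMD path additionally saturates intermediate 16-bit sums):

#include <stddef.h>
#include <stdint.h>

/* Editorial sketch of a 16-wide, 8-tap horizontal pass. */
static void ref_block1d16_h8(const uint8_t *src, ptrdiff_t src_pitch,
                             uint8_t *dst, ptrdiff_t dst_pitch,
                             uint32_t height, const int16_t *taps) {
  src -= 3;  /* same left extension as the intrinsic kernels */
  for (uint32_t r = 0; r < height; ++r) {
    for (int c = 0; c < 16; ++c) {
      int32_t sum = 0;
      for (int k = 0; k < 8; ++k) sum += src[c + k] * (taps[k] >> 1);
      sum = (sum + 32) >> 6;
      dst[c] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
    }
    src += src_pitch;
    dst += dst_pitch;
  }
}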
+
+static void aom_filter_block1d8_v4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i filtersReg32, addFilterReg32;
+ __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
+ __m256i srcReg23_34_lo, srcReg45_56_lo;
+ __m256i resReg23_34_lo, resReg45_56_lo;
+ __m256i resReglo, resReg;
+ __m256i secondFilters, thirdFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // convert the 16 bit (short) to 8 bit (byte) and duplicate the
+  // same data in both lanes of the 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+  // duplicate only the second 16 bits (third and fourth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+
+  // multiply the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+ srcReg4x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));
+
+  // keep consecutive loads in the same 256-bit register
+ srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
+
+ srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
+
+ for (i = output_height; i > 1; i -= 2) {
+    // load the next 2 rows of 8 bytes and keep every two
+    // consecutive rows in the same 256-bit register
+ srcReg5x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
+ srcReg45 =
+ _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
+
+ srcReg6x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
+ srcReg56 =
+ _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
+
+ // merge every two consecutive registers
+ srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
+ resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
+
+ // add and saturate the results together
+ resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
+
+ // shift by 6 bit each 16 bit
+ resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
+ resReglo = _mm256_srai_epi16(resReglo, 6);
+
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve result
+ resReg = _mm256_packus_epi16(resReglo, resReglo);
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi64(output_ptr, out_pitch, &resReg);
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ srcReg23_34_lo = srcReg45_56_lo;
+ srcReg4x = srcReg6x;
+ }
+}
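Editorial note: the 4-tap vertical kernels use only the middle four coefficients (indices 2..5) of the 8-tap filter, and they keep the previously interleaved row pairs in registers so each iteration loads just two new rows. A scalar model of one output sample, offered as an editorial sketch (ref_vert4_px is a made-up name; src is assumed to point at the row that corresponds to tap index 2):

#include <stddef.h>
#include <stdint.h>

/* Editorial sketch: one pixel of a 4-tap vertical convolution with the
 * halved middle taps, rounded and clamped like the kernels above. */
static uint8_t ref_vert4_px(const uint8_t *src, ptrdiff_t pitch,
                            const int16_t *taps) {
  int32_t sum = 0;
  for (int k = 0; k < 4; ++k) sum += src[k * pitch] * (taps[2 + k] >> 1);
  sum = (sum + 32) >> 6;
  return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
}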
+
+static void aom_filter_block1d8_v8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32;
+ __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
+ __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
+ __m256i srcReg32b11, srcReg32b12, filtersReg32;
+ __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // convert the 16 bit (short) to 8 bit (byte) and duplicate the
+  // same data in both lanes of the 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the first 16 bits (first and second byte)
+ // across 256 bit register
+ firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and fourth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits (seventh and eighth byte)
+ // across 256 bit register
+ forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
+
+  // multiply the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+  // load 8 bytes from 7 consecutive rows, one src_pitch apart
+ srcReg32b1 = xx_loadu2_epi64(src_ptr + src_pitch, src_ptr);
+ srcReg32b3 =
+ xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+ srcReg32b5 =
+ xx_loadu2_epi64(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
+ srcReg32b7 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
+
+  // keep each pair of consecutive loads in the same 256-bit register
+ srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
+ srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
+ srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
+ // merge every two consecutive registers except the last one
+ srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
+ srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
+ srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
+
+ for (i = output_height; i > 1; i -= 2) {
+    // load the next 2 rows of 8 bytes and keep every two
+    // consecutive rows in the same 256-bit register
+ srcReg32b8 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7)));
+ srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
+ _mm256_castsi256_si128(srcReg32b8), 1);
+ srcReg32b9 = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 8)));
+ srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
+ _mm256_castsi256_si128(srcReg32b9), 1);
+
+ // merge every two consecutive registers
+ // save
+ srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
+ srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
+
+ // add and saturate the results together
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
+ srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
+
+ // add and saturate the results together
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
+ _mm256_adds_epi16(srcReg32b8, srcReg32b12));
+
+ // shift by 6 bit each 16 bit
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
+ srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);
+
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve result
+ srcReg32b1 = _mm256_packus_epi16(srcReg32b10, _mm256_setzero_si256());
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi64(output_ptr, out_pitch, &srcReg32b1);
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ srcReg32b10 = srcReg32b11;
+ srcReg32b11 = srcReg32b2;
+ srcReg32b2 = srcReg32b4;
+ srcReg32b7 = srcReg32b9;
+ }
+ if (i > 0) {
+ __m128i srcRegFilt1, srcRegFilt4, srcRegFilt6, srcRegFilt8;
+    // load the last 8 bytes
+ srcRegFilt8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
+
+ // merge the last 2 results together
+ srcRegFilt4 =
+ _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
+ _mm256_castsi256_si128(firstFilters));
+ srcRegFilt4 =
+ _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
+
+ // add and saturate the results together
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
+ _mm256_castsi256_si128(secondFilters));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
+ _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt1 =
+ _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt1 =
+ _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);
+
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve result
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, _mm_setzero_si128());
+
+ // save 8 bytes
+ _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt1);
+ }
+}
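Editorial note: the 8-tap vertical kernel keeps seven interleaved rows live in registers and, at the end of each iteration, rotates them (srcReg32b10 takes srcReg32b11, and so on) so only two fresh rows are loaded per pair of output rows. The underlying math per output pixel is the full 8-tap column convolution; a scalar model is sketched below (editorial, hypothetical name, no attempt to mimic the register reuse):

#include <stddef.h>
#include <stdint.h>

/* Editorial sketch of an 8-wide, 8-tap vertical pass. */
static void ref_block1d8_v8(const uint8_t *src, ptrdiff_t src_pitch,
                            uint8_t *dst, ptrdiff_t dst_pitch,
                            uint32_t height, const int16_t *taps) {
  for (uint32_t r = 0; r < height; ++r) {
    for (int c = 0; c < 8; ++c) {
      int32_t sum = 0;
      for (int k = 0; k < 8; ++k)
        sum += src[(ptrdiff_t)k * src_pitch + c] * (taps[k] >> 1);
      sum = (sum + 32) >> 6;
      dst[c] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
    }
    src += src_pitch;
    dst += dst_pitch;
  }
}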
+
+static void aom_filter_block1d16_v4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i filtersReg32, addFilterReg32;
+ __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
+ __m256i srcReg23_34_lo, srcReg23_34_hi, srcReg45_56_lo, srcReg45_56_hi;
+ __m256i resReg23_34_lo, resReg23_34_hi, resReg45_56_lo, resReg45_56_hi;
+ __m256i resReglo, resReghi, resReg;
+ __m256i secondFilters, thirdFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // convert the 16 bit (short) to 8 bit (byte) and duplicate the
+  // same data in both lanes of the 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+  // duplicate only the second 16 bits (third and fourth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+
+  // multiply the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg23 = xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+ srcReg4x = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
+
+  // keep consecutive loads in the same 256-bit register
+ srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
+
+ srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
+ srcReg23_34_hi = _mm256_unpackhi_epi8(srcReg23, srcReg34);
+
+ for (i = output_height; i > 1; i -= 2) {
+ // load the last 2 loads of 16 bytes and have every two
+ // consecutive loads in the same 256 bit register
+ srcReg5x = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
+ srcReg45 =
+ _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
+
+ srcReg6x = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
+ srcReg56 =
+ _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
+
+ // merge every two consecutive registers
+ srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
+ srcReg45_56_hi = _mm256_unpackhi_epi8(srcReg45, srcReg56);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReg23_34_lo = _mm256_maddubs_epi16(srcReg23_34_lo, secondFilters);
+ resReg45_56_lo = _mm256_maddubs_epi16(srcReg45_56_lo, thirdFilters);
+
+ // add and saturate the results together
+ resReglo = _mm256_adds_epi16(resReg23_34_lo, resReg45_56_lo);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReg23_34_hi = _mm256_maddubs_epi16(srcReg23_34_hi, secondFilters);
+ resReg45_56_hi = _mm256_maddubs_epi16(srcReg45_56_hi, thirdFilters);
+
+ // add and saturate the results together
+ resReghi = _mm256_adds_epi16(resReg23_34_hi, resReg45_56_hi);
+
+ // shift by 6 bit each 16 bit
+ resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
+ resReghi = _mm256_adds_epi16(resReghi, addFilterReg32);
+ resReglo = _mm256_srai_epi16(resReglo, 6);
+ resReghi = _mm256_srai_epi16(resReghi, 6);
+
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve result
+ resReg = _mm256_packus_epi16(resReglo, resReghi);
+
+ src_ptr += src_stride;
+
+ xx_store2_mi128(output_ptr, out_pitch, &resReg);
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ srcReg23_34_lo = srcReg45_56_lo;
+ srcReg23_34_hi = srcReg45_56_hi;
+ srcReg4x = srcReg6x;
+ }
+}
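Editorial note: in the 16-wide vertical kernels, _mm256_unpacklo_epi8 and _mm256_unpackhi_epi8 interleave two consecutive rows byte by byte, so each 16-bit lane holds the same column from row n and row n+1 and one _mm256_maddubs_epi16 applies two vertical taps per column. A scalar picture of the low-half interleave within one 128-bit lane (editorial sketch, hypothetical name):

#include <stdint.h>

/* Editorial sketch: interleave the low 8 bytes of two rows, as
 * _mm_unpacklo_epi8 (or each lane of _mm256_unpacklo_epi8) does. */
static void ref_interleave_lo8(const uint8_t row_n[16],
                               const uint8_t row_n1[16], uint8_t out[16]) {
  for (int i = 0; i < 8; ++i) {
    out[2 * i] = row_n[i];       /* column i, row n   */
    out[2 * i + 1] = row_n1[i];  /* column i, row n+1 */
  }
}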
+
+static void aom_filter_block1d16_v8_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i addFilterReg32;
+ __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
+ __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
+ __m256i srcReg32b11, srcReg32b12, filtersReg32;
+ __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // convert the 16 bit (short) to 8 bit (byte) and duplicate the
+  // same data in both lanes of the 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ // duplicate only the first 16 bits (first and second byte)
+ // across 256 bit register
+ firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
+  // duplicate only the second 16 bits (third and fourth byte)
+ // across 256 bit register
+ secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 256 bit register
+ thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
+  // duplicate only the fourth 16 bits (seventh and eighth byte)
+ // across 256 bit register
+ forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
+
+  // multiply the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ // load 16 bytes 7 times in stride of src_pitch
+ srcReg32b1 = xx_loadu2_mi128(src_ptr + src_pitch, src_ptr);
+ srcReg32b3 =
+ xx_loadu2_mi128(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+ srcReg32b5 =
+ xx_loadu2_mi128(src_ptr + src_pitch * 5, src_ptr + src_pitch * 4);
+ srcReg32b7 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
+
+  // keep each pair of consecutive loads in the same 256-bit register
+ srcReg32b2 = _mm256_permute2x128_si256(srcReg32b1, srcReg32b3, 0x21);
+ srcReg32b4 = _mm256_permute2x128_si256(srcReg32b3, srcReg32b5, 0x21);
+ srcReg32b6 = _mm256_permute2x128_si256(srcReg32b5, srcReg32b7, 0x21);
+ // merge every two consecutive registers except the last one
+ srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
+ srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);
+
+ // save
+ srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);
+ srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);
+ srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);
+ srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);
+
+ for (i = output_height; i > 1; i -= 2) {
+ // load the last 2 loads of 16 bytes and have every two
+ // consecutive loads in the same 256 bit register
+ srcReg32b8 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
+ srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
+ _mm256_castsi256_si128(srcReg32b8), 1);
+ srcReg32b9 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
+ srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
+ _mm256_castsi256_si128(srcReg32b9), 1);
+
+ // merge every two consecutive registers
+ // save
+ srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
+ srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
+ srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);
+
+ // add and saturate the results together
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
+ srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);
+
+ // add and saturate the results together
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
+ _mm256_adds_epi16(srcReg32b8, srcReg32b12));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
+ srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);
+
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
+ srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);
+
+ // add and saturate the results together
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
+ _mm256_adds_epi16(srcReg32b8, srcReg32b12));
+
+ // shift by 6 bit each 16 bit
+ srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg32);
+ srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg32);
+ srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 6);
+ srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 6);
+
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve result
+ srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);
+
+ src_ptr += src_stride;
+
+ xx_store2_mi128(output_ptr, out_pitch, &srcReg32b1);
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ srcReg32b10 = srcReg32b11;
+ srcReg32b1 = srcReg32b3;
+ srcReg32b11 = srcReg32b2;
+ srcReg32b3 = srcReg32b5;
+ srcReg32b2 = srcReg32b4;
+ srcReg32b5 = srcReg32b7;
+ srcReg32b7 = srcReg32b9;
+ }
+ if (i > 0) {
+ __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
+ __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
+ // load the last 16 bytes
+ srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
+
+ // merge the last 2 results together
+ srcRegFilt4 =
+ _mm_unpacklo_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
+ srcRegFilt7 =
+ _mm_unpackhi_epi8(_mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
+ _mm256_castsi256_si128(firstFilters));
+ srcRegFilt4 =
+ _mm_maddubs_epi16(srcRegFilt4, _mm256_castsi256_si128(forthFilters));
+ srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
+ _mm256_castsi256_si128(firstFilters));
+ srcRegFilt7 =
+ _mm_maddubs_epi16(srcRegFilt7, _mm256_castsi256_si128(forthFilters));
+
+ // add and saturate the results together
+ srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
+ srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
+ _mm256_castsi256_si128(secondFilters));
+ srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
+ _mm256_castsi256_si128(secondFilters));
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
+ _mm256_castsi256_si128(thirdFilters));
+ srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
+ _mm256_castsi256_si128(thirdFilters));
+
+ // add and saturate the results together
+ srcRegFilt1 =
+ _mm_adds_epi16(srcRegFilt1, _mm_adds_epi16(srcRegFilt4, srcRegFilt6));
+ srcRegFilt3 =
+ _mm_adds_epi16(srcRegFilt3, _mm_adds_epi16(srcRegFilt5, srcRegFilt7));
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt1 =
+ _mm_adds_epi16(srcRegFilt1, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt3 =
+ _mm_adds_epi16(srcRegFilt3, _mm256_castsi256_si128(addFilterReg32));
+ srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 6);
+ srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 6);
+
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve result
+ srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);
+
+ // save 16 bytes
+ _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
+ }
+}
+
+static void aom_filter_block1d4_v4_avx2(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m256i filtersReg32, addFilterReg32;
+ __m256i srcReg23, srcReg4x, srcReg34, srcReg5x, srcReg45, srcReg6x, srcReg56;
+ __m256i srcReg23_34_lo, srcReg45_56_lo;
+ __m256i srcReg2345_3456_lo;
+ __m256i resReglo, resReg;
+ __m256i firstFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm256_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // convert the 16 bit (short) to 8 bit (byte) and duplicate the
+  // same data in both lanes of the 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+ // have the same data in both lanes of a 256 bit register
+ filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
+
+ firstFilters =
+ _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi32(0x5040302u));
+
+  // multiply the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg23 = xx_loadu2_epi64(src_ptr + src_pitch * 3, src_ptr + src_pitch * 2);
+ srcReg4x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)));
+
+  // keep consecutive loads in the same 256-bit register
+ srcReg34 = _mm256_permute2x128_si256(srcReg23, srcReg4x, 0x21);
+
+ srcReg23_34_lo = _mm256_unpacklo_epi8(srcReg23, srcReg34);
+
+ for (i = output_height; i > 1; i -= 2) {
+    // load the next 2 rows of 8 bytes and keep every two
+    // consecutive rows in the same 256-bit register
+ srcReg5x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)));
+ srcReg45 =
+ _mm256_inserti128_si256(srcReg4x, _mm256_castsi256_si128(srcReg5x), 1);
+
+ srcReg6x = _mm256_castsi128_si256(
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)));
+ srcReg56 =
+ _mm256_inserti128_si256(srcReg5x, _mm256_castsi256_si128(srcReg6x), 1);
+
+ // merge every two consecutive registers
+ srcReg45_56_lo = _mm256_unpacklo_epi8(srcReg45, srcReg56);
+
+ srcReg2345_3456_lo = _mm256_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReglo = _mm256_maddubs_epi16(srcReg2345_3456_lo, firstFilters);
+
+ resReglo = _mm256_hadds_epi16(resReglo, _mm256_setzero_si256());
+
+ // shift by 6 bit each 16 bit
+ resReglo = _mm256_adds_epi16(resReglo, addFilterReg32);
+ resReglo = _mm256_srai_epi16(resReglo, 6);
+
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve result
+ resReg = _mm256_packus_epi16(resReglo, resReglo);
+
+ src_ptr += src_stride;
+
+ xx_storeu2_epi32(output_ptr, out_pitch, &resReg);
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ srcReg23_34_lo = srcReg45_56_lo;
+ srcReg4x = srcReg6x;
+ }
+}
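Editorial note: the 4-wide vertical kernel goes one step further. The unpacklo_epi16 above places all four source rows for a column in adjacent bytes, so one _mm256_maddubs_epi16 with the four middle taps (mask 0x05040302) produces two partial sums per column and _mm256_hadds_epi16 folds them into the final 4-tap total. Per column the arithmetic is roughly as sketched below (editorial, saturation ignored, hypothetical name):

#include <stdint.h>

/* Editorial sketch: maddubs forms two tap-pair products per column, hadds
 * then adds the adjacent pair to complete the 4-tap vertical sum. */
static int32_t ref_v4_column(const uint8_t px[4], const int8_t taps2to5[4]) {
  int32_t a = px[0] * taps2to5[0] + px[1] * taps2to5[1];  /* maddubs, pair 0 */
  int32_t b = px[2] * taps2to5[2] + px[3] * taps2to5[3];  /* maddubs, pair 1 */
  return a + b;                                           /* hadds           */
}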
+
+#if HAVE_AVX2 && HAVE_SSSE3
+filter8_1dfunction aom_filter_block1d4_v8_ssse3;
+filter8_1dfunction aom_filter_block1d16_v2_ssse3;
+filter8_1dfunction aom_filter_block1d16_h2_ssse3;
+filter8_1dfunction aom_filter_block1d8_v2_ssse3;
+filter8_1dfunction aom_filter_block1d8_h2_ssse3;
+filter8_1dfunction aom_filter_block1d4_v2_ssse3;
+filter8_1dfunction aom_filter_block1d4_h2_ssse3;
+#define aom_filter_block1d4_v8_avx2 aom_filter_block1d4_v8_ssse3
+#define aom_filter_block1d16_v2_avx2 aom_filter_block1d16_v2_ssse3
+#define aom_filter_block1d16_h2_avx2 aom_filter_block1d16_h2_ssse3
+#define aom_filter_block1d8_v2_avx2 aom_filter_block1d8_v2_ssse3
+#define aom_filter_block1d8_h2_avx2 aom_filter_block1d8_h2_ssse3
+#define aom_filter_block1d4_v2_avx2 aom_filter_block1d4_v2_ssse3
+#define aom_filter_block1d4_h2_avx2 aom_filter_block1d4_h2_ssse3
+// void aom_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void aom_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2)
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2)
+
+#endif  // HAVE_AVX2 && HAVE_SSSE3
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c
new file mode 100644
index 0000000000..5c36b68727
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c
@@ -0,0 +1,569 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/x86/convolve.h"
+#include "aom_ports/mem.h"
+
+void aom_filter_block1d16_h4_sse2(const uint8_t *src_ptr,
+ ptrdiff_t src_pixels_per_line,
+ uint8_t *output_ptr, ptrdiff_t output_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
+ __m128i filtersReg;
+ __m128i addFilterReg32;
+ __m128i secondFilters, thirdFilters;
+ __m128i srcRegFilt32b1_1, srcRegFilt32b1_2, srcRegFilt32b2_1,
+ srcRegFilt32b2_2;
+ __m128i srcReg32b1, srcReg32b2;
+ unsigned int i;
+ src_ptr -= 3;
+ addFilterReg32 = _mm_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+ secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3
+ thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5
+
+ for (i = output_height; i > 0; i -= 1) {
+ srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
+
+ __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2);
+ __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4);
+ __m128i ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
+ __m128i ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
+ __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters);
+ __m128i d2 = _mm_madd_epi16(ss_2_1, thirdFilters);
+ srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);
+
+ __m128i ss_1 = _mm_srli_si128(srcReg32b1, 3);
+ __m128i ss_3 = _mm_srli_si128(srcReg32b1, 5);
+ __m128i ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128());
+ __m128i ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
+ d1 = _mm_madd_epi16(ss_1_2, secondFilters);
+ d2 = _mm_madd_epi16(ss_2_2, thirdFilters);
+ srcRegFilt32b1_2 = _mm_add_epi32(d1, d2);
+
+ __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
+ __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
+ srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi);
+
+ // reading stride of the next 16 bytes
+ // (part of it was being read by earlier read)
+ srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
+
+ ss_2 = _mm_srli_si128(srcReg32b2, 2);
+ ss_4 = _mm_srli_si128(srcReg32b2, 4);
+ ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
+ ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
+ d1 = _mm_madd_epi16(ss_1_1, secondFilters);
+ d2 = _mm_madd_epi16(ss_2_1, thirdFilters);
+ srcRegFilt32b2_1 = _mm_add_epi32(d1, d2);
+
+ ss_1 = _mm_srli_si128(srcReg32b2, 3);
+ ss_3 = _mm_srli_si128(srcReg32b2, 5);
+ ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128());
+ ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
+ d1 = _mm_madd_epi16(ss_1_2, secondFilters);
+ d2 = _mm_madd_epi16(ss_2_2, thirdFilters);
+ srcRegFilt32b2_2 = _mm_add_epi32(d1, d2);
+
+ res_lo = _mm_unpacklo_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2);
+ res_hi = _mm_unpackhi_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2);
+ srcRegFilt32b2_1 = _mm_packs_epi32(res_lo, res_hi);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
+ srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6);
+
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve result
+ srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
+
+ src_ptr += src_pixels_per_line;
+
+ _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1);
+
+ output_ptr += output_pitch;
+ }
+}
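Editorial note: the SSE2 versions cannot use _mm_maddubs_epi16 (an SSSE3 instruction), so they widen the source bytes to 16 bits with _mm_unpacklo_epi8 against zero and multiply-accumulate with _mm_madd_epi16, packing the 32-bit sums back down afterwards. A minimal, editorial illustration of that building block (hypothetical helper name):

#include <emmintrin.h>

/* Editorial sketch: zero-extend 8 source bytes to 16 bits, then form
 * 32-bit sums of adjacent 16-bit products, as the SSE2 kernels above do. */
static __m128i ref_pairmul_sse2(__m128i src_bytes, __m128i taps16) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i src16 = _mm_unpacklo_epi8(src_bytes, zero); /* u8 -> u16 */
  return _mm_madd_epi16(src16, taps16); /* 4 x (s0*t0 + s1*t1) as 32 bits */
}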
+
+void aom_filter_block1d16_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+ uint8_t *output_ptr, ptrdiff_t out_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
+ __m128i filtersReg;
+ __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
+ __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
+ __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi;
+ __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
+ __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi;
+ __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi;
+ __m128i resReg23_45, resReg34_56;
+ __m128i addFilterReg32, secondFilters, thirdFilters;
+ __m128i tmp_0, tmp_1;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+ secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3
+ thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5
+
+ // multiply the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+ srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3);
+ srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3);
+ __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128());
+ __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128());
+ __m128i resReg23_hi_1 = _mm_unpacklo_epi8(srcReg23_hi, _mm_setzero_si128());
+ __m128i resReg23_hi_2 = _mm_unpackhi_epi8(srcReg23_hi, _mm_setzero_si128());
+
+ srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
+ srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4);
+ srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4);
+ __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128());
+ __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128());
+ __m128i resReg34_hi_1 = _mm_unpacklo_epi8(srcReg34_hi, _mm_setzero_si128());
+ __m128i resReg34_hi_2 = _mm_unpackhi_epi8(srcReg34_hi, _mm_setzero_si128());
+
+ for (i = output_height; i > 1; i -= 2) {
+ srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
+
+ srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5);
+ srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5);
+
+ srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
+
+ srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6);
+ srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+
+ tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters);
+ tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters);
+ resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1);
+
+ tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters);
+ tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters);
+ resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1);
+
+ __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128());
+ __m128i resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128());
+ tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters);
+ tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters);
+ resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1);
+
+ __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128());
+ __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128());
+ tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters);
+ tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters);
+ resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1);
+
+ // add and saturate the results together
+ resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo);
+ resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo);
+
+ // multiply 2 adjacent elements with the filter and add the result
+
+ tmp_0 = _mm_madd_epi16(resReg23_hi_1, secondFilters);
+ tmp_1 = _mm_madd_epi16(resReg23_hi_2, secondFilters);
+ resReg23_hi = _mm_packs_epi32(tmp_0, tmp_1);
+
+ tmp_0 = _mm_madd_epi16(resReg34_hi_1, secondFilters);
+ tmp_1 = _mm_madd_epi16(resReg34_hi_2, secondFilters);
+ resReg34_hi = _mm_packs_epi32(tmp_0, tmp_1);
+
+ __m128i resReg45_hi_1 = _mm_unpacklo_epi8(srcReg45_hi, _mm_setzero_si128());
+ __m128i resReg45_hi_2 = _mm_unpackhi_epi8(srcReg45_hi, _mm_setzero_si128());
+ tmp_0 = _mm_madd_epi16(resReg45_hi_1, thirdFilters);
+ tmp_1 = _mm_madd_epi16(resReg45_hi_2, thirdFilters);
+ resReg45_hi = _mm_packs_epi32(tmp_0, tmp_1);
+
+ __m128i resReg56_hi_1 = _mm_unpacklo_epi8(srcReg56_hi, _mm_setzero_si128());
+ __m128i resReg56_hi_2 = _mm_unpackhi_epi8(srcReg56_hi, _mm_setzero_si128());
+ tmp_0 = _mm_madd_epi16(resReg56_hi_1, thirdFilters);
+ tmp_1 = _mm_madd_epi16(resReg56_hi_2, thirdFilters);
+ resReg56_hi = _mm_packs_epi32(tmp_0, tmp_1);
+
+ // add and saturate the results together
+ resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi);
+ resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi);
+
+ // shift by 6 bit each 16 bit
+ resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32);
+ resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32);
+ resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32);
+ resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32);
+ resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6);
+ resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6);
+ resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6);
+ resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6);
+
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve result
+ resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi);
+ resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi);
+
+ src_ptr += src_stride;
+
+ _mm_store_si128((__m128i *)output_ptr, (resReg23_45));
+ _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56));
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ resReg23_lo_1 = resReg45_lo_1;
+ resReg23_lo_2 = resReg45_lo_2;
+ resReg23_hi_1 = resReg45_hi_1;
+ resReg23_hi_2 = resReg45_hi_2;
+ resReg34_lo_1 = resReg56_lo_1;
+ resReg34_lo_2 = resReg56_lo_2;
+ resReg34_hi_1 = resReg56_hi_1;
+ resReg34_hi_2 = resReg56_hi_2;
+ srcReg4 = srcReg6;
+ }
+}
+
+void aom_filter_block1d8_h4_sse2(const uint8_t *src_ptr,
+ ptrdiff_t src_pixels_per_line,
+ uint8_t *output_ptr, ptrdiff_t output_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
+ __m128i filtersReg;
+ __m128i addFilterReg32;
+ __m128i secondFilters, thirdFilters;
+ __m128i srcRegFilt32b1_1, srcRegFilt32b1_2;
+ __m128i srcReg32b1;
+ unsigned int i;
+ src_ptr -= 3;
+ addFilterReg32 = _mm_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+ secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3
+ thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5
+
+ for (i = output_height; i > 0; i -= 1) {
+ srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
+
+ __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2);
+ __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4);
+ ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
+ ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
+ __m128i d1 = _mm_madd_epi16(ss_2, secondFilters);
+ __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters);
+ srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);
+
+ __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3);
+ __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5);
+ ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
+ ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128());
+ d1 = _mm_madd_epi16(ss_3, secondFilters);
+ d2 = _mm_madd_epi16(ss_5, thirdFilters);
+ srcRegFilt32b1_2 = _mm_add_epi32(d1, d2);
+
+ __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
+ __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
+ srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
+
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve result
+ srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
+
+ src_ptr += src_pixels_per_line;
+
+ _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1);
+
+ output_ptr += output_pitch;
+ }
+}
+
+void aom_filter_block1d8_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+ uint8_t *output_ptr, ptrdiff_t out_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
+ __m128i filtersReg;
+ __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
+ __m128i srcReg23_lo, srcReg34_lo;
+ __m128i srcReg45_lo, srcReg56_lo;
+ __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
+ __m128i resReg23_45_lo, resReg34_56_lo;
+ __m128i resReg23_45, resReg34_56;
+ __m128i addFilterReg32, secondFilters, thirdFilters;
+ __m128i tmp_0, tmp_1;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+ secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3
+ thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5
+
+ // multiply the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+ srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3);
+ __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128());
+ __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128());
+
+ srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
+ srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4);
+ __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128());
+ __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128());
+
+ for (i = output_height; i > 1; i -= 2) {
+ srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
+ srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5);
+
+ srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
+ srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+
+ tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters);
+ tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters);
+ resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1);
+
+ tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters);
+ tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters);
+ resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1);
+
+ __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128());
+ __m128i resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128());
+ tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters);
+ tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters);
+ resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1);
+
+ __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128());
+ __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128());
+ tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters);
+ tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters);
+ resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1);
+
+ // add and saturate the results together
+ resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo);
+ resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo);
+
+ // shift by 6 bit each 16 bit
+ resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32);
+ resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32);
+ resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6);
+ resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6);
+
+    // shrink each 16-bit value to 8 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve result
+ resReg23_45 = _mm_packus_epi16(resReg23_45_lo, _mm_setzero_si128());
+ resReg34_56 = _mm_packus_epi16(resReg34_56_lo, _mm_setzero_si128());
+
+ src_ptr += src_stride;
+
+ _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45));
+ _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56));
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ resReg23_lo_1 = resReg45_lo_1;
+ resReg23_lo_2 = resReg45_lo_2;
+ resReg34_lo_1 = resReg56_lo_1;
+ resReg34_lo_2 = resReg56_lo_2;
+ srcReg4 = srcReg6;
+ }
+}
+
+void aom_filter_block1d4_h4_sse2(const uint8_t *src_ptr,
+ ptrdiff_t src_pixels_per_line,
+ uint8_t *output_ptr, ptrdiff_t output_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
+ __m128i filtersReg;
+ __m128i addFilterReg32;
+ __m128i secondFilters, thirdFilters;
+ __m128i srcRegFilt32b1_1;
+ __m128i srcReg32b1;
+ unsigned int i;
+ src_ptr -= 3;
+ addFilterReg32 = _mm_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+ secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3
+ thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5
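+
+  // note: this 4-tap path applies only taps 2..5 of the 8-tap kernel; it is
+  // intended for filters whose outer taps are zero (inferred from the taps
+  // actually used below).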
+
+ for (i = output_height; i > 0; i -= 1) {
+ srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
+
+ __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2);
+ __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3);
+ __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4);
+ __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5);
+
+ ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128());
+ ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128());
+ ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128());
+ ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128());
+
+ __m128i ss_1_1 = _mm_unpacklo_epi32(ss_2, ss_3);
+ __m128i ss_1_2 = _mm_unpacklo_epi32(ss_4, ss_5);
+
+ __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters);
+ __m128i d2 = _mm_madd_epi16(ss_1_2, thirdFilters);
+ srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);
+
+ srcRegFilt32b1_1 = _mm_packs_epi32(srcRegFilt32b1_1, _mm_setzero_si128());
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
+
+    // shrink to 8 bit each 16 bits; only the low four bytes are kept
+ srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
+
+ src_ptr += src_pixels_per_line;
+
+ *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1);
+
+ output_ptr += output_pitch;
+ }
+}
+
+void aom_filter_block1d4_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+ uint8_t *output_ptr, ptrdiff_t out_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
+ __m128i filtersReg;
+ __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
+ __m128i srcReg23, srcReg34, srcReg45, srcReg56;
+ __m128i resReg23_34, resReg45_56;
+ __m128i resReg23_34_45_56;
+ __m128i addFilterReg32, secondFilters, thirdFilters;
+ __m128i tmp_0, tmp_1;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+ secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3
+ thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5
+
+ // multiply the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+ srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3);
+ __m128i resReg23 = _mm_unpacklo_epi8(srcReg23, _mm_setzero_si128());
+
+ srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+ srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4);
+ __m128i resReg34 = _mm_unpacklo_epi8(srcReg34, _mm_setzero_si128());
+
+ for (i = output_height; i > 1; i -= 2) {
+ srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+ srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5);
+ srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
+ srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ tmp_0 = _mm_madd_epi16(resReg23, secondFilters);
+ tmp_1 = _mm_madd_epi16(resReg34, secondFilters);
+ resReg23_34 = _mm_packs_epi32(tmp_0, tmp_1);
+
+ __m128i resReg45 = _mm_unpacklo_epi8(srcReg45, _mm_setzero_si128());
+ __m128i resReg56 = _mm_unpacklo_epi8(srcReg56, _mm_setzero_si128());
+
+ tmp_0 = _mm_madd_epi16(resReg45, thirdFilters);
+ tmp_1 = _mm_madd_epi16(resReg56, thirdFilters);
+ resReg45_56 = _mm_packs_epi32(tmp_0, tmp_1);
+
+ // add and saturate the results together
+ resReg23_34_45_56 = _mm_adds_epi16(resReg23_34, resReg45_56);
+
+ // shift by 6 bit each 16 bit
+ resReg23_34_45_56 = _mm_adds_epi16(resReg23_34_45_56, addFilterReg32);
+ resReg23_34_45_56 = _mm_srai_epi16(resReg23_34_45_56, 6);
+
+    // shrink to 8 bit each 16 bits; the low eight bytes hold the two output
+    // rows (four bytes each)
+ resReg23_34_45_56 =
+ _mm_packus_epi16(resReg23_34_45_56, _mm_setzero_si128());
+
+ src_ptr += src_stride;
+
+ *((int *)(output_ptr)) = _mm_cvtsi128_si32(resReg23_34_45_56);
+ *((int *)(output_ptr + out_pitch)) =
+ _mm_cvtsi128_si32(_mm_srli_si128(resReg23_34_45_56, 4));
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ resReg23 = resReg45;
+ resReg34 = resReg56;
+ srcReg4 = srcReg6;
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
new file mode 100644
index 0000000000..245fda1e94
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
@@ -0,0 +1,847 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/convolve.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/convolve_ssse3.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/transpose_sse2.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/emmintrin_compat.h"
+
+DECLARE_ALIGNED(32, static const uint8_t, filt_h4[]) = {
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1,
+ 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5,
+ 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6,
+ 7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
+ 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11,
+ 12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7,
+ 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filtd4[]) = {
+ 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
+ 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
+};
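+
+// Each 32-byte row of filt_h4 gathers adjacent source byte pairs starting at
+// offsets 0, 2, 4 and 6; the 4-tap kernels below use the offset-2 and
+// offset-4 rows for filter taps 2/3 and 4/5. filtd4 gathers the four
+// consecutive source bytes at offsets i+2..i+5 for each output pixel i in
+// the 4-wide horizontal path.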
+
+static void aom_filter_block1d4_h4_ssse3(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m128i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
+ unsigned int i;
+ src_ptr -= 3;
+ addFilterReg32 = _mm_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+  // convert the 16 bit (short) taps to 8 bit (byte), duplicating the same
+  // data in both lanes of the 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+ firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u));
+ filt1Reg = _mm_load_si128((__m128i const *)(filtd4));
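+  // after the pack, filtersReg holds taps k0..k7 as bytes in its low half;
+  // the 0x05040302 shuffle control replicates bytes 2..5, so firstFilters is
+  // [k2 k3 k4 k5] repeated four times (only the middle taps are applied here)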
+
+ for (i = output_height; i > 0; i -= 1) {
+ // load the 2 strides of source
+ srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b1_1 = _mm_shuffle_epi8(srcReg32b1, filt1Reg);
+
+ // multiply 4 adjacent elements with the filter and add the result
+ srcRegFilt32b1_1 = _mm_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
+
+ srcRegFilt32b1_1 = _mm_hadds_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
+
+    // shrink to 8 bit each 16 bits; only the low four bytes are kept
+ srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
+
+ src_ptr += src_pixels_per_line;
+
+ *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1);
+ output_ptr += output_pitch;
+ }
+}
+
+static void aom_filter_block1d4_v4_ssse3(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m128i addFilterReg32;
+ __m128i srcReg2, srcReg3, srcReg23, srcReg4, srcReg34, srcReg5, srcReg45,
+ srcReg6, srcReg56;
+ __m128i srcReg23_34_lo, srcReg45_56_lo;
+ __m128i srcReg2345_3456_lo, srcReg2345_3456_hi;
+ __m128i resReglo, resReghi;
+ __m128i firstFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // convert the 16 bit (short) taps to 8 bit (byte), duplicating the same
+  // data in both lanes of the 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+ firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u));
+
+  // multiply the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+ srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg23 = _mm_unpacklo_epi32(srcReg2, srcReg3);
+
+ srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+
+  // place rows 3 and 4 side by side in the same 128 bit register
+ srcReg34 = _mm_unpacklo_epi32(srcReg3, srcReg4);
+
+ srcReg23_34_lo = _mm_unpacklo_epi8(srcReg23, srcReg34);
+
+ for (i = output_height; i > 1; i -= 2) {
+ srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+ srcReg45 = _mm_unpacklo_epi32(srcReg4, srcReg5);
+
+ srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
+ srcReg56 = _mm_unpacklo_epi32(srcReg5, srcReg6);
+
+ // merge every two consecutive registers
+ srcReg45_56_lo = _mm_unpacklo_epi8(srcReg45, srcReg56);
+
+ srcReg2345_3456_lo = _mm_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo);
+ srcReg2345_3456_hi = _mm_unpackhi_epi16(srcReg23_34_lo, srcReg45_56_lo);
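+    // after this 16 bit interleave each 32 bit group of _lo holds one column
+    // from rows 2..5 and each group of _hi the same column from rows 3..6, so
+    // one maddubs + hadds per register produces a full 4-tap vertical sum for
+    // each pixel of the two output rows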
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReglo = _mm_maddubs_epi16(srcReg2345_3456_lo, firstFilters);
+ resReghi = _mm_maddubs_epi16(srcReg2345_3456_hi, firstFilters);
+
+ resReglo = _mm_hadds_epi16(resReglo, _mm_setzero_si128());
+ resReghi = _mm_hadds_epi16(resReghi, _mm_setzero_si128());
+
+ // shift by 6 bit each 16 bit
+ resReglo = _mm_adds_epi16(resReglo, addFilterReg32);
+ resReghi = _mm_adds_epi16(resReghi, addFilterReg32);
+ resReglo = _mm_srai_epi16(resReglo, 6);
+ resReghi = _mm_srai_epi16(resReghi, 6);
+
+    // shrink to 8 bit each 16 bits; each register holds one output row in
+    // its low four bytes
+ resReglo = _mm_packus_epi16(resReglo, resReglo);
+ resReghi = _mm_packus_epi16(resReghi, resReghi);
+
+ src_ptr += src_stride;
+
+ *((int *)(output_ptr)) = _mm_cvtsi128_si32(resReglo);
+ *((int *)(output_ptr + out_pitch)) = _mm_cvtsi128_si32(resReghi);
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ srcReg23_34_lo = srcReg45_56_lo;
+ srcReg4 = srcReg6;
+ }
+}
+
+static void aom_filter_block1d8_h4_ssse3(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m128i addFilterReg32, filt2Reg, filt3Reg;
+ __m128i secondFilters, thirdFilters;
+ __m128i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m128i srcReg32b1;
+ unsigned int i;
+ src_ptr -= 3;
+ addFilterReg32 = _mm_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+  // convert the 16 bit (short) taps to 8 bit (byte), duplicating the same
+  // data in both lanes of the 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the second 16 bits (third and fourth byte)
+  // across 128 bit register
+ secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+  // duplicate only the third 16 bits (fifth and sixth byte)
+  // across 128 bit register
+ thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+
+ filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32));
+ filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2));
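+  // filt2Reg gathers adjacent source byte pairs starting at offset 2 and
+  // filt3Reg pairs starting at offset 4, matching each output pixel with its
+  // tap 2/3 and tap 4/5 source bytes for the maddubs below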
+
+ for (i = output_height; i > 0; i -= 1) {
+ srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
+
+ // shrink to 8 bit each 16 bits
+ srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
+
+ src_ptr += src_pixels_per_line;
+
+ _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1);
+
+ output_ptr += output_pitch;
+ }
+}
+
+static void aom_filter_block1d8_v4_ssse3(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
+ __m128i srcReg23, srcReg34, srcReg45, srcReg56;
+ __m128i resReg23, resReg34, resReg45, resReg56;
+ __m128i resReg23_45, resReg34_56;
+ __m128i addFilterReg32, secondFilters, thirdFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // convert the 16 bit (short) taps to 8 bit (byte), duplicating the same
+  // data in both lanes of the 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the second 16 bits (third and fourth byte)
+ // across 128 bit register
+ secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 128 bit register
+ thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+
+  // multiply the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+ srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3);
+
+ srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+
+  // interleave rows 3 and 4 in the same 128 bit register
+ srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4);
+
+ for (i = output_height; i > 1; i -= 2) {
+ srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+
+ srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5);
+
+ srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
+
+ srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReg23 = _mm_maddubs_epi16(srcReg23, secondFilters);
+ resReg34 = _mm_maddubs_epi16(srcReg34, secondFilters);
+ resReg45 = _mm_maddubs_epi16(srcReg45, thirdFilters);
+ resReg56 = _mm_maddubs_epi16(srcReg56, thirdFilters);
+
+ // add and saturate the results together
+ resReg23_45 = _mm_adds_epi16(resReg23, resReg45);
+ resReg34_56 = _mm_adds_epi16(resReg34, resReg56);
+
+ // shift by 6 bit each 16 bit
+ resReg23_45 = _mm_adds_epi16(resReg23_45, addFilterReg32);
+ resReg34_56 = _mm_adds_epi16(resReg34_56, addFilterReg32);
+ resReg23_45 = _mm_srai_epi16(resReg23_45, 6);
+ resReg34_56 = _mm_srai_epi16(resReg34_56, 6);
+
+    // shrink to 8 bit each 16 bits; the low half of each register holds one
+    // output row's convolve result
+ resReg23_45 = _mm_packus_epi16(resReg23_45, _mm_setzero_si128());
+ resReg34_56 = _mm_packus_epi16(resReg34_56, _mm_setzero_si128());
+
+ src_ptr += src_stride;
+
+ _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45));
+ _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56));
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ srcReg23 = srcReg45;
+ srcReg34 = srcReg56;
+ srcReg4 = srcReg6;
+ }
+}
+
+static void aom_filter_block1d16_h4_ssse3(
+ const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
+ ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m128i addFilterReg32, filt2Reg, filt3Reg;
+ __m128i secondFilters, thirdFilters;
+ __m128i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
+ __m128i srcReg32b1, srcReg32b2;
+ unsigned int i;
+ src_ptr -= 3;
+ addFilterReg32 = _mm_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+  // convert the 16 bit (short) taps to 8 bit (byte), duplicating the same
+  // data in both lanes of the 128 bit register.
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the second 16 bits (third and fourth byte)
+  // across 128 bit register
+ secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+  // across 128 bit register
+ thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+
+ filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32));
+ filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2));
+
+ for (i = output_height; i > 0; i -= 1) {
+ srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg);
+ srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+ // reading stride of the next 16 bytes
+ // (part of it was being read by earlier read)
+ srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));
+
+ // filter the source buffer
+ srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b2, filt2Reg);
+ srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b2, filt3Reg);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
+ srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);
+
+ // add and saturate the results together
+ srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
+
+ // shift by 6 bit each 16 bit
+ srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
+ srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
+ srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
+ srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6);
+
+    // shrink to 8 bit each 16 bits; the low half contains the first eight
+    // convolve results and the high half the second eight
+ srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
+
+ src_ptr += src_pixels_per_line;
+
+ _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1);
+
+ output_ptr += output_pitch;
+ }
+}
+
+static void aom_filter_block1d16_v4_ssse3(
+ const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
+ ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
+ __m128i filtersReg;
+ __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
+ __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
+ __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi;
+ __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
+ __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi;
+ __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi;
+ __m128i resReg23_45, resReg34_56;
+ __m128i addFilterReg32, secondFilters, thirdFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ addFilterReg32 = _mm_set1_epi16(32);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+  // convert the 16 bit (short) taps to 8 bit (byte), duplicating the same
+  // data in both lanes of the 128 bit register.
+ filtersReg = _mm_srai_epi16(filtersReg, 1);
+ filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
+
+  // duplicate only the second 16 bits (third and fourth byte)
+ // across 128 bit register
+ secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
+ // duplicate only the third 16 bits (fifth and sixth byte)
+ // across 128 bit register
+ thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
+
+  // multiply the size of the source and destination stride by two
+ src_stride = src_pitch << 1;
+ dst_stride = out_pitch << 1;
+
+ srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+ srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3);
+ srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3);
+
+ srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
+
+  // interleave rows 3 and 4 in the same 128 bit register
+ srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4);
+ srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4);
+
+ for (i = output_height; i > 1; i -= 2) {
+ srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
+
+ srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5);
+ srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5);
+
+ srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
+
+ srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6);
+ srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+ resReg23_lo = _mm_maddubs_epi16(srcReg23_lo, secondFilters);
+ resReg34_lo = _mm_maddubs_epi16(srcReg34_lo, secondFilters);
+ resReg45_lo = _mm_maddubs_epi16(srcReg45_lo, thirdFilters);
+ resReg56_lo = _mm_maddubs_epi16(srcReg56_lo, thirdFilters);
+
+ // add and saturate the results together
+ resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo);
+ resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo);
+
+ // multiply 2 adjacent elements with the filter and add the result
+
+ resReg23_hi = _mm_maddubs_epi16(srcReg23_hi, secondFilters);
+ resReg34_hi = _mm_maddubs_epi16(srcReg34_hi, secondFilters);
+ resReg45_hi = _mm_maddubs_epi16(srcReg45_hi, thirdFilters);
+ resReg56_hi = _mm_maddubs_epi16(srcReg56_hi, thirdFilters);
+
+ // add and saturate the results together
+ resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi);
+ resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi);
+
+ // shift by 6 bit each 16 bit
+ resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32);
+ resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32);
+ resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32);
+ resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32);
+ resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6);
+ resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6);
+ resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6);
+ resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6);
+
+    // shrink to 8 bit each 16 bits; each register now holds a full 16 pixel
+    // output row
+ resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi);
+ resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi);
+
+ src_ptr += src_stride;
+
+ _mm_store_si128((__m128i *)output_ptr, (resReg23_45));
+ _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56));
+
+ output_ptr += dst_stride;
+
+ // save part of the registers for next strides
+ srcReg23_lo = srcReg45_lo;
+ srcReg34_lo = srcReg56_lo;
+ srcReg23_hi = srcReg45_hi;
+ srcReg34_hi = srcReg56_hi;
+ srcReg4 = srcReg6;
+ }
+}
+
+static INLINE __m128i shuffle_filter_convolve8_8_ssse3(
+ const __m128i *const s, const int16_t *const filter) {
+ __m128i f[4];
+ shuffle_filter_ssse3(filter, f);
+ return convolve8_8_ssse3(s, f);
+}
+
+static void filter_horiz_w8_ssse3(const uint8_t *const src,
+ const ptrdiff_t src_stride,
+ uint8_t *const dst,
+ const int16_t *const x_filter) {
+ __m128i s[8], ss[4], temp;
+
+ load_8bit_8x8(src, src_stride, s);
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
+ // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73
+ // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75
+ // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77
+ transpose_16bit_4x8(s, ss);
+ temp = shuffle_filter_convolve8_8_ssse3(ss, x_filter);
+ // shrink to 8 bit each 16 bits
+ temp = _mm_packus_epi16(temp, temp);
+ // save only 8 bytes convolve result
+ _mm_storel_epi64((__m128i *)dst, temp);
+}
+
+static void transpose8x8_to_dst(const uint8_t *const src,
+ const ptrdiff_t src_stride, uint8_t *const dst,
+ const ptrdiff_t dst_stride) {
+ __m128i s[8];
+
+ load_8bit_8x8(src, src_stride, s);
+ transpose_8bit_8x8(s, s);
+ store_8bit_8x8(s, dst, dst_stride);
+}
+
+static void scaledconvolve_horiz_w8(const uint8_t *src,
+ const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride,
+ const InterpKernel *const x_filters,
+ const int x0_q4, const int x_step_q4,
+ const int w, const int h) {
+ DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
+ int x, y, z;
+ src -= SUBPEL_TAPS / 2 - 1;
+
+ // This function processes 8x8 areas. The intermediate height is not always
+ // a multiple of 8, so force it to be a multiple of 8 here.
+ y = h + (8 - (h & 0x7));
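+  // e.g. h = 21 is processed as y = 24 rows; rows written past h land in the
+  // extra tail rows of the caller's intermediate buffer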
+
+ do {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; x += 8) {
+ // process 8 src_x steps
+ for (z = 0; z < 8; ++z) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ if (x_q4 & SUBPEL_MASK) {
+ filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter);
+ } else {
+ int i;
+ for (i = 0; i < 8; ++i) {
+ temp[z * 8 + i] = src_x[i * src_stride + 3];
+ }
+ }
+ x_q4 += x_step_q4;
+ }
+
+      // transpose the 8x8 filtered values back to dst
+ transpose8x8_to_dst(temp, 8, dst + x, dst_stride);
+ }
+
+ src += src_stride * 8;
+ dst += dst_stride * 8;
+ } while (y -= 8);
+}
+
+static void filter_horiz_w4_ssse3(const uint8_t *const src,
+ const ptrdiff_t src_stride,
+ uint8_t *const dst,
+ const int16_t *const filter) {
+ __m128i s[4];
+ __m128i temp;
+
+ load_8bit_8x4(src, src_stride, s);
+ transpose_16bit_4x4(s, s);
+
+ temp = shuffle_filter_convolve8_8_ssse3(s, filter);
+ // shrink to 8 bit each 16 bits
+ temp = _mm_packus_epi16(temp, temp);
+ // save only 4 bytes
+ *(int *)dst = _mm_cvtsi128_si32(temp);
+}
+
+static void transpose4x4_to_dst(const uint8_t *const src,
+ const ptrdiff_t src_stride, uint8_t *const dst,
+ const ptrdiff_t dst_stride) {
+ __m128i s[4];
+
+ load_8bit_4x4(src, src_stride, s);
+ s[0] = transpose_8bit_4x4(s);
+ s[1] = _mm_srli_si128(s[0], 4);
+ s[2] = _mm_srli_si128(s[0], 8);
+ s[3] = _mm_srli_si128(s[0], 12);
+ store_8bit_4x4(s, dst, dst_stride);
+}
+
+static void scaledconvolve_horiz_w4(const uint8_t *src,
+ const ptrdiff_t src_stride, uint8_t *dst,
+ const ptrdiff_t dst_stride,
+ const InterpKernel *const x_filters,
+ const int x0_q4, const int x_step_q4,
+ const int w, const int h) {
+ DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
+ int x, y, z;
+ src -= SUBPEL_TAPS / 2 - 1;
+
+ for (y = 0; y < h; y += 4) {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; x += 4) {
+ // process 4 src_x steps
+ for (z = 0; z < 4; ++z) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ if (x_q4 & SUBPEL_MASK) {
+ filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter);
+ } else {
+ int i;
+ for (i = 0; i < 4; ++i) {
+ temp[z * 4 + i] = src_x[i * src_stride + 3];
+ }
+ }
+ x_q4 += x_step_q4;
+ }
+
+      // transpose the 4x4 filtered values back to dst
+ transpose4x4_to_dst(temp, 4, dst + x, dst_stride);
+ }
+
+ src += src_stride * 4;
+ dst += dst_stride * 4;
+ }
+}
+
+static __m128i filter_vert_kernel(const __m128i *const s,
+ const int16_t *const filter) {
+ __m128i ss[4];
+ __m128i temp;
+
+ // 00 10 01 11 02 12 03 13
+ ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ // 20 30 21 31 22 32 23 33
+ ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
+ // 40 50 41 51 42 52 43 53
+ ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
+ // 60 70 61 71 62 72 63 73
+ ss[3] = _mm_unpacklo_epi8(s[6], s[7]);
+
+ temp = shuffle_filter_convolve8_8_ssse3(ss, filter);
+ // shrink to 8 bit each 16 bits
+ return _mm_packus_epi16(temp, temp);
+}
+
+static void filter_vert_w4_ssse3(const uint8_t *const src,
+ const ptrdiff_t src_stride, uint8_t *const dst,
+ const int16_t *const filter) {
+ __m128i s[8];
+ __m128i temp;
+
+ load_8bit_4x8(src, src_stride, s);
+ temp = filter_vert_kernel(s, filter);
+ // save only 4 bytes
+ *(int *)dst = _mm_cvtsi128_si32(temp);
+}
+
+static void scaledconvolve_vert_w4(
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+ const int y0_q4, const int y_step_q4, const int w, const int h) {
+ int y;
+ int y_q4 = y0_q4;
+
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ for (y = 0; y < h; ++y) {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+
+ if (y_q4 & SUBPEL_MASK) {
+ filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
+ } else {
+ memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
+ }
+
+ y_q4 += y_step_q4;
+ }
+}
+
+static void filter_vert_w8_ssse3(const uint8_t *const src,
+ const ptrdiff_t src_stride, uint8_t *const dst,
+ const int16_t *const filter) {
+ __m128i s[8], temp;
+
+ load_8bit_8x8(src, src_stride, s);
+ temp = filter_vert_kernel(s, filter);
+ // save only 8 bytes convolve result
+ _mm_storel_epi64((__m128i *)dst, temp);
+}
+
+static void scaledconvolve_vert_w8(
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+ const int y0_q4, const int y_step_q4, const int w, const int h) {
+ int y;
+ int y_q4 = y0_q4;
+
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ for (y = 0; y < h; ++y) {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ if (y_q4 & SUBPEL_MASK) {
+ filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
+ } else {
+ memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
+ }
+ y_q4 += y_step_q4;
+ }
+}
+
+static void filter_vert_w16_ssse3(const uint8_t *src,
+ const ptrdiff_t src_stride,
+ uint8_t *const dst,
+ const int16_t *const filter, const int w) {
+ int i;
+ __m128i f[4];
+ shuffle_filter_ssse3(filter, f);
+
+ for (i = 0; i < w; i += 16) {
+ __m128i s[8], s_lo[4], s_hi[4], temp_lo, temp_hi;
+
+ loadu_8bit_16x8(src, src_stride, s);
+
+ // merge the result together
+ s_lo[0] = _mm_unpacklo_epi8(s[0], s[1]);
+ s_hi[0] = _mm_unpackhi_epi8(s[0], s[1]);
+ s_lo[1] = _mm_unpacklo_epi8(s[2], s[3]);
+ s_hi[1] = _mm_unpackhi_epi8(s[2], s[3]);
+ s_lo[2] = _mm_unpacklo_epi8(s[4], s[5]);
+ s_hi[2] = _mm_unpackhi_epi8(s[4], s[5]);
+ s_lo[3] = _mm_unpacklo_epi8(s[6], s[7]);
+ s_hi[3] = _mm_unpackhi_epi8(s[6], s[7]);
+ temp_lo = convolve8_8_ssse3(s_lo, f);
+ temp_hi = convolve8_8_ssse3(s_hi, f);
+
+    // shrink to 8 bit each 16 bits; the low half holds the first eight
+    // results and the high half the next eight
+ temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
+ src += 16;
+ // save 16 bytes convolve result
+ _mm_store_si128((__m128i *)&dst[i], temp_hi);
+ }
+}
+
+static void scaledconvolve_vert_w16(
+ const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
+ const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
+ const int y0_q4, const int y_step_q4, const int w, const int h) {
+ int y;
+ int y_q4 = y0_q4;
+
+ src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+ for (y = 0; y < h; ++y) {
+ const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+ const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+ if (y_q4 & SUBPEL_MASK) {
+ filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter,
+ w);
+ } else {
+ memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
+ }
+ y_q4 += y_step_q4;
+ }
+}
+
+void aom_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+ ptrdiff_t dst_stride, const InterpKernel *filter,
+ int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+ // 2d filtering proceeds in 2 steps:
+ // (1) Interpolate horizontally into an intermediate buffer, temp.
+ // (2) Interpolate temp vertically to derive the sub-pixel result.
+ // Deriving the maximum number of rows in the temp buffer (135):
+ // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+ // --Largest block size is 64x64 pixels.
+ // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+ // original frame (in 1/16th pixel units).
+ // --Must round-up because block may be located at sub-pixel position.
+ // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+ // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+ // --Require an additional 8 rows for the horiz_w8 transpose tail.
+  // When called from the frame scaling function, the smallest scaling factor
+  // is x1/4 ==> y_step_q4 = 64. Since w and h are at most 16 there, the temp
+  // buffer is still big enough.
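+  // For example, y_step_q4 = 64 with h = 32 (the limit the asserts below
+  // allow) and y0_q4 = 15 gives ((31 * 64 + 15) >> 4) + 8 = 132 rows, which
+  // still fits in the 135 + 8 rows reserved here.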
+ DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
+ const int intermediate_height =
+ (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+ assert(w <= 64);
+ assert(h <= 64);
+ assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
+ assert(x_step_q4 <= 64);
+
+ if (w >= 8) {
+ scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+ src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+ intermediate_height);
+ } else {
+ scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+ src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
+ intermediate_height);
+ }
+
+ if (w >= 16) {
+ scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+ dst_stride, filter, y0_q4, y_step_q4, w, h);
+ } else if (w == 8) {
+ scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+ dst_stride, filter, y0_q4, y_step_q4, w, h);
+ } else {
+ scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
+ dst_stride, filter, y0_q4, y_step_q4, w, h);
+ }
+}
+
+filter8_1dfunction aom_filter_block1d16_v8_ssse3;
+filter8_1dfunction aom_filter_block1d16_h8_ssse3;
+filter8_1dfunction aom_filter_block1d8_v8_ssse3;
+filter8_1dfunction aom_filter_block1d8_h8_ssse3;
+filter8_1dfunction aom_filter_block1d4_v8_ssse3;
+filter8_1dfunction aom_filter_block1d4_h8_ssse3;
+
+filter8_1dfunction aom_filter_block1d16_v2_ssse3;
+filter8_1dfunction aom_filter_block1d16_h2_ssse3;
+filter8_1dfunction aom_filter_block1d8_v2_ssse3;
+filter8_1dfunction aom_filter_block1d8_h2_ssse3;
+filter8_1dfunction aom_filter_block1d4_v2_ssse3;
+filter8_1dfunction aom_filter_block1d4_h2_ssse3;
+
+// void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3)
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3)
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm
new file mode 100644
index 0000000000..640c5b2416
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_sse2.asm
@@ -0,0 +1,615 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+%include "aom_ports/x86_abi_support.asm"
+
+;Note: tap3 and tap4 have to be applied and added after other taps to avoid
+;overflow.
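+;(APPLY_FILTER_4 and APPLY_FILTER_8 below therefore add the k3 and k4 partial
+;products as the last paddsw operands.)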
+
+%macro GET_FILTERS_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm7, [rdx] ;load filters
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ psrldq xmm7, 8
+ pshuflw xmm4, xmm7, 0b ;k4
+ pshuflw xmm5, xmm7, 01010101b ;k5
+ pshuflw xmm6, xmm7, 10101010b ;k6
+ pshuflw xmm7, xmm7, 11111111b ;k7
+
+ punpcklqdq xmm0, xmm1
+ punpcklqdq xmm2, xmm3
+ punpcklqdq xmm5, xmm4
+ punpcklqdq xmm6, xmm7
+
+ movdqa k0k1, xmm0
+ movdqa k2k3, xmm2
+ movdqa k5k4, xmm5
+ movdqa k6k7, xmm6
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0
+ movdqa krd, xmm6
+
+ pxor xmm7, xmm7
+ movdqa zero, xmm7
+%endm
+
+%macro APPLY_FILTER_4 1
+ punpckldq xmm0, xmm1 ;two row in one register
+ punpckldq xmm6, xmm7
+ punpckldq xmm2, xmm3
+ punpckldq xmm5, xmm4
+
+ punpcklbw xmm0, zero ;unpack to word
+ punpcklbw xmm6, zero
+ punpcklbw xmm2, zero
+ punpcklbw xmm5, zero
+
+ pmullw xmm0, k0k1 ;multiply the filter factors
+ pmullw xmm6, k6k7
+ pmullw xmm2, k2k3
+ pmullw xmm5, k5k4
+
+ paddsw xmm0, xmm6 ;sum
+ movdqa xmm1, xmm0
+ psrldq xmm1, 8
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm2
+ psrldq xmm2, 8
+ paddsw xmm0, xmm5
+ psrldq xmm5, 8
+ paddsw xmm0, xmm2
+ paddsw xmm0, xmm5
+
+ paddsw xmm0, krd ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack to byte
+
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movd [rdi], xmm0
+%endm
+
+%macro GET_FILTERS 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm7, [rdx] ;load filters
+ pshuflw xmm0, xmm7, 0b ;k0
+ pshuflw xmm1, xmm7, 01010101b ;k1
+ pshuflw xmm2, xmm7, 10101010b ;k2
+ pshuflw xmm3, xmm7, 11111111b ;k3
+ pshufhw xmm4, xmm7, 0b ;k4
+ pshufhw xmm5, xmm7, 01010101b ;k5
+ pshufhw xmm6, xmm7, 10101010b ;k6
+ pshufhw xmm7, xmm7, 11111111b ;k7
+
+ punpcklwd xmm0, xmm0
+ punpcklwd xmm1, xmm1
+ punpcklwd xmm2, xmm2
+ punpcklwd xmm3, xmm3
+ punpckhwd xmm4, xmm4
+ punpckhwd xmm5, xmm5
+ punpckhwd xmm6, xmm6
+ punpckhwd xmm7, xmm7
+
+ movdqa k0, xmm0 ;store filter factors on stack
+ movdqa k1, xmm1
+ movdqa k2, xmm2
+ movdqa k3, xmm3
+ movdqa k4, xmm4
+ movdqa k5, xmm5
+ movdqa k6, xmm6
+ movdqa k7, xmm7
+
+ movq xmm6, rcx
+ pshufd xmm6, xmm6, 0
+ movdqa krd, xmm6 ;rounding
+
+ pxor xmm7, xmm7
+ movdqa zero, xmm7
+%endm
+
+%macro LOAD_VERT_8 1
+ movq xmm0, [rsi + %1] ;0
+ movq xmm1, [rsi + rax + %1] ;1
+ movq xmm6, [rsi + rdx * 2 + %1] ;6
+ lea rsi, [rsi + rax]
+ movq xmm7, [rsi + rdx * 2 + %1] ;7
+ movq xmm2, [rsi + rax + %1] ;2
+ movq xmm3, [rsi + rax * 2 + %1] ;3
+ movq xmm4, [rsi + rdx + %1] ;4
+ movq xmm5, [rsi + rax * 4 + %1] ;5
+%endm
+
+%macro APPLY_FILTER_8 2
+ punpcklbw xmm0, zero
+ punpcklbw xmm1, zero
+ punpcklbw xmm6, zero
+ punpcklbw xmm7, zero
+ punpcklbw xmm2, zero
+ punpcklbw xmm5, zero
+ punpcklbw xmm3, zero
+ punpcklbw xmm4, zero
+
+ pmullw xmm0, k0
+ pmullw xmm1, k1
+ pmullw xmm6, k6
+ pmullw xmm7, k7
+ pmullw xmm2, k2
+ pmullw xmm5, k5
+ pmullw xmm3, k3
+ pmullw xmm4, k4
+
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm6
+ paddsw xmm0, xmm7
+ paddsw xmm0, xmm2
+ paddsw xmm0, xmm5
+ paddsw xmm0, xmm3
+ paddsw xmm0, xmm4
+
+ paddsw xmm0, krd ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack back to byte
+%if %1
+ movq xmm1, [rdi + %2]
+ pavgb xmm0, xmm1
+%endif
+ movq [rdi + %2], xmm0
+%endm
+
+SECTION .text
+
+;void aom_filter_block1d4_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+globalsym(aom_filter_block1d4_v8_sse2)
+sym(aom_filter_block1d4_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movd xmm0, [rsi] ;load src: row 0
+ movd xmm1, [rsi + rax] ;1
+ movd xmm6, [rsi + rdx * 2] ;6
+ lea rsi, [rsi + rax]
+ movd xmm7, [rsi + rdx * 2] ;7
+ movd xmm2, [rsi + rax] ;2
+ movd xmm3, [rsi + rax * 2] ;3
+ movd xmm4, [rsi + rdx] ;4
+ movd xmm5, [rsi + rax * 4] ;5
+
+ APPLY_FILTER_4 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_filter_block1d8_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+globalsym(aom_filter_block1d8_v8_sse2)
+sym(aom_filter_block1d8_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ APPLY_FILTER_8 0, 0
+
+ lea rdi, [rdi + rbx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_filter_block1d16_v8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+globalsym(aom_filter_block1d16_v8_sse2)
+sym(aom_filter_block1d16_v8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rbx, DWORD PTR arg(3) ;out_pitch
+ lea rdx, [rax + rax * 2]
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ LOAD_VERT_8 0
+ APPLY_FILTER_8 0, 0
+ sub rsi, rax
+
+ LOAD_VERT_8 8
+ APPLY_FILTER_8 0, 8
+ add rdi, rbx
+
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_filter_block1d4_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+globalsym(aom_filter_block1d4_h8_sse2)
+sym(aom_filter_block1d4_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 6
+ %define k0k1 [rsp + 16 * 0]
+ %define k2k3 [rsp + 16 * 1]
+ %define k5k4 [rsp + 16 * 2]
+ %define k6k7 [rsp + 16 * 3]
+ %define krd [rsp + 16 * 4]
+ %define zero [rsp + 16 * 5]
+
+ GET_FILTERS_4
+
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm3, 3
+ psrldq xmm5, 5
+ psrldq xmm4, 4
+
+ APPLY_FILTER_4 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 6
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_filter_block1d8_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+globalsym(aom_filter_block1d8_h8_sse2)
+sym(aom_filter_block1d8_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 0
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_filter_block1d16_h8_sse2
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+globalsym(aom_filter_block1d16_h8_sse2)
+sym(aom_filter_block1d16_h8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16 * 10
+ %define k0 [rsp + 16 * 0]
+ %define k1 [rsp + 16 * 1]
+ %define k2 [rsp + 16 * 2]
+ %define k3 [rsp + 16 * 3]
+ %define k4 [rsp + 16 * 4]
+ %define k5 [rsp + 16 * 5]
+ %define k6 [rsp + 16 * 6]
+ %define k7 [rsp + 16 * 7]
+ %define krd [rsp + 16 * 8]
+ %define zero [rsp + 16 * 9]
+
+ GET_FILTERS
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+
+.loop:
+ movdqu xmm0, [rsi - 3] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 0
+
+ movdqu xmm0, [rsi + 5] ;load src
+
+ movdqa xmm1, xmm0
+ movdqa xmm6, xmm0
+ movdqa xmm7, xmm0
+ movdqa xmm2, xmm0
+ movdqa xmm5, xmm0
+ movdqa xmm3, xmm0
+ movdqa xmm4, xmm0
+
+ psrldq xmm1, 1
+ psrldq xmm6, 6
+ psrldq xmm7, 7
+ psrldq xmm2, 2
+ psrldq xmm5, 5
+ psrldq xmm3, 3
+ psrldq xmm4, 4
+
+ APPLY_FILTER_8 0, 8
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+
+ add rsp, 16 * 10
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
new file mode 100644
index 0000000000..e5fafb0302
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
@@ -0,0 +1,870 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_64: times 8 dw 64
+even_byte_mask: times 8 dw 0x00ff
+
+; %define USE_PMULHRSW
+; NOTE: pmulhrsw has a latency of 5 cycles. Tests showed a performance loss
+; when using this instruction.
+;
+; The add order below (based on ffav1) must be followed to keep the
+; intermediate sums from going out of range.
+; x = k0k1 + k4k5
+; y = k2k3 + k6k7
+; z = signed SAT(x + y)
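+; Grouping the terms this way adds each partial sum that contains a large
+; centre tap (k3 or k4) to one built only from small outer taps, so the
+; intermediate sums are kept in range and only the final saturating add can
+; clip.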
+
+SECTION .text
+%define LOCAL_VARS_SIZE 16*6
+
+%macro SETUP_LOCAL_VARS 0
+ ; TODO(slavarnway): using xmm registers for these on AOM_ARCH_X86_64 +
+ ; pmaddubsw has a higher latency on some platforms, this might be eased by
+ ; interleaving the instructions.
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ packsswb m4, m4
+ ; TODO(slavarnway): multiple pshufb instructions had a higher latency on
+ ; some platforms.
+ pshuflw m0, m4, 0b ;k0_k1
+ pshuflw m1, m4, 01010101b ;k2_k3
+ pshuflw m2, m4, 10101010b ;k4_k5
+ pshuflw m3, m4, 11111111b ;k6_k7
+ punpcklqdq m0, m0
+ punpcklqdq m1, m1
+ punpcklqdq m2, m2
+ punpcklqdq m3, m3
+ mova k0k1, m0
+ mova k2k3, m1
+ mova k4k5, m2
+ mova k6k7, m3
+%if AOM_ARCH_X86_64
+ %define krd m12
+ %define tmp0 [rsp + 16*4]
+ %define tmp1 [rsp + 16*5]
+ mova krd, [GLOBAL(pw_64)]
+%else
+ %define krd [rsp + 16*4]
+%if CONFIG_PIC=0
+ mova m6, [GLOBAL(pw_64)]
+%else
+ ; build constants without accessing global memory
+ pcmpeqb m6, m6 ;all ones
+ psrlw m6, 15
+ psllw m6, 6 ;aka pw_64
+%endif
+ mova krd, m6
+%endif
+%endm
+
+;-------------------------------------------------------------------------------
+%if AOM_ARCH_X86_64
+ %define LOCAL_VARS_SIZE_H4 0
+%else
+ %define LOCAL_VARS_SIZE_H4 16*4
+%endif
+
+%macro SUBPIX_HFILTER4 1
+cglobal filter_block1d4_%1, 6, 6, 11, LOCAL_VARS_SIZE_H4, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq]
+ packsswb m4, m4
+%if AOM_ARCH_X86_64
+ %define k0k1k4k5 m8
+ %define k2k3k6k7 m9
+ %define krd m10
+ mova krd, [GLOBAL(pw_64)]
+ pshuflw k0k1k4k5, m4, 0b ;k0_k1
+ pshufhw k0k1k4k5, k0k1k4k5, 10101010b ;k0_k1_k4_k5
+ pshuflw k2k3k6k7, m4, 01010101b ;k2_k3
+ pshufhw k2k3k6k7, k2k3k6k7, 11111111b ;k2_k3_k6_k7
+%else
+ %define k0k1k4k5 [rsp + 16*0]
+ %define k2k3k6k7 [rsp + 16*1]
+ %define krd [rsp + 16*2]
+ pshuflw m6, m4, 0b ;k0_k1
+ pshufhw m6, m6, 10101010b ;k0_k1_k4_k5
+ pshuflw m7, m4, 01010101b ;k2_k3
+ pshufhw m7, m7, 11111111b ;k2_k3_k6_k7
+%if CONFIG_PIC=0
+ mova m1, [GLOBAL(pw_64)]
+%else
+ ; build constants without accessing global memory
+ pcmpeqb m1, m1 ;all ones
+ psrlw m1, 15
+ psllw m1, 6 ;aka pw_64
+%endif
+ mova k0k1k4k5, m6
+ mova k2k3k6k7, m7
+ mova krd, m1
+%endif
+ dec heightd
+
+.loop:
+ ;Do two rows at once
+ movu m4, [srcq - 3]
+ movu m5, [srcq + sstrideq - 3]
+ punpckhbw m1, m4, m4
+ punpcklbw m4, m4
+ punpckhbw m3, m5, m5
+ punpcklbw m5, m5
+ palignr m0, m1, m4, 1
+ pmaddubsw m0, k0k1k4k5
+ palignr m1, m4, 5
+ pmaddubsw m1, k2k3k6k7
+ palignr m2, m3, m5, 1
+ pmaddubsw m2, k0k1k4k5
+ palignr m3, m5, 5
+ pmaddubsw m3, k2k3k6k7
+ punpckhqdq m4, m0, m2
+ punpcklqdq m0, m2
+ punpckhqdq m5, m1, m3
+ punpcklqdq m1, m3
+ paddsw m0, m4
+ paddsw m1, m5
+%ifidn %1, h8_avg
+ movd m4, [dstq]
+ movd m5, [dstq + dstrideq]
+%endif
+ paddsw m0, m1
+ paddsw m0, krd
+ psraw m0, 7
+%ifidn %1, h8_add_src
+ pxor m3, m3
+ movu m4, [srcq]
+ movu m5, [srcq + sstrideq]
+  punpckldq m4, m5              ; Bytes 0,1,2,3 from row 0, then 0,1,2,3 from the next row
+ punpcklbw m4, m3
+ paddsw m0, m4
+%endif
+ packuswb m0, m0
+ psrldq m1, m0, 4
+
+%ifidn %1, h8_avg
+ pavgb m0, m4
+ pavgb m1, m5
+%endif
+ movd [dstq], m0
+ movd [dstq + dstrideq], m1
+
+ lea srcq, [srcq + sstrideq ]
+ prefetcht0 [srcq + 4 * sstrideq - 3]
+ lea srcq, [srcq + sstrideq ]
+ lea dstq, [dstq + 2 * dstrideq ]
+ prefetcht0 [srcq + 2 * sstrideq - 3]
+
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movu m4, [srcq - 3]
+ punpckhbw m1, m4, m4
+ punpcklbw m4, m4
+ palignr m0, m1, m4, 1
+ palignr m1, m4, 5
+ pmaddubsw m0, k0k1k4k5
+ pmaddubsw m1, k2k3k6k7
+ psrldq m2, m0, 8
+ psrldq m3, m1, 8
+ paddsw m0, m2
+ paddsw m1, m3
+ paddsw m0, m1
+ paddsw m0, krd
+ psraw m0, 7
+%ifidn %1, h8_add_src
+ pxor m3, m3
+ movu m4, [srcq]
+ punpcklbw m4, m3
+ paddsw m0, m4
+%endif
+ packuswb m0, m0
+%ifidn %1, h8_avg
+ movd m4, [dstq]
+ pavgb m0, m4
+%endif
+ movd [dstq], m0
+.done:
+ REP_RET
+%endm
+
+;-------------------------------------------------------------------------------
+%macro SUBPIX_HFILTER8 1
+cglobal filter_block1d8_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq]
+ SETUP_LOCAL_VARS
+ dec heightd
+
+.loop:
+ ;Do two rows at once
+ movu m0, [srcq - 3]
+ movu m4, [srcq + sstrideq - 3]
+ punpckhbw m1, m0, m0
+ punpcklbw m0, m0
+ palignr m5, m1, m0, 13
+ pmaddubsw m5, k6k7
+ palignr m2, m1, m0, 5
+ palignr m3, m1, m0, 9
+ palignr m1, m0, 1
+ pmaddubsw m1, k0k1
+ punpckhbw m6, m4, m4
+ punpcklbw m4, m4
+ pmaddubsw m2, k2k3
+ pmaddubsw m3, k4k5
+
+ palignr m7, m6, m4, 13
+ palignr m0, m6, m4, 5
+ pmaddubsw m7, k6k7
+ paddsw m1, m3
+ paddsw m2, m5
+ paddsw m1, m2
+%ifidn %1, h8_avg
+ movh m2, [dstq]
+ movhps m2, [dstq + dstrideq]
+%endif
+ palignr m5, m6, m4, 9
+ palignr m6, m4, 1
+ pmaddubsw m0, k2k3
+ pmaddubsw m6, k0k1
+ paddsw m1, krd
+ pmaddubsw m5, k4k5
+ psraw m1, 7
+ paddsw m0, m7
+ paddsw m6, m5
+ paddsw m6, m0
+ paddsw m6, krd
+ psraw m6, 7
+%ifidn %1, h8_add_src
+ pxor m3, m3
+ movu m4, [srcq]
+ movu m5, [srcq + sstrideq]
+ punpcklbw m4, m3
+ punpcklbw m5, m3
+ paddsw m1, m4
+ paddsw m6, m5
+%endif
+ packuswb m1, m6
+%ifidn %1, h8_avg
+ pavgb m1, m2
+%endif
+ movh [dstq], m1
+ movhps [dstq + dstrideq], m1
+
+ lea srcq, [srcq + sstrideq ]
+ prefetcht0 [srcq + 4 * sstrideq - 3]
+ lea srcq, [srcq + sstrideq ]
+ lea dstq, [dstq + 2 * dstrideq ]
+ prefetcht0 [srcq + 2 * sstrideq - 3]
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movu m0, [srcq - 3]
+ punpckhbw m3, m0, m0
+ punpcklbw m0, m0
+ palignr m1, m3, m0, 1
+ palignr m2, m3, m0, 5
+ palignr m4, m3, m0, 13
+ palignr m3, m0, 9
+ pmaddubsw m1, k0k1
+ pmaddubsw m2, k2k3
+ pmaddubsw m3, k4k5
+ pmaddubsw m4, k6k7
+ paddsw m1, m3
+ paddsw m4, m2
+ paddsw m1, m4
+ paddsw m1, krd
+ psraw m1, 7
+%ifidn %1, h8_add_src
+ pxor m6, m6
+ movu m5, [srcq]
+ punpcklbw m5, m6
+ paddsw m1, m5
+%endif
+ packuswb m1, m1
+%ifidn %1, h8_avg
+ movh m0, [dstq]
+ pavgb m1, m0
+%endif
+ movh [dstq], m1
+.done:
+ REP_RET
+%endm
+
+;-------------------------------------------------------------------------------
+%macro SUBPIX_HFILTER16 1
+cglobal filter_block1d16_%1, 6, 6, 14, LOCAL_VARS_SIZE, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq]
+ SETUP_LOCAL_VARS
+
+.loop:
+ prefetcht0 [srcq + 2 * sstrideq -3]
+
+ movu m0, [srcq - 3]
+ movu m4, [srcq - 2]
+ pmaddubsw m0, k0k1
+ pmaddubsw m4, k0k1
+ movu m1, [srcq - 1]
+ movu m5, [srcq + 0]
+ pmaddubsw m1, k2k3
+ pmaddubsw m5, k2k3
+ movu m2, [srcq + 1]
+ movu m6, [srcq + 2]
+ pmaddubsw m2, k4k5
+ pmaddubsw m6, k4k5
+ movu m3, [srcq + 3]
+ movu m7, [srcq + 4]
+ pmaddubsw m3, k6k7
+ pmaddubsw m7, k6k7
+ paddsw m0, m2
+ paddsw m1, m3
+ paddsw m0, m1
+ paddsw m4, m6
+ paddsw m5, m7
+ paddsw m4, m5
+ paddsw m0, krd
+ paddsw m4, krd
+ psraw m0, 7
+ psraw m4, 7
+%ifidn %1, h8_add_src
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+ pcmpeqb m2, m2 ;all ones
+ psrlw m2, 8 ;even_byte_mask
+%else
+ mova m2, [GLOBAL(even_byte_mask)]
+%endif
+ movu m5, [srcq]
+ mova m7, m5
+ pand m5, m2
+ psrlw m7, 8
+ paddsw m0, m5
+ paddsw m4, m7
+%endif
+ packuswb m0, m0
+ packuswb m4, m4
+ punpcklbw m0, m4
+%ifidn %1, h8_avg
+ pavgb m0, [dstq]
+%endif
+ lea srcq, [srcq + sstrideq]
+ mova [dstq], m0
+ lea dstq, [dstq + dstrideq]
+ dec heightd
+ jnz .loop
+ REP_RET
+%endm
+
+INIT_XMM ssse3
+SUBPIX_HFILTER16 h8
+SUBPIX_HFILTER8 h8
+SUBPIX_HFILTER4 h8
+
+;-------------------------------------------------------------------------------
+
+; TODO(Linfeng): Detect cpu type and choose the code with better performance.
+%define X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON 1
+
+%if AOM_ARCH_X86_64 && X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
+ %define NUM_GENERAL_REG_USED 9
+%else
+ %define NUM_GENERAL_REG_USED 6
+%endif
+
+%macro SUBPIX_VFILTER 2
+cglobal filter_block1d%2_%1, 6, NUM_GENERAL_REG_USED, 15, LOCAL_VARS_SIZE, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq]
+ SETUP_LOCAL_VARS
+
+%ifidn %2, 8
+ %define movx movh
+%else
+ %define movx movd
+%endif
+
+ dec heightd
+
+%if AOM_ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
+
+%if AOM_ARCH_X86_64
+ %define src1q r7
+ %define sstride6q r8
+ %define dst_stride dstrideq
+%else
+ %define src1q filterq
+ %define sstride6q dstrideq
+ %define dst_stride dstridemp
+%endif
+ mov src1q, srcq
+ add src1q, sstrideq
+ lea sstride6q, [sstrideq + sstrideq * 4]
+ add sstride6q, sstrideq ;pitch * 6
+
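+  ; Vertical 8-tap filter: srcq and src1q (srcq + pitch) walk the same column
+  ; one row apart, so consecutive rows A..H can be paired with punpcklbw and
+  ; fed to pmaddubsw two taps at a time. Each iteration shares its loads
+  ; between the two output rows it produces.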
+.loop:
+ ;Do two rows at once
+ movx m0, [srcq ] ;A
+ movx m1, [src1q ] ;B
+ punpcklbw m0, m1 ;A B
+ movx m2, [srcq + sstrideq * 2 ] ;C
+ pmaddubsw m0, k0k1
+ mova m6, m2
+ movx m3, [src1q + sstrideq * 2] ;D
+ punpcklbw m2, m3 ;C D
+ pmaddubsw m2, k2k3
+ movx m4, [srcq + sstrideq * 4 ] ;E
+ mova m7, m4
+ movx m5, [src1q + sstrideq * 4] ;F
+ punpcklbw m4, m5 ;E F
+ pmaddubsw m4, k4k5
+ punpcklbw m1, m6 ;A B next iter
+ movx m6, [srcq + sstride6q ] ;G
+ punpcklbw m5, m6 ;E F next iter
+ punpcklbw m3, m7 ;C D next iter
+ pmaddubsw m5, k4k5
+ movx m7, [src1q + sstride6q ] ;H
+ punpcklbw m6, m7 ;G H
+ pmaddubsw m6, k6k7
+ pmaddubsw m3, k2k3
+ pmaddubsw m1, k0k1
+ paddsw m0, m4
+ paddsw m2, m6
+ movx m6, [srcq + sstrideq * 8 ] ;H next iter
+ punpcklbw m7, m6
+ pmaddubsw m7, k6k7
+ paddsw m0, m2
+ paddsw m0, krd
+ psraw m0, 7
+ paddsw m1, m5
+%ifidn %1, v8_add_src
+ pxor m6, m6
+ movu m4, [srcq]
+ punpcklbw m4, m6
+ paddsw m0, m4
+%endif
+ packuswb m0, m0
+
+ paddsw m3, m7
+ paddsw m1, m3
+ paddsw m1, krd
+ psraw m1, 7
+%ifidn %1, v8_add_src
+ movu m4, [src1q]
+ punpcklbw m4, m6
+ paddsw m1, m4
+%endif
+ lea srcq, [srcq + sstrideq * 2 ]
+ lea src1q, [src1q + sstrideq * 2]
+ packuswb m1, m1
+
+%ifidn %1, v8_avg
+ movx m2, [dstq]
+ pavgb m0, m2
+%endif
+ movx [dstq], m0
+ add dstq, dst_stride
+%ifidn %1, v8_avg
+ movx m3, [dstq]
+ pavgb m1, m3
+%endif
+ movx [dstq], m1
+ add dstq, dst_stride
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movx m0, [srcq ] ;A
+ movx m1, [srcq + sstrideq ] ;B
+ movx m6, [srcq + sstride6q ] ;G
+ punpcklbw m0, m1 ;A B
+ movx m7, [src1q + sstride6q ] ;H
+ pmaddubsw m0, k0k1
+ movx m2, [srcq + sstrideq * 2 ] ;C
+ punpcklbw m6, m7 ;G H
+ movx m3, [src1q + sstrideq * 2] ;D
+ pmaddubsw m6, k6k7
+ movx m4, [srcq + sstrideq * 4 ] ;E
+ punpcklbw m2, m3 ;C D
+ movx m5, [src1q + sstrideq * 4] ;F
+ punpcklbw m4, m5 ;E F
+ pmaddubsw m2, k2k3
+ pmaddubsw m4, k4k5
+ paddsw m2, m6
+ paddsw m0, m4
+ paddsw m0, m2
+ paddsw m0, krd
+ psraw m0, 7
+%ifidn %1, v8_add_src
+ pxor m6, m6
+ movu m4, [srcq]
+ punpcklbw m4, m6
+ paddsw m0, m4
+%endif
+ packuswb m0, m0
+%ifidn %1, v8_avg
+ movx m1, [dstq]
+ pavgb m0, m1
+%endif
+ movx [dstq], m0
+
+%else
+ ; AOM_ARCH_X86_64
+
+ movx m0, [srcq ] ;A
+ movx m1, [srcq + sstrideq ] ;B
+ lea srcq, [srcq + sstrideq * 2 ]
+ movx m2, [srcq] ;C
+ movx m3, [srcq + sstrideq] ;D
+ lea srcq, [srcq + sstrideq * 2 ]
+ movx m4, [srcq] ;E
+ movx m5, [srcq + sstrideq] ;F
+ lea srcq, [srcq + sstrideq * 2 ]
+ movx m6, [srcq] ;G
+ punpcklbw m0, m1 ;A B
+ punpcklbw m1, m2 ;A B next iter
+ punpcklbw m2, m3 ;C D
+ punpcklbw m3, m4 ;C D next iter
+ punpcklbw m4, m5 ;E F
+ punpcklbw m5, m6 ;E F next iter
+
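+  ; x86-64 path: the A..G row pairs stay resident in registers, so each
+  ; iteration only loads the two new bottom rows, slides the window down with
+  ; register moves, and writes two output rows.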
+.loop:
+ ;Do two rows at once
+ movx m7, [srcq + sstrideq] ;H
+ lea srcq, [srcq + sstrideq * 2 ]
+ movx m14, [srcq] ;H next iter
+ punpcklbw m6, m7 ;G H
+ punpcklbw m7, m14 ;G H next iter
+ pmaddubsw m8, m0, k0k1
+ pmaddubsw m9, m1, k0k1
+ mova m0, m2
+ mova m1, m3
+ pmaddubsw m10, m2, k2k3
+ pmaddubsw m11, m3, k2k3
+ mova m2, m4
+ mova m3, m5
+ pmaddubsw m4, k4k5
+ pmaddubsw m5, k4k5
+ paddsw m8, m4
+ paddsw m9, m5
+ mova m4, m6
+ mova m5, m7
+ pmaddubsw m6, k6k7
+ pmaddubsw m7, k6k7
+ paddsw m10, m6
+ paddsw m11, m7
+ paddsw m8, m10
+ paddsw m9, m11
+ mova m6, m14
+ paddsw m8, krd
+ paddsw m9, krd
+ psraw m8, 7
+ psraw m9, 7
+%ifidn %2, 4
+ packuswb m8, m8
+ packuswb m9, m9
+%else
+ packuswb m8, m9
+%endif
+
+%ifidn %1, v8_avg
+ movx m7, [dstq]
+%ifidn %2, 4
+ movx m10, [dstq + dstrideq]
+ pavgb m9, m10
+%else
+ movhpd m7, [dstq + dstrideq]
+%endif
+ pavgb m8, m7
+%endif
+ movx [dstq], m8
+%ifidn %2, 4
+ movx [dstq + dstrideq], m9
+%else
+ movhpd [dstq + dstrideq], m8
+%endif
+
+ lea dstq, [dstq + dstrideq * 2 ]
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movx m7, [srcq + sstrideq] ;H
+ punpcklbw m6, m7 ;G H
+ pmaddubsw m0, k0k1
+ pmaddubsw m2, k2k3
+ pmaddubsw m4, k4k5
+ pmaddubsw m6, k6k7
+ paddsw m0, m4
+ paddsw m2, m6
+ paddsw m0, m2
+ paddsw m0, krd
+ psraw m0, 7
+ packuswb m0, m0
+%ifidn %1, v8_avg
+ movx m1, [dstq]
+ pavgb m0, m1
+%endif
+ movx [dstq], m0
+
+%endif ; AOM_ARCH_X86_64
+
+.done:
+ REP_RET
+
+%endm
+
+;-------------------------------------------------------------------------------
+%macro SUBPIX_VFILTER16 1
+cglobal filter_block1d16_%1, 6, NUM_GENERAL_REG_USED, 16, LOCAL_VARS_SIZE, \
+ src, sstride, dst, dstride, height, filter
+ mova m4, [filterq]
+ SETUP_LOCAL_VARS
+
+%if AOM_ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON
+
+%if AOM_ARCH_X86_64
+ %define src1q r7
+ %define sstride6q r8
+ %define dst_stride dstrideq
+%else
+ %define src1q filterq
+ %define sstride6q dstrideq
+ %define dst_stride dstridemp
+%endif
+ lea src1q, [srcq + sstrideq]
+ lea sstride6q, [sstrideq + sstrideq * 4]
+ add sstride6q, sstrideq ;pitch * 6
+
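+  ; 16-wide vertical filter: the low 8 bytes and the high 8 bytes
+  ; ([srcq + 8], [src1q + 8], ...) of each row are filtered independently and
+  ; merged with a single packuswb before the store.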
+.loop:
+ movh m0, [srcq ] ;A
+ movh m1, [src1q ] ;B
+ movh m2, [srcq + sstrideq * 2 ] ;C
+ movh m3, [src1q + sstrideq * 2] ;D
+ movh m4, [srcq + sstrideq * 4 ] ;E
+ movh m5, [src1q + sstrideq * 4] ;F
+
+ punpcklbw m0, m1 ;A B
+ movh m6, [srcq + sstride6q] ;G
+ punpcklbw m2, m3 ;C D
+ movh m7, [src1q + sstride6q] ;H
+ punpcklbw m4, m5 ;E F
+ pmaddubsw m0, k0k1
+ movh m3, [srcq + 8] ;A
+ pmaddubsw m2, k2k3
+ punpcklbw m6, m7 ;G H
+ movh m5, [srcq + sstrideq + 8] ;B
+ pmaddubsw m4, k4k5
+ punpcklbw m3, m5 ;A B
+ movh m7, [srcq + sstrideq * 2 + 8] ;C
+ pmaddubsw m6, k6k7
+ movh m5, [src1q + sstrideq * 2 + 8] ;D
+ punpcklbw m7, m5 ;C D
+ paddsw m2, m6
+ pmaddubsw m3, k0k1
+ movh m1, [srcq + sstrideq * 4 + 8] ;E
+ paddsw m0, m4
+ pmaddubsw m7, k2k3
+ movh m6, [src1q + sstrideq * 4 + 8] ;F
+ punpcklbw m1, m6 ;E F
+ paddsw m0, m2
+ paddsw m0, krd
+ movh m2, [srcq + sstride6q + 8] ;G
+ pmaddubsw m1, k4k5
+ movh m5, [src1q + sstride6q + 8] ;H
+ psraw m0, 7
+ punpcklbw m2, m5 ;G H
+ pmaddubsw m2, k6k7
+ paddsw m7, m2
+ paddsw m3, m1
+ paddsw m3, m7
+ paddsw m3, krd
+ psraw m3, 7
+%ifidn %1, v8_add_src
+ pxor m6, m6
+ movu m4, [src1q + 2 * sstrideq] ; Fetch from 3 rows down
+ mova m5, m4
+ punpcklbw m4, m6
+ punpckhbw m5, m6
+ paddsw m0, m4
+ paddsw m3, m5
+%endif
+ packuswb m0, m3
+
+ add srcq, sstrideq
+ add src1q, sstrideq
+%ifidn %1, v8_avg
+ pavgb m0, [dstq]
+%endif
+ mova [dstq], m0
+ add dstq, dst_stride
+ dec heightd
+ jnz .loop
+ REP_RET
+
+%else
+ ; AOM_ARCH_X86_64
+ dec heightd
+
+ movu m1, [srcq ] ;A
+ movu m3, [srcq + sstrideq ] ;B
+ lea srcq, [srcq + sstrideq * 2]
+ punpcklbw m0, m1, m3 ;A B
+ punpckhbw m1, m3 ;A B
+ movu m5, [srcq] ;C
+ punpcklbw m2, m3, m5 ;A B next iter
+ punpckhbw m3, m5 ;A B next iter
+ mova tmp0, m2 ;store to stack
+ mova tmp1, m3 ;store to stack
+ movu m7, [srcq + sstrideq] ;D
+ lea srcq, [srcq + sstrideq * 2]
+ punpcklbw m4, m5, m7 ;C D
+ punpckhbw m5, m7 ;C D
+ movu m9, [srcq] ;E
+ punpcklbw m6, m7, m9 ;C D next iter
+ punpckhbw m7, m9 ;C D next iter
+ movu m11, [srcq + sstrideq] ;F
+ lea srcq, [srcq + sstrideq * 2]
+ punpcklbw m8, m9, m11 ;E F
+ punpckhbw m9, m11 ;E F
+ movu m2, [srcq] ;G
+ punpcklbw m10, m11, m2 ;E F next iter
+ punpckhbw m11, m2 ;E F next iter
+
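+  ; x86-64 path: both byte halves of the sliding 8-row window are kept in xmm
+  ; registers, with one row pair spilled to the stack (tmp0/tmp1) to relieve
+  ; register pressure. Each iteration loads two new rows, produces two output
+  ; rows, and rotates the window in place.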
+.loop:
+ ;Do two rows at once
+ pmaddubsw m13, m0, k0k1
+ mova m0, m4
+ pmaddubsw m14, m8, k4k5
+ pmaddubsw m15, m4, k2k3
+ mova m4, m8
+ paddsw m13, m14
+ movu m3, [srcq + sstrideq] ;H
+ lea srcq, [srcq + sstrideq * 2]
+ punpcklbw m14, m2, m3 ;G H
+ mova m8, m14
+ pmaddubsw m14, k6k7
+ paddsw m15, m14
+ paddsw m13, m15
+ paddsw m13, krd
+ psraw m13, 7
+
+ pmaddubsw m14, m1, k0k1
+ pmaddubsw m1, m9, k4k5
+ pmaddubsw m15, m5, k2k3
+ paddsw m14, m1
+ mova m1, m5
+ mova m5, m9
+ punpckhbw m2, m3 ;G H
+ mova m9, m2
+ pmaddubsw m2, k6k7
+ paddsw m15, m2
+ paddsw m14, m15
+ paddsw m14, krd
+ psraw m14, 7
+ packuswb m13, m14
+%ifidn %1, v8_avg
+ pavgb m13, [dstq]
+%endif
+ mova [dstq], m13
+
+ ; next iter
+ pmaddubsw m15, tmp0, k0k1
+ pmaddubsw m14, m10, k4k5
+ pmaddubsw m13, m6, k2k3
+ paddsw m15, m14
+ mova tmp0, m6
+ mova m6, m10
+ movu m2, [srcq] ;G next iter
+ punpcklbw m14, m3, m2 ;G H next iter
+ mova m10, m14
+ pmaddubsw m14, k6k7
+ paddsw m13, m14
+ paddsw m15, m13
+ paddsw m15, krd
+ psraw m15, 7
+
+ pmaddubsw m14, tmp1, k0k1
+ mova tmp1, m7
+ pmaddubsw m13, m7, k2k3
+ mova m7, m11
+ pmaddubsw m11, k4k5
+ paddsw m14, m11
+ punpckhbw m3, m2 ;G H next iter
+ mova m11, m3
+ pmaddubsw m3, k6k7
+ paddsw m13, m3
+ paddsw m14, m13
+ paddsw m14, krd
+ psraw m14, 7
+ packuswb m15, m14
+%ifidn %1, v8_avg
+ pavgb m15, [dstq + dstrideq]
+%endif
+ mova [dstq + dstrideq], m15
+ lea dstq, [dstq + dstrideq * 2]
+ sub heightd, 2
+ jg .loop
+
+ ; Do last row if output_height is odd
+ jne .done
+
+ movu m3, [srcq + sstrideq] ;H
+ punpcklbw m6, m2, m3 ;G H
+ punpckhbw m2, m3 ;G H
+ pmaddubsw m0, k0k1
+ pmaddubsw m1, k0k1
+ pmaddubsw m4, k2k3
+ pmaddubsw m5, k2k3
+ pmaddubsw m8, k4k5
+ pmaddubsw m9, k4k5
+ pmaddubsw m6, k6k7
+ pmaddubsw m2, k6k7
+ paddsw m0, m8
+ paddsw m1, m9
+ paddsw m4, m6
+ paddsw m5, m2
+ paddsw m0, m4
+ paddsw m1, m5
+ paddsw m0, krd
+ paddsw m1, krd
+ psraw m0, 7
+ psraw m1, 7
+ packuswb m0, m1
+%ifidn %1, v8_avg
+ pavgb m0, [dstq]
+%endif
+ mova [dstq], m0
+
+.done:
+ REP_RET
+
+%endif ; AOM_ARCH_X86_64
+
+%endm
+
+INIT_XMM ssse3
+SUBPIX_VFILTER16 v8
+SUBPIX_VFILTER v8, 8
+SUBPIX_VFILTER v8, 4
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
new file mode 100644
index 0000000000..90dd55a4be
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm
@@ -0,0 +1,295 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+%macro GET_PARAM_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm3, [rdx] ;load filters
+ pshuflw xmm4, xmm3, 11111111b ;k3
+ psrldq xmm3, 8
+ pshuflw xmm3, xmm3, 0b ;k4
+ punpcklqdq xmm4, xmm3 ;k3k4
+
+ movq xmm3, rcx ;rounding
+ pshufd xmm3, xmm3, 0
+
+ pxor xmm2, xmm2
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
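+; Bilinear (2-tap) filter: out = (p0 * k3 + p1 * k4 + 64) >> 7, where p0/p1
+; are adjacent pixels (horizontally, or one row apart for the vertical case).
+; The 0x0400040 constant loaded in the GET_PARAM macros replicates the +64
+; rounding into each 16-bit lane after pshufd.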
+%macro APPLY_FILTER_4 1
+
+ punpckldq xmm0, xmm1 ;two row in one register
+ punpcklbw xmm0, xmm2 ;unpack to word
+ pmullw xmm0, xmm4 ;multiply the filter factors
+
+ movdqa xmm1, xmm0
+ psrldq xmm1, 8
+ paddsw xmm0, xmm1
+
+ paddsw xmm0, xmm3 ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack to byte
+
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+
+ movd [rdi], xmm0
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+%macro GET_PARAM 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x0400040
+
+ movdqa xmm7, [rdx] ;load filters
+
+ pshuflw xmm6, xmm7, 11111111b ;k3
+ pshufhw xmm7, xmm7, 0b ;k4
+ punpcklwd xmm6, xmm6
+ punpckhwd xmm7, xmm7
+
+ movq xmm4, rcx ;rounding
+ pshufd xmm4, xmm4, 0
+
+ pxor xmm5, xmm5
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro APPLY_FILTER_8 1
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+
+ pmullw xmm0, xmm6
+ pmullw xmm1, xmm7
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm4 ;rounding
+ psraw xmm0, 7 ;shift
+ packuswb xmm0, xmm0 ;pack back to byte
+%if %1
+ movq xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movq [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+%macro APPLY_FILTER_16 1
+ punpcklbw xmm0, xmm5
+ punpcklbw xmm1, xmm5
+ punpckhbw xmm2, xmm5
+ punpckhbw xmm3, xmm5
+
+ pmullw xmm0, xmm6
+ pmullw xmm1, xmm7
+ pmullw xmm2, xmm6
+ pmullw xmm3, xmm7
+
+ paddsw xmm0, xmm1
+ paddsw xmm2, xmm3
+
+ paddsw xmm0, xmm4 ;rounding
+ paddsw xmm2, xmm4
+ psraw xmm0, 7 ;shift
+ psraw xmm2, 7
+ packuswb xmm0, xmm2 ;pack back to byte
+%if %1
+ movdqu xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movdqu [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+SECTION .text
+
+globalsym(aom_filter_block1d4_v2_sse2)
+sym(aom_filter_block1d4_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movd xmm0, [rsi] ;load src
+ movd xmm1, [rsi + rax]
+
+ APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_filter_block1d8_v2_sse2)
+sym(aom_filter_block1d8_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movq xmm0, [rsi] ;0
+ movq xmm1, [rsi + rax] ;1
+
+ APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_filter_block1d16_v2_sse2)
+sym(aom_filter_block1d16_v2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + rax] ;1
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_filter_block1d4_h2_sse2)
+sym(aom_filter_block1d4_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_filter_block1d8_h2_sse2)
+sym(aom_filter_block1d8_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_filter_block1d16_h2_sse2)
+sym(aom_filter_block1d16_h2_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 1]
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+
+ APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
new file mode 100644
index 0000000000..253bc26d38
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/aom_subpixel_bilinear_ssse3.asm
@@ -0,0 +1,267 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+%macro GET_PARAM_4 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov ecx, 0x01000100
+
+ movdqa xmm3, [rdx] ;load filters
+ psrldq xmm3, 6
+ packsswb xmm3, xmm3
+ pshuflw xmm3, xmm3, 0b ;k3_k4
+
+ movd xmm2, ecx ;rounding_shift
+ pshufd xmm2, xmm2, 0
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
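+; SSSE3 bilinear filter: the two taps are packed to signed bytes and applied
+; with a single pmaddubsw after interleaving the two source pixels; pmulhrsw
+; against 0x0100 per word then performs the (+64) >> 7 rounding shift in one
+; instruction.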
+%macro APPLY_FILTER_4 1
+ punpcklbw xmm0, xmm1
+ pmaddubsw xmm0, xmm3
+
+ pmulhrsw xmm0, xmm2 ;rounding(+64)+shift(>>7)
+ packuswb xmm0, xmm0 ;pack to byte
+
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movd [rdi], xmm0
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+%macro GET_PARAM 0
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov ecx, 0x01000100
+
+ movdqa xmm7, [rdx] ;load filters
+ psrldq xmm7, 6
+ packsswb xmm7, xmm7
+ pshuflw xmm7, xmm7, 0b ;k3_k4
+ punpcklwd xmm7, xmm7
+
+ movd xmm6, ecx ;rounding_shift
+ pshufd xmm6, xmm6, 0
+
+ movsxd rax, DWORD PTR arg(1) ;pixels_per_line
+ movsxd rdx, DWORD PTR arg(3) ;out_pitch
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+%endm
+
+%macro APPLY_FILTER_8 1
+ punpcklbw xmm0, xmm1
+ pmaddubsw xmm0, xmm7
+
+ pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7)
+ packuswb xmm0, xmm0 ;pack back to byte
+
+%if %1
+ movq xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movq [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+%macro APPLY_FILTER_16 1
+ punpcklbw xmm0, xmm1
+ punpckhbw xmm2, xmm1
+ pmaddubsw xmm0, xmm7
+ pmaddubsw xmm2, xmm7
+
+ pmulhrsw xmm0, xmm6 ;rounding(+64)+shift(>>7)
+ pmulhrsw xmm2, xmm6
+ packuswb xmm0, xmm2 ;pack back to byte
+
+%if %1
+ movdqu xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movdqu [rdi], xmm0 ;store the result
+
+ lea rsi, [rsi + rax]
+ lea rdi, [rdi + rdx]
+ dec rcx
+%endm
+
+SECTION .text
+
+globalsym(aom_filter_block1d4_v2_ssse3)
+sym(aom_filter_block1d4_v2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movd xmm0, [rsi] ;load src
+ movd xmm1, [rsi + rax]
+
+ APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_filter_block1d8_v2_ssse3)
+sym(aom_filter_block1d8_v2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movq xmm0, [rsi] ;0
+ movq xmm1, [rsi + rax] ;1
+
+ APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_filter_block1d16_v2_ssse3)
+sym(aom_filter_block1d16_v2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;0
+ movdqu xmm1, [rsi + rax] ;1
+ movdqa xmm2, xmm0
+
+ APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_filter_block1d4_h2_ssse3)
+sym(aom_filter_block1d4_h2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM_4
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_4 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_filter_block1d8_h2_ssse3)
+sym(aom_filter_block1d8_h2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqa xmm1, xmm0
+ psrldq xmm1, 1
+
+ APPLY_FILTER_8 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+globalsym(aom_filter_block1d16_h2_ssse3)
+sym(aom_filter_block1d16_h2_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ ; end prolog
+
+ GET_PARAM
+.loop:
+ movdqu xmm0, [rsi] ;load src
+ movdqu xmm1, [rsi + 1]
+ movdqa xmm2, xmm0
+
+ APPLY_FILTER_16 0
+ jnz .loop
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/aom_dsp/x86/avg_intrin_avx2.c b/third_party/aom/aom_dsp/x86/avg_intrin_avx2.c
new file mode 100644
index 0000000000..49fcd72098
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/avg_intrin_avx2.c
@@ -0,0 +1,897 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/bitdepth_conversion_avx2.h"
+#include "aom_ports/mem.h"
+
+static INLINE void sign_extend_16bit_to_32bit_avx2(__m256i in, __m256i zero,
+ __m256i *out_lo,
+ __m256i *out_hi) {
+ const __m256i sign_bits = _mm256_cmpgt_epi16(zero, in);
+ *out_lo = _mm256_unpacklo_epi16(in, sign_bits);
+ *out_hi = _mm256_unpackhi_epi16(in, sign_bits);
+}
+
+static void hadamard_col8x2_avx2(__m256i *in, int iter) {
+ __m256i a0 = in[0];
+ __m256i a1 = in[1];
+ __m256i a2 = in[2];
+ __m256i a3 = in[3];
+ __m256i a4 = in[4];
+ __m256i a5 = in[5];
+ __m256i a6 = in[6];
+ __m256i a7 = in[7];
+
+ __m256i b0 = _mm256_add_epi16(a0, a1);
+ __m256i b1 = _mm256_sub_epi16(a0, a1);
+ __m256i b2 = _mm256_add_epi16(a2, a3);
+ __m256i b3 = _mm256_sub_epi16(a2, a3);
+ __m256i b4 = _mm256_add_epi16(a4, a5);
+ __m256i b5 = _mm256_sub_epi16(a4, a5);
+ __m256i b6 = _mm256_add_epi16(a6, a7);
+ __m256i b7 = _mm256_sub_epi16(a6, a7);
+
+ a0 = _mm256_add_epi16(b0, b2);
+ a1 = _mm256_add_epi16(b1, b3);
+ a2 = _mm256_sub_epi16(b0, b2);
+ a3 = _mm256_sub_epi16(b1, b3);
+ a4 = _mm256_add_epi16(b4, b6);
+ a5 = _mm256_add_epi16(b5, b7);
+ a6 = _mm256_sub_epi16(b4, b6);
+ a7 = _mm256_sub_epi16(b5, b7);
+
+ if (iter == 0) {
+ b0 = _mm256_add_epi16(a0, a4);
+ b7 = _mm256_add_epi16(a1, a5);
+ b3 = _mm256_add_epi16(a2, a6);
+ b4 = _mm256_add_epi16(a3, a7);
+ b2 = _mm256_sub_epi16(a0, a4);
+ b6 = _mm256_sub_epi16(a1, a5);
+ b1 = _mm256_sub_epi16(a2, a6);
+ b5 = _mm256_sub_epi16(a3, a7);
+
+ a0 = _mm256_unpacklo_epi16(b0, b1);
+ a1 = _mm256_unpacklo_epi16(b2, b3);
+ a2 = _mm256_unpackhi_epi16(b0, b1);
+ a3 = _mm256_unpackhi_epi16(b2, b3);
+ a4 = _mm256_unpacklo_epi16(b4, b5);
+ a5 = _mm256_unpacklo_epi16(b6, b7);
+ a6 = _mm256_unpackhi_epi16(b4, b5);
+ a7 = _mm256_unpackhi_epi16(b6, b7);
+
+ b0 = _mm256_unpacklo_epi32(a0, a1);
+ b1 = _mm256_unpacklo_epi32(a4, a5);
+ b2 = _mm256_unpackhi_epi32(a0, a1);
+ b3 = _mm256_unpackhi_epi32(a4, a5);
+ b4 = _mm256_unpacklo_epi32(a2, a3);
+ b5 = _mm256_unpacklo_epi32(a6, a7);
+ b6 = _mm256_unpackhi_epi32(a2, a3);
+ b7 = _mm256_unpackhi_epi32(a6, a7);
+
+ in[0] = _mm256_unpacklo_epi64(b0, b1);
+ in[1] = _mm256_unpackhi_epi64(b0, b1);
+ in[2] = _mm256_unpacklo_epi64(b2, b3);
+ in[3] = _mm256_unpackhi_epi64(b2, b3);
+ in[4] = _mm256_unpacklo_epi64(b4, b5);
+ in[5] = _mm256_unpackhi_epi64(b4, b5);
+ in[6] = _mm256_unpacklo_epi64(b6, b7);
+ in[7] = _mm256_unpackhi_epi64(b6, b7);
+ } else {
+ in[0] = _mm256_add_epi16(a0, a4);
+ in[7] = _mm256_add_epi16(a1, a5);
+ in[3] = _mm256_add_epi16(a2, a6);
+ in[4] = _mm256_add_epi16(a3, a7);
+ in[2] = _mm256_sub_epi16(a0, a4);
+ in[6] = _mm256_sub_epi16(a1, a5);
+ in[1] = _mm256_sub_epi16(a2, a6);
+ in[5] = _mm256_sub_epi16(a3, a7);
+ }
+}
+
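+// Each 256-bit register holds one row from two horizontally adjacent 8x8
+// blocks (one block per 128-bit lane), so the two hadamard_col8x2_avx2 passes
+// (columns + transpose, then rows) compute both 8x8 Hadamard transforms at
+// once.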
+void aom_hadamard_lp_8x8_dual_avx2(const int16_t *src_diff,
+ ptrdiff_t src_stride, int16_t *coeff) {
+ __m256i src[8];
+ src[0] = _mm256_loadu_si256((const __m256i *)src_diff);
+ src[1] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[2] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[3] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[4] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[5] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[6] = _mm256_loadu_si256((const __m256i *)(src_diff += src_stride));
+ src[7] = _mm256_loadu_si256((const __m256i *)(src_diff + src_stride));
+
+ hadamard_col8x2_avx2(src, 0);
+ hadamard_col8x2_avx2(src, 1);
+
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[0], src[1], 0x20));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[2], src[3], 0x20));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[4], src[5], 0x20));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[6], src[7], 0x20));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[0], src[1], 0x31));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[2], src[3], 0x31));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[4], src[5], 0x31));
+ coeff += 16;
+ _mm256_storeu_si256((__m256i *)coeff,
+ _mm256_permute2x128_si256(src[6], src[7], 0x31));
+}
+
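+// 16x16 Hadamard: the four 8x8 sub-blocks are transformed first (two dual-8x8
+// calls), then a final butterfly combines the four 64-coefficient groups with
+// a >> 1 normalization. With is_final == 0 the result stays in an int16_t
+// scratch buffer so the 32x32 caller can skip a tran_low_t store/load.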
+static INLINE void hadamard_16x16_avx2(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff,
+ int is_final) {
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
+ int16_t *t_coeff = temp_coeff;
+ int16_t *coeff16 = (int16_t *)coeff;
+ int idx;
+ for (idx = 0; idx < 2; ++idx) {
+ const int16_t *src_ptr = src_diff + idx * 8 * src_stride;
+ aom_hadamard_lp_8x8_dual_avx2(src_ptr, src_stride,
+ t_coeff + (idx * 64 * 2));
+ }
+
+ for (idx = 0; idx < 64; idx += 16) {
+ const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+ const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
+ const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
+ const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));
+
+ __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
+ __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
+ __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
+ __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);
+
+ b0 = _mm256_srai_epi16(b0, 1);
+ b1 = _mm256_srai_epi16(b1, 1);
+ b2 = _mm256_srai_epi16(b2, 1);
+ b3 = _mm256_srai_epi16(b3, 1);
+ if (is_final) {
+ store_tran_low(_mm256_add_epi16(b0, b2), coeff);
+ store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64);
+ store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128);
+ store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192);
+ coeff += 16;
+ } else {
+ _mm256_storeu_si256((__m256i *)coeff16, _mm256_add_epi16(b0, b2));
+ _mm256_storeu_si256((__m256i *)(coeff16 + 64), _mm256_add_epi16(b1, b3));
+ _mm256_storeu_si256((__m256i *)(coeff16 + 128), _mm256_sub_epi16(b0, b2));
+ _mm256_storeu_si256((__m256i *)(coeff16 + 192), _mm256_sub_epi16(b1, b3));
+ coeff16 += 16;
+ }
+ t_coeff += 16;
+ }
+}
+
+void aom_hadamard_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ hadamard_16x16_avx2(src_diff, src_stride, coeff, 1);
+}
+
+void aom_hadamard_lp_16x16_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+ int16_t *coeff) {
+ int16_t *t_coeff = coeff;
+ for (int idx = 0; idx < 2; ++idx) {
+ const int16_t *src_ptr = src_diff + idx * 8 * src_stride;
+ aom_hadamard_lp_8x8_dual_avx2(src_ptr, src_stride,
+ t_coeff + (idx * 64 * 2));
+ }
+
+ for (int idx = 0; idx < 64; idx += 16) {
+ const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+ const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
+ const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
+ const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));
+
+ __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
+ __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
+ __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
+ __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);
+
+ b0 = _mm256_srai_epi16(b0, 1);
+ b1 = _mm256_srai_epi16(b1, 1);
+ b2 = _mm256_srai_epi16(b2, 1);
+ b3 = _mm256_srai_epi16(b3, 1);
+ _mm256_storeu_si256((__m256i *)coeff, _mm256_add_epi16(b0, b2));
+ _mm256_storeu_si256((__m256i *)(coeff + 64), _mm256_add_epi16(b1, b3));
+ _mm256_storeu_si256((__m256i *)(coeff + 128), _mm256_sub_epi16(b0, b2));
+ _mm256_storeu_si256((__m256i *)(coeff + 192), _mm256_sub_epi16(b1, b3));
+ coeff += 16;
+ t_coeff += 16;
+ }
+}
+
+void aom_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ // For high bitdepths, it is unnecessary to store_tran_low
+ // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
+ // next stage. Output to an intermediate buffer first, then store_tran_low()
+ // in the final stage.
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
+ int16_t *t_coeff = temp_coeff;
+ int idx;
+ __m256i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo,
+ b3_lo;
+ __m256i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi,
+ b3_hi;
+ __m256i b0, b1, b2, b3;
+ const __m256i zero = _mm256_setzero_si256();
+ for (idx = 0; idx < 4; ++idx) {
+ // src_diff: 9 bit, dynamic range [-255, 255]
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+ hadamard_16x16_avx2(src_ptr, src_stride,
+ (tran_low_t *)(t_coeff + idx * 256), 0);
+ }
+
+ for (idx = 0; idx < 256; idx += 16) {
+ const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+ const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
+ const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
+ const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));
+
+ // Sign extend 16 bit to 32 bit.
+ sign_extend_16bit_to_32bit_avx2(coeff0, zero, &coeff0_lo, &coeff0_hi);
+ sign_extend_16bit_to_32bit_avx2(coeff1, zero, &coeff1_lo, &coeff1_hi);
+ sign_extend_16bit_to_32bit_avx2(coeff2, zero, &coeff2_lo, &coeff2_hi);
+ sign_extend_16bit_to_32bit_avx2(coeff3, zero, &coeff3_lo, &coeff3_hi);
+
+ b0_lo = _mm256_add_epi32(coeff0_lo, coeff1_lo);
+ b0_hi = _mm256_add_epi32(coeff0_hi, coeff1_hi);
+
+ b1_lo = _mm256_sub_epi32(coeff0_lo, coeff1_lo);
+ b1_hi = _mm256_sub_epi32(coeff0_hi, coeff1_hi);
+
+ b2_lo = _mm256_add_epi32(coeff2_lo, coeff3_lo);
+ b2_hi = _mm256_add_epi32(coeff2_hi, coeff3_hi);
+
+ b3_lo = _mm256_sub_epi32(coeff2_lo, coeff3_lo);
+ b3_hi = _mm256_sub_epi32(coeff2_hi, coeff3_hi);
+
+ b0_lo = _mm256_srai_epi32(b0_lo, 2);
+ b1_lo = _mm256_srai_epi32(b1_lo, 2);
+ b2_lo = _mm256_srai_epi32(b2_lo, 2);
+ b3_lo = _mm256_srai_epi32(b3_lo, 2);
+
+ b0_hi = _mm256_srai_epi32(b0_hi, 2);
+ b1_hi = _mm256_srai_epi32(b1_hi, 2);
+ b2_hi = _mm256_srai_epi32(b2_hi, 2);
+ b3_hi = _mm256_srai_epi32(b3_hi, 2);
+
+ b0 = _mm256_packs_epi32(b0_lo, b0_hi);
+ b1 = _mm256_packs_epi32(b1_lo, b1_hi);
+ b2 = _mm256_packs_epi32(b2_lo, b2_hi);
+ b3 = _mm256_packs_epi32(b3_lo, b3_hi);
+
+ store_tran_low(_mm256_add_epi16(b0, b2), coeff);
+ store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256);
+ store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 512);
+ store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 768);
+
+ coeff += 16;
+ t_coeff += 16;
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static void highbd_hadamard_col8_avx2(__m256i *in, int iter) {
+ __m256i a0 = in[0];
+ __m256i a1 = in[1];
+ __m256i a2 = in[2];
+ __m256i a3 = in[3];
+ __m256i a4 = in[4];
+ __m256i a5 = in[5];
+ __m256i a6 = in[6];
+ __m256i a7 = in[7];
+
+ __m256i b0 = _mm256_add_epi32(a0, a1);
+ __m256i b1 = _mm256_sub_epi32(a0, a1);
+ __m256i b2 = _mm256_add_epi32(a2, a3);
+ __m256i b3 = _mm256_sub_epi32(a2, a3);
+ __m256i b4 = _mm256_add_epi32(a4, a5);
+ __m256i b5 = _mm256_sub_epi32(a4, a5);
+ __m256i b6 = _mm256_add_epi32(a6, a7);
+ __m256i b7 = _mm256_sub_epi32(a6, a7);
+
+ a0 = _mm256_add_epi32(b0, b2);
+ a1 = _mm256_add_epi32(b1, b3);
+ a2 = _mm256_sub_epi32(b0, b2);
+ a3 = _mm256_sub_epi32(b1, b3);
+ a4 = _mm256_add_epi32(b4, b6);
+ a5 = _mm256_add_epi32(b5, b7);
+ a6 = _mm256_sub_epi32(b4, b6);
+ a7 = _mm256_sub_epi32(b5, b7);
+
+ if (iter == 0) {
+ b0 = _mm256_add_epi32(a0, a4);
+ b7 = _mm256_add_epi32(a1, a5);
+ b3 = _mm256_add_epi32(a2, a6);
+ b4 = _mm256_add_epi32(a3, a7);
+ b2 = _mm256_sub_epi32(a0, a4);
+ b6 = _mm256_sub_epi32(a1, a5);
+ b1 = _mm256_sub_epi32(a2, a6);
+ b5 = _mm256_sub_epi32(a3, a7);
+
+ a0 = _mm256_unpacklo_epi32(b0, b1);
+ a1 = _mm256_unpacklo_epi32(b2, b3);
+ a2 = _mm256_unpackhi_epi32(b0, b1);
+ a3 = _mm256_unpackhi_epi32(b2, b3);
+ a4 = _mm256_unpacklo_epi32(b4, b5);
+ a5 = _mm256_unpacklo_epi32(b6, b7);
+ a6 = _mm256_unpackhi_epi32(b4, b5);
+ a7 = _mm256_unpackhi_epi32(b6, b7);
+
+ b0 = _mm256_unpacklo_epi64(a0, a1);
+ b1 = _mm256_unpacklo_epi64(a4, a5);
+ b2 = _mm256_unpackhi_epi64(a0, a1);
+ b3 = _mm256_unpackhi_epi64(a4, a5);
+ b4 = _mm256_unpacklo_epi64(a2, a3);
+ b5 = _mm256_unpacklo_epi64(a6, a7);
+ b6 = _mm256_unpackhi_epi64(a2, a3);
+ b7 = _mm256_unpackhi_epi64(a6, a7);
+
+ in[0] = _mm256_permute2x128_si256(b0, b1, 0x20);
+ in[1] = _mm256_permute2x128_si256(b0, b1, 0x31);
+ in[2] = _mm256_permute2x128_si256(b2, b3, 0x20);
+ in[3] = _mm256_permute2x128_si256(b2, b3, 0x31);
+ in[4] = _mm256_permute2x128_si256(b4, b5, 0x20);
+ in[5] = _mm256_permute2x128_si256(b4, b5, 0x31);
+ in[6] = _mm256_permute2x128_si256(b6, b7, 0x20);
+ in[7] = _mm256_permute2x128_si256(b6, b7, 0x31);
+ } else {
+ in[0] = _mm256_add_epi32(a0, a4);
+ in[7] = _mm256_add_epi32(a1, a5);
+ in[3] = _mm256_add_epi32(a2, a6);
+ in[4] = _mm256_add_epi32(a3, a7);
+ in[2] = _mm256_sub_epi32(a0, a4);
+ in[6] = _mm256_sub_epi32(a1, a5);
+ in[1] = _mm256_sub_epi32(a2, a6);
+ in[5] = _mm256_sub_epi32(a3, a7);
+ }
+}
+
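+// High bit-depth 8x8 Hadamard: the 16-bit residuals are widened to 32 bits up
+// front so the butterflies cannot overflow, and the result is stored directly
+// as 32-bit tran_low_t.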
+void aom_highbd_hadamard_8x8_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ __m128i src16[8];
+ __m256i src32[8];
+
+ src16[0] = _mm_loadu_si128((const __m128i *)src_diff);
+ src16[1] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[2] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[3] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[4] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[5] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[6] = _mm_loadu_si128((const __m128i *)(src_diff += src_stride));
+ src16[7] = _mm_loadu_si128((const __m128i *)(src_diff + src_stride));
+
+ src32[0] = _mm256_cvtepi16_epi32(src16[0]);
+ src32[1] = _mm256_cvtepi16_epi32(src16[1]);
+ src32[2] = _mm256_cvtepi16_epi32(src16[2]);
+ src32[3] = _mm256_cvtepi16_epi32(src16[3]);
+ src32[4] = _mm256_cvtepi16_epi32(src16[4]);
+ src32[5] = _mm256_cvtepi16_epi32(src16[5]);
+ src32[6] = _mm256_cvtepi16_epi32(src16[6]);
+ src32[7] = _mm256_cvtepi16_epi32(src16[7]);
+
+ highbd_hadamard_col8_avx2(src32, 0);
+ highbd_hadamard_col8_avx2(src32, 1);
+
+ _mm256_storeu_si256((__m256i *)coeff, src32[0]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[1]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[2]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[3]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[4]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[5]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[6]);
+ coeff += 8;
+ _mm256_storeu_si256((__m256i *)coeff, src32[7]);
+}
+
+void aom_highbd_hadamard_16x16_avx2(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff) {
+ int idx;
+ tran_low_t *t_coeff = coeff;
+ for (idx = 0; idx < 4; ++idx) {
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+ aom_highbd_hadamard_8x8_avx2(src_ptr, src_stride, t_coeff + idx * 64);
+ }
+
+ for (idx = 0; idx < 64; idx += 8) {
+ __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+ __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
+ __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
+ __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));
+
+ __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
+ __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
+ __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
+ __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);
+
+ b0 = _mm256_srai_epi32(b0, 1);
+ b1 = _mm256_srai_epi32(b1, 1);
+ b2 = _mm256_srai_epi32(b2, 1);
+ b3 = _mm256_srai_epi32(b3, 1);
+
+ coeff0 = _mm256_add_epi32(b0, b2);
+ coeff1 = _mm256_add_epi32(b1, b3);
+ coeff2 = _mm256_sub_epi32(b0, b2);
+ coeff3 = _mm256_sub_epi32(b1, b3);
+
+ _mm256_storeu_si256((__m256i *)coeff, coeff0);
+ _mm256_storeu_si256((__m256i *)(coeff + 64), coeff1);
+ _mm256_storeu_si256((__m256i *)(coeff + 128), coeff2);
+ _mm256_storeu_si256((__m256i *)(coeff + 192), coeff3);
+
+ coeff += 8;
+ t_coeff += 8;
+ }
+}
+
+void aom_highbd_hadamard_32x32_avx2(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff) {
+ int idx;
+ tran_low_t *t_coeff = coeff;
+ for (idx = 0; idx < 4; ++idx) {
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+ aom_highbd_hadamard_16x16_avx2(src_ptr, src_stride, t_coeff + idx * 256);
+ }
+
+ for (idx = 0; idx < 256; idx += 8) {
+ __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
+ __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
+ __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
+ __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));
+
+ __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
+ __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
+ __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
+ __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);
+
+ b0 = _mm256_srai_epi32(b0, 2);
+ b1 = _mm256_srai_epi32(b1, 2);
+ b2 = _mm256_srai_epi32(b2, 2);
+ b3 = _mm256_srai_epi32(b3, 2);
+
+ coeff0 = _mm256_add_epi32(b0, b2);
+ coeff1 = _mm256_add_epi32(b1, b3);
+ coeff2 = _mm256_sub_epi32(b0, b2);
+ coeff3 = _mm256_sub_epi32(b1, b3);
+
+ _mm256_storeu_si256((__m256i *)coeff, coeff0);
+ _mm256_storeu_si256((__m256i *)(coeff + 256), coeff1);
+ _mm256_storeu_si256((__m256i *)(coeff + 512), coeff2);
+ _mm256_storeu_si256((__m256i *)(coeff + 768), coeff3);
+
+ coeff += 8;
+ t_coeff += 8;
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
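+// SATD: sum of the absolute values of the (already transformed) coefficients.
+// 32-bit absolute values are accumulated per lane and reduced with the
+// horizontal add at the end.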
+int aom_satd_avx2(const tran_low_t *coeff, int length) {
+ __m256i accum = _mm256_setzero_si256();
+ int i;
+
+ for (i = 0; i < length; i += 8, coeff += 8) {
+ const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff);
+ const __m256i abs = _mm256_abs_epi32(src_line);
+ accum = _mm256_add_epi32(accum, abs);
+ }
+
+ { // 32 bit horizontal add
+ const __m256i a = _mm256_srli_si256(accum, 8);
+ const __m256i b = _mm256_add_epi32(accum, a);
+ const __m256i c = _mm256_srli_epi64(b, 32);
+ const __m256i d = _mm256_add_epi32(b, c);
+ const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d),
+ _mm256_extractf128_si256(d, 1));
+ return _mm_cvtsi128_si32(accum_128);
+ }
+}
+
+int aom_satd_lp_avx2(const int16_t *coeff, int length) {
+ const __m256i one = _mm256_set1_epi16(1);
+ __m256i accum = _mm256_setzero_si256();
+
+ for (int i = 0; i < length; i += 16) {
+ const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff);
+ const __m256i abs = _mm256_abs_epi16(src_line);
+ const __m256i sum = _mm256_madd_epi16(abs, one);
+ accum = _mm256_add_epi32(accum, sum);
+ coeff += 16;
+ }
+
+ { // 32 bit horizontal add
+ const __m256i a = _mm256_srli_si256(accum, 8);
+ const __m256i b = _mm256_add_epi32(accum, a);
+ const __m256i c = _mm256_srli_epi64(b, 32);
+ const __m256i d = _mm256_add_epi32(b, c);
+ const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d),
+ _mm256_extractf128_si256(d, 1));
+ return _mm_cvtsi128_si32(accum_128);
+ }
+}
+
+static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) {
+ __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo)));
+ a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1);
+ return a;
+}
+
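+// Averages the four 8x8 sub-blocks of a 16x16 area in one pass:
+// xx_loadu2_mi128 places the top and bottom 8-row halves in the two 128-bit
+// lanes, _mm256_sad_epu8 yields one 8-byte row sum per 64-bit lane, and each
+// block average is (sum + 32) >> 6.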
+void aom_avg_8x8_quad_avx2(const uint8_t *s, int p, int x16_idx, int y16_idx,
+ int *avg) {
+ const uint8_t *s_y0 = s + y16_idx * p + x16_idx;
+ const uint8_t *s_y1 = s_y0 + 8 * p;
+ __m256i sum0, sum1, s0, s1, s2, s3, u0;
+ u0 = _mm256_setzero_si256();
+ s0 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1, s_y0), u0);
+ s1 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + p, s_y0 + p), u0);
+ s2 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 2 * p, s_y0 + 2 * p), u0);
+ s3 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 3 * p, s_y0 + 3 * p), u0);
+ sum0 = _mm256_add_epi16(s0, s1);
+ sum1 = _mm256_add_epi16(s2, s3);
+ s0 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 4 * p, s_y0 + 4 * p), u0);
+ s1 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 5 * p, s_y0 + 5 * p), u0);
+ s2 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 6 * p, s_y0 + 6 * p), u0);
+ s3 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 7 * p, s_y0 + 7 * p), u0);
+ sum0 = _mm256_add_epi16(sum0, _mm256_add_epi16(s0, s1));
+ sum1 = _mm256_add_epi16(sum1, _mm256_add_epi16(s2, s3));
+ sum0 = _mm256_add_epi16(sum0, sum1);
+
+ // (avg + 32) >> 6
+ __m256i rounding = _mm256_set1_epi32(32);
+ sum0 = _mm256_add_epi32(sum0, rounding);
+ sum0 = _mm256_srli_epi32(sum0, 6);
+ __m128i lo = _mm256_castsi256_si128(sum0);
+ __m128i hi = _mm256_extracti128_si256(sum0, 1);
+ avg[0] = _mm_cvtsi128_si32(lo);
+ avg[1] = _mm_extract_epi32(lo, 2);
+ avg[2] = _mm_cvtsi128_si32(hi);
+ avg[3] = _mm_extract_epi32(hi, 2);
+}
+
+void aom_int_pro_row_avx2(int16_t *hbuf, const uint8_t *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor) {
+  // The SIMD implementation assumes the width is a multiple of 16 and the
+  // height is a multiple of 2; other sizes would need additional SIMD support.
+ assert(width % 16 == 0 && height % 2 == 0);
+
+ if (width % 32 == 0) {
+ const __m256i zero = _mm256_setzero_si256();
+ for (int wd = 0; wd < width; wd += 32) {
+ const uint8_t *ref_tmp = ref + wd;
+ int16_t *hbuf_tmp = hbuf + wd;
+ __m256i s0 = zero;
+ __m256i s1 = zero;
+ int idx = 0;
+ do {
+ __m256i src_line = _mm256_loadu_si256((const __m256i *)ref_tmp);
+ __m256i t0 = _mm256_unpacklo_epi8(src_line, zero);
+ __m256i t1 = _mm256_unpackhi_epi8(src_line, zero);
+ s0 = _mm256_add_epi16(s0, t0);
+ s1 = _mm256_add_epi16(s1, t1);
+ ref_tmp += ref_stride;
+
+ src_line = _mm256_loadu_si256((const __m256i *)ref_tmp);
+ t0 = _mm256_unpacklo_epi8(src_line, zero);
+ t1 = _mm256_unpackhi_epi8(src_line, zero);
+ s0 = _mm256_add_epi16(s0, t0);
+ s1 = _mm256_add_epi16(s1, t1);
+ ref_tmp += ref_stride;
+ idx += 2;
+ } while (idx < height);
+ s0 = _mm256_srai_epi16(s0, norm_factor);
+ s1 = _mm256_srai_epi16(s1, norm_factor);
+ _mm_storeu_si128((__m128i *)(hbuf_tmp), _mm256_castsi256_si128(s0));
+ _mm_storeu_si128((__m128i *)(hbuf_tmp + 8), _mm256_castsi256_si128(s1));
+ _mm_storeu_si128((__m128i *)(hbuf_tmp + 16),
+ _mm256_extractf128_si256(s0, 1));
+ _mm_storeu_si128((__m128i *)(hbuf_tmp + 24),
+ _mm256_extractf128_si256(s1, 1));
+ }
+ } else if (width % 16 == 0) {
+ aom_int_pro_row_sse2(hbuf, ref, ref_stride, width, height, norm_factor);
+ }
+}
+
+static INLINE void load_from_src_buf(const uint8_t *ref1, __m256i *src,
+ const int stride) {
+ src[0] = _mm256_loadu_si256((const __m256i *)ref1);
+ src[1] = _mm256_loadu_si256((const __m256i *)(ref1 + stride));
+ src[2] = _mm256_loadu_si256((const __m256i *)(ref1 + (2 * stride)));
+ src[3] = _mm256_loadu_si256((const __m256i *)(ref1 + (3 * stride)));
+}
+
+#define CALC_TOT_SAD_AND_STORE \
+ /* r00 r10 x x r01 r11 x x | r02 r12 x x r03 r13 x x */ \
+ const __m256i r01 = _mm256_add_epi16(_mm256_slli_si256(r1, 2), r0); \
+ /* r00 r10 r20 x r01 r11 r21 x | r02 r12 r22 x r03 r13 r23 x */ \
+ const __m256i r012 = _mm256_add_epi16(_mm256_slli_si256(r2, 4), r01); \
+ /* r00 r10 r20 r30 r01 r11 r21 r31 | r02 r12 r22 r32 r03 r13 r23 r33 */ \
+ const __m256i result0 = _mm256_add_epi16(_mm256_slli_si256(r3, 6), r012); \
+ \
+ const __m128i results0 = _mm_add_epi16( \
+ _mm256_castsi256_si128(result0), _mm256_extractf128_si256(result0, 1)); \
+ const __m128i results1 = \
+ _mm_add_epi16(results0, _mm_srli_si128(results0, 8)); \
+ _mm_storel_epi64((__m128i *)vbuf, _mm_srli_epi16(results1, norm_factor));
+
+static INLINE void aom_int_pro_col_16wd_avx2(int16_t *vbuf, const uint8_t *ref,
+ const int ref_stride,
+ const int height,
+ int norm_factor) {
+ const __m256i zero = _mm256_setzero_si256();
+ int ht = 0;
+  // After the SAD operation the result sits in the lower 16 bits of each
+  // 64-bit lane and the upper 16 bits are zero. Eight rows are processed at a
+  // time so that those upper 16 bits can also be put to use.
+ do {
+ __m256i src_00 =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(ref)));
+ src_00 = _mm256_inserti128_si256(
+ src_00, _mm_loadu_si128((const __m128i *)(ref + ref_stride * 4)), 1);
+ __m256i src_01 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(ref + ref_stride * 1)));
+ src_01 = _mm256_inserti128_si256(
+ src_01, _mm_loadu_si128((const __m128i *)(ref + ref_stride * 5)), 1);
+ __m256i src_10 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(ref + ref_stride * 2)));
+ src_10 = _mm256_inserti128_si256(
+ src_10, _mm_loadu_si128((const __m128i *)(ref + ref_stride * 6)), 1);
+ __m256i src_11 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(ref + ref_stride * 3)));
+ src_11 = _mm256_inserti128_si256(
+ src_11, _mm_loadu_si128((const __m128i *)(ref + ref_stride * 7)), 1);
+
+ // s00 x x x s01 x x x | s40 x x x s41 x x x
+ const __m256i s0 = _mm256_sad_epu8(src_00, zero);
+ // s10 x x x s11 x x x | s50 x x x s51 x x x
+ const __m256i s1 = _mm256_sad_epu8(src_01, zero);
+ // s20 x x x s21 x x x | s60 x x x s61 x x x
+ const __m256i s2 = _mm256_sad_epu8(src_10, zero);
+ // s30 x x x s31 x x x | s70 x x x s71 x x x
+ const __m256i s3 = _mm256_sad_epu8(src_11, zero);
+
+ // s00 s10 x x x x x x | s40 s50 x x x x x x
+ const __m256i s0_lo = _mm256_unpacklo_epi16(s0, s1);
+ // s01 s11 x x x x x x | s41 s51 x x x x x x
+ const __m256i s0_hi = _mm256_unpackhi_epi16(s0, s1);
+ // s20 s30 x x x x x x | s60 s70 x x x x x x
+ const __m256i s1_lo = _mm256_unpacklo_epi16(s2, s3);
+ // s21 s31 x x x x x x | s61 s71 x x x x x x
+ const __m256i s1_hi = _mm256_unpackhi_epi16(s2, s3);
+
+ // s0 s1 x x x x x x | s4 s5 x x x x x x
+ const __m256i s0_add = _mm256_add_epi16(s0_lo, s0_hi);
+ // s2 s3 x x x x x x | s6 s7 x x x x x x
+ const __m256i s1_add = _mm256_add_epi16(s1_lo, s1_hi);
+
+    // s0 s1 s2 s3 s4 s5 s6 s7
+ const __m128i results = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_unpacklo_epi32(s0_add, s1_add), 0x08));
+ _mm_storeu_si128((__m128i *)vbuf, _mm_srli_epi16(results, norm_factor));
+ vbuf += 8;
+ ref += (ref_stride << 3);
+ ht += 8;
+ } while (ht < height);
+}
+
+void aom_int_pro_col_avx2(int16_t *vbuf, const uint8_t *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor) {
+ assert(width % 16 == 0);
+ if (width == 128) {
+ const __m256i zero = _mm256_setzero_si256();
+ for (int ht = 0; ht < height; ht += 4) {
+ __m256i src[16];
+ // Load source data.
+ load_from_src_buf(ref, &src[0], ref_stride);
+ load_from_src_buf(ref + 32, &src[4], ref_stride);
+ load_from_src_buf(ref + 64, &src[8], ref_stride);
+ load_from_src_buf(ref + 96, &src[12], ref_stride);
+
+ // Row0 output: r00 x x x r01 x x x | r02 x x x r03 x x x
+ const __m256i s0 = _mm256_add_epi16(_mm256_sad_epu8(src[0], zero),
+ _mm256_sad_epu8(src[4], zero));
+ const __m256i s1 = _mm256_add_epi16(_mm256_sad_epu8(src[8], zero),
+ _mm256_sad_epu8(src[12], zero));
+ const __m256i r0 = _mm256_add_epi16(s0, s1);
+ // Row1 output: r10 x x x r11 x x x | r12 x x x r13 x x x
+ const __m256i s2 = _mm256_add_epi16(_mm256_sad_epu8(src[1], zero),
+ _mm256_sad_epu8(src[5], zero));
+ const __m256i s3 = _mm256_add_epi16(_mm256_sad_epu8(src[9], zero),
+ _mm256_sad_epu8(src[13], zero));
+ const __m256i r1 = _mm256_add_epi16(s2, s3);
+ // Row2 output: r20 x x x r21 x x x | r22 x x x r23 x x x
+ const __m256i s4 = _mm256_add_epi16(_mm256_sad_epu8(src[2], zero),
+ _mm256_sad_epu8(src[6], zero));
+ const __m256i s5 = _mm256_add_epi16(_mm256_sad_epu8(src[10], zero),
+ _mm256_sad_epu8(src[14], zero));
+ const __m256i r2 = _mm256_add_epi16(s4, s5);
+ // Row3 output: r30 x x x r31 x x x | r32 x x x r33 x x x
+ const __m256i s6 = _mm256_add_epi16(_mm256_sad_epu8(src[3], zero),
+ _mm256_sad_epu8(src[7], zero));
+ const __m256i s7 = _mm256_add_epi16(_mm256_sad_epu8(src[11], zero),
+ _mm256_sad_epu8(src[15], zero));
+ const __m256i r3 = _mm256_add_epi16(s6, s7);
+
+ CALC_TOT_SAD_AND_STORE
+ vbuf += 4;
+ ref += ref_stride << 2;
+ }
+ } else if (width == 64) {
+ const __m256i zero = _mm256_setzero_si256();
+ for (int ht = 0; ht < height; ht += 4) {
+ __m256i src[8];
+ // Load source data.
+ load_from_src_buf(ref, &src[0], ref_stride);
+ load_from_src_buf(ref + 32, &src[4], ref_stride);
+
+ // Row0 output: r00 x x x r01 x x x | r02 x x x r03 x x x
+ const __m256i s0 = _mm256_sad_epu8(src[0], zero);
+ const __m256i s1 = _mm256_sad_epu8(src[4], zero);
+ const __m256i r0 = _mm256_add_epi16(s0, s1);
+ // Row1 output: r10 x x x r11 x x x | r12 x x x r13 x x x
+ const __m256i s2 = _mm256_sad_epu8(src[1], zero);
+ const __m256i s3 = _mm256_sad_epu8(src[5], zero);
+ const __m256i r1 = _mm256_add_epi16(s2, s3);
+ // Row2 output: r20 x x x r21 x x x | r22 x x x r23 x x x
+ const __m256i s4 = _mm256_sad_epu8(src[2], zero);
+ const __m256i s5 = _mm256_sad_epu8(src[6], zero);
+ const __m256i r2 = _mm256_add_epi16(s4, s5);
+ // Row3 output: r30 x x x r31 x x x | r32 x x x r33 x x x
+ const __m256i s6 = _mm256_sad_epu8(src[3], zero);
+ const __m256i s7 = _mm256_sad_epu8(src[7], zero);
+ const __m256i r3 = _mm256_add_epi16(s6, s7);
+
+ CALC_TOT_SAD_AND_STORE
+ vbuf += 4;
+ ref += ref_stride << 2;
+ }
+ } else if (width == 32) {
+ assert(height % 2 == 0);
+ const __m256i zero = _mm256_setzero_si256();
+ for (int ht = 0; ht < height; ht += 4) {
+ __m256i src[4];
+ // Load source data.
+ load_from_src_buf(ref, &src[0], ref_stride);
+
+ // s00 x x x s01 x x x s02 x x x s03 x x x
+ const __m256i r0 = _mm256_sad_epu8(src[0], zero);
+ // s10 x x x s11 x x x s12 x x x s13 x x x
+ const __m256i r1 = _mm256_sad_epu8(src[1], zero);
+ // s20 x x x s21 x x x s22 x x x s23 x x x
+ const __m256i r2 = _mm256_sad_epu8(src[2], zero);
+ // s30 x x x s31 x x x s32 x x x s33 x x x
+ const __m256i r3 = _mm256_sad_epu8(src[3], zero);
+
+ CALC_TOT_SAD_AND_STORE
+ vbuf += 4;
+ ref += ref_stride << 2;
+ }
+ } else if (width == 16) {
+ aom_int_pro_col_16wd_avx2(vbuf, ref, ref_stride, height, norm_factor);
+ }
+}
+
+static inline void calc_vector_mean_sse_64wd(const int16_t *ref,
+ const int16_t *src, __m256i *mean,
+ __m256i *sse) {
+ const __m256i src_line0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i src_line1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+ const __m256i src_line2 = _mm256_loadu_si256((const __m256i *)(src + 32));
+ const __m256i src_line3 = _mm256_loadu_si256((const __m256i *)(src + 48));
+ const __m256i ref_line0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i ref_line1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+ const __m256i ref_line2 = _mm256_loadu_si256((const __m256i *)(ref + 32));
+ const __m256i ref_line3 = _mm256_loadu_si256((const __m256i *)(ref + 48));
+
+ const __m256i diff0 = _mm256_sub_epi16(ref_line0, src_line0);
+ const __m256i diff1 = _mm256_sub_epi16(ref_line1, src_line1);
+ const __m256i diff2 = _mm256_sub_epi16(ref_line2, src_line2);
+ const __m256i diff3 = _mm256_sub_epi16(ref_line3, src_line3);
+ const __m256i diff_sqr0 = _mm256_madd_epi16(diff0, diff0);
+ const __m256i diff_sqr1 = _mm256_madd_epi16(diff1, diff1);
+ const __m256i diff_sqr2 = _mm256_madd_epi16(diff2, diff2);
+ const __m256i diff_sqr3 = _mm256_madd_epi16(diff3, diff3);
+
+ *mean = _mm256_add_epi16(*mean, _mm256_add_epi16(diff0, diff1));
+ *mean = _mm256_add_epi16(*mean, diff2);
+ *mean = _mm256_add_epi16(*mean, diff3);
+ *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(diff_sqr0, diff_sqr1));
+ *sse = _mm256_add_epi32(*sse, diff_sqr2);
+ *sse = _mm256_add_epi32(*sse, diff_sqr3);
+}
+
+#define CALC_VAR_FROM_MEAN_SSE(mean, sse) \
+ { \
+ mean = _mm256_madd_epi16(mean, _mm256_set1_epi16(1)); \
+ mean = _mm256_hadd_epi32(mean, sse); \
+ mean = _mm256_add_epi32(mean, _mm256_bsrli_epi128(mean, 4)); \
+ const __m128i result = _mm_add_epi32(_mm256_castsi256_si128(mean), \
+ _mm256_extractf128_si256(mean, 1)); \
+ /*(mean * mean): dynamic range 31 bits.*/ \
+ const int mean_int = _mm_extract_epi32(result, 0); \
+ const int sse_int = _mm_extract_epi32(result, 2); \
+ const unsigned int mean_abs = abs(mean_int); \
+ var = sse_int - ((mean_abs * mean_abs) >> (bwl + 2)); \
+ }
+
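+// Variance of d = ref - src over width = 4 << bwl samples:
+// var = sum(d^2) - (sum(d))^2 / width; the division is the >> (bwl + 2) in
+// CALC_VAR_FROM_MEAN_SSE, since 4 << bwl == 1 << (bwl + 2).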
+// ref: [0 - 510]
+// src: [0 - 510]
+// bwl: {2, 3, 4, 5}
+int aom_vector_var_avx2(const int16_t *ref, const int16_t *src, int bwl) {
+ const int width = 4 << bwl;
+ assert(width % 16 == 0 && width <= 128);
+ int var = 0;
+
+  // Instead of looping over the width in steps of 16, the wider cases are
+  // unrolled below to avoid some addition operations.
+ if (width == 128) {
+ __m256i mean = _mm256_setzero_si256();
+ __m256i sse = _mm256_setzero_si256();
+
+ calc_vector_mean_sse_64wd(src, ref, &mean, &sse);
+ calc_vector_mean_sse_64wd(src + 64, ref + 64, &mean, &sse);
+ CALC_VAR_FROM_MEAN_SSE(mean, sse)
+ } else if (width == 64) {
+ __m256i mean = _mm256_setzero_si256();
+ __m256i sse = _mm256_setzero_si256();
+
+ calc_vector_mean_sse_64wd(src, ref, &mean, &sse);
+ CALC_VAR_FROM_MEAN_SSE(mean, sse)
+ } else if (width == 32) {
+ const __m256i src_line0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i ref_line0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i src_line1 = _mm256_loadu_si256((const __m256i *)(src + 16));
+ const __m256i ref_line1 = _mm256_loadu_si256((const __m256i *)(ref + 16));
+
+ const __m256i diff0 = _mm256_sub_epi16(ref_line0, src_line0);
+ const __m256i diff1 = _mm256_sub_epi16(ref_line1, src_line1);
+ const __m256i diff_sqr0 = _mm256_madd_epi16(diff0, diff0);
+ const __m256i diff_sqr1 = _mm256_madd_epi16(diff1, diff1);
+ const __m256i sse = _mm256_add_epi32(diff_sqr0, diff_sqr1);
+ __m256i mean = _mm256_add_epi16(diff0, diff1);
+
+ CALC_VAR_FROM_MEAN_SSE(mean, sse)
+ } else if (width == 16) {
+ const __m256i src_line = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i ref_line = _mm256_loadu_si256((const __m256i *)ref);
+ __m256i mean = _mm256_sub_epi16(ref_line, src_line);
+ const __m256i sse = _mm256_madd_epi16(mean, mean);
+
+ CALC_VAR_FROM_MEAN_SSE(mean, sse)
+ }
+ return var;
+}
diff --git a/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c b/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c
new file mode 100644
index 0000000000..9ab9143eee
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/avg_intrin_sse2.c
@@ -0,0 +1,700 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/bitdepth_conversion_sse2.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_ports/mem.h"
+
+static INLINE void sign_extend_16bit_to_32bit_sse2(__m128i in, __m128i zero,
+ __m128i *out_lo,
+ __m128i *out_hi) {
+ const __m128i sign_bits = _mm_cmplt_epi16(in, zero);
+ *out_lo = _mm_unpacklo_epi16(in, sign_bits);
+ *out_hi = _mm_unpackhi_epi16(in, sign_bits);
+}
+
+static INLINE __m128i invert_sign_32_sse2(__m128i a, __m128i sign) {
+ a = _mm_xor_si128(a, sign);
+ return _mm_sub_epi32(a, sign);
+}
+
+void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
+ int *min, int *max) {
+ __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
+ u0 = _mm_setzero_si128();
+ // Row 0
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff0 = _mm_max_epi16(diff, negdiff);
+ // Row 1
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
+ minabsdiff = _mm_min_epi16(absdiff0, absdiff);
+ // Row 2
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 3
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 4
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 5
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 6
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 7
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+
+ maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
+ maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
+ maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
+ *max = _mm_extract_epi16(maxabsdiff, 0);
+
+ minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
+ minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
+ minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
+ *min = _mm_extract_epi16(minabsdiff, 0);
+}
+
+unsigned int aom_avg_8x8_sse2(const uint8_t *s, int p) {
+ __m128i sum0, sum1, s0, s1, s2, s3, u0;
+ unsigned int avg = 0;
+ u0 = _mm_setzero_si128();
+ s0 = loadh_epi64((const __m128i *)(s + p),
+ _mm_loadl_epi64((const __m128i *)(s)));
+ s1 = loadh_epi64((const __m128i *)(s + 3 * p),
+ _mm_loadl_epi64((const __m128i *)(s + 2 * p)));
+ s2 = loadh_epi64((const __m128i *)(s + 5 * p),
+ _mm_loadl_epi64((const __m128i *)(s + 4 * p)));
+ s3 = loadh_epi64((const __m128i *)(s + 7 * p),
+ _mm_loadl_epi64((const __m128i *)(s + 6 * p)));
+ s0 = _mm_sad_epu8(s0, u0);
+ s1 = _mm_sad_epu8(s1, u0);
+ s2 = _mm_sad_epu8(s2, u0);
+ s3 = _mm_sad_epu8(s3, u0);
+
+ sum0 = _mm_add_epi16(s0, s1);
+ sum1 = _mm_add_epi16(s2, s3);
+ sum0 = _mm_add_epi16(sum0, sum1);
+ sum0 = _mm_add_epi16(sum0, _mm_srli_si128(sum0, 8));
+ avg = _mm_cvtsi128_si32(sum0);
+ return (avg + 32) >> 6;
+}
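
The 8x8 average above is the rounded mean of 64 pixels; a scalar sketch (the
helper name is illustrative only):

  static unsigned int avg_8x8_ref(const uint8_t *s, int p) {
    unsigned int sum = 0;
    for (int r = 0; r < 8; ++r)
      for (int c = 0; c < 8; ++c) sum += s[r * p + c];
    return (sum + 32) >> 6;  // rounded division by 64
  }
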
+
+void calc_avg_8x8_dual_sse2(const uint8_t *s, int p, int *avg) {
+ __m128i sum0, sum1, s0, s1, s2, s3, u0;
+ u0 = _mm_setzero_si128();
+ s0 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s)), u0);
+ s1 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + p)), u0);
+ s2 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 2 * p)), u0);
+ s3 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 3 * p)), u0);
+ sum0 = _mm_add_epi16(s0, s1);
+ sum1 = _mm_add_epi16(s2, s3);
+ s0 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 4 * p)), u0);
+ s1 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 5 * p)), u0);
+ s2 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 6 * p)), u0);
+ s3 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 7 * p)), u0);
+ sum0 = _mm_add_epi16(sum0, _mm_add_epi16(s0, s1));
+ sum1 = _mm_add_epi16(sum1, _mm_add_epi16(s2, s3));
+ sum0 = _mm_add_epi16(sum0, sum1);
+
+ // (avg + 32) >> 6
+ __m128i rounding = _mm_set1_epi32(32);
+ sum0 = _mm_add_epi32(sum0, rounding);
+ sum0 = _mm_srli_epi32(sum0, 6);
+ avg[0] = _mm_cvtsi128_si32(sum0);
+ avg[1] = _mm_extract_epi16(sum0, 4);
+}
+
+void aom_avg_8x8_quad_sse2(const uint8_t *s, int p, int x16_idx, int y16_idx,
+ int *avg) {
+ const uint8_t *s_ptr = s + y16_idx * p + x16_idx;
+ for (int k = 0; k < 2; k++) {
+ calc_avg_8x8_dual_sse2(s_ptr, p, avg + k * 2);
+ s_ptr += 8 * p;
+ }
+}
+
+unsigned int aom_avg_4x4_sse2(const uint8_t *s, int p) {
+ __m128i s0, s1, u0;
+ unsigned int avg = 0;
+ u0 = _mm_setzero_si128();
+ s0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)(s)),
+ _mm_cvtsi32_si128(*(const int *)(s + p)));
+ s1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)(s + p * 2)),
+ _mm_cvtsi32_si128(*(const int *)(s + p * 3)));
+ s0 = _mm_sad_epu8(s0, u0);
+ s1 = _mm_sad_epu8(s1, u0);
+ s0 = _mm_add_epi16(s0, s1);
+ avg = _mm_cvtsi128_si32(s0);
+ return (avg + 8) >> 4;
+}
+
+static INLINE void hadamard_col4_sse2(__m128i *in, int iter) {
+ const __m128i a0 = in[0];
+ const __m128i a1 = in[1];
+ const __m128i a2 = in[2];
+ const __m128i a3 = in[3];
+ const __m128i b0 = _mm_srai_epi16(_mm_add_epi16(a0, a1), 1);
+ const __m128i b1 = _mm_srai_epi16(_mm_sub_epi16(a0, a1), 1);
+ const __m128i b2 = _mm_srai_epi16(_mm_add_epi16(a2, a3), 1);
+ const __m128i b3 = _mm_srai_epi16(_mm_sub_epi16(a2, a3), 1);
+ in[0] = _mm_add_epi16(b0, b2);
+ in[1] = _mm_add_epi16(b1, b3);
+ in[2] = _mm_sub_epi16(b0, b2);
+ in[3] = _mm_sub_epi16(b1, b3);
+
+ if (iter == 0) {
+ const __m128i ba = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i dc = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i dcba_lo = _mm_unpacklo_epi32(ba, dc);
+ const __m128i dcba_hi = _mm_unpackhi_epi32(ba, dc);
+ in[0] = dcba_lo;
+ in[1] = _mm_srli_si128(dcba_lo, 8);
+ in[2] = dcba_hi;
+ in[3] = _mm_srli_si128(dcba_hi, 8);
+ }
+}
+
+void aom_hadamard_4x4_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ __m128i src[4];
+ src[0] = _mm_loadl_epi64((const __m128i *)src_diff);
+ src[1] = _mm_loadl_epi64((const __m128i *)(src_diff += src_stride));
+ src[2] = _mm_loadl_epi64((const __m128i *)(src_diff += src_stride));
+ src[3] = _mm_loadl_epi64((const __m128i *)(src_diff + src_stride));
+
+ hadamard_col4_sse2(src, 0);
+ hadamard_col4_sse2(src, 1);
+
+ store_tran_low(_mm_unpacklo_epi64(src[0], src[1]), coeff);
+ coeff += 8;
+ store_tran_low(_mm_unpacklo_epi64(src[2], src[3]), coeff);
+}
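
The two passes above apply the same 4-point butterfly, first down the columns
and then across the rows of the transposed result. A scalar sketch of one
butterfly with the >>1 normalization used here (the name is illustrative
only):

  static void hadamard4_ref(const int16_t in[4], int16_t out[4]) {
    const int b0 = (in[0] + in[1]) >> 1;
    const int b1 = (in[0] - in[1]) >> 1;
    const int b2 = (in[2] + in[3]) >> 1;
    const int b3 = (in[2] - in[3]) >> 1;
    out[0] = (int16_t)(b0 + b2);
    out[1] = (int16_t)(b1 + b3);
    out[2] = (int16_t)(b0 - b2);
    out[3] = (int16_t)(b1 - b3);
  }
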
+
+static INLINE void hadamard_col8_sse2(__m128i *in, int iter) {
+ __m128i a0 = in[0];
+ __m128i a1 = in[1];
+ __m128i a2 = in[2];
+ __m128i a3 = in[3];
+ __m128i a4 = in[4];
+ __m128i a5 = in[5];
+ __m128i a6 = in[6];
+ __m128i a7 = in[7];
+
+ __m128i b0 = _mm_add_epi16(a0, a1);
+ __m128i b1 = _mm_sub_epi16(a0, a1);
+ __m128i b2 = _mm_add_epi16(a2, a3);
+ __m128i b3 = _mm_sub_epi16(a2, a3);
+ __m128i b4 = _mm_add_epi16(a4, a5);
+ __m128i b5 = _mm_sub_epi16(a4, a5);
+ __m128i b6 = _mm_add_epi16(a6, a7);
+ __m128i b7 = _mm_sub_epi16(a6, a7);
+
+ a0 = _mm_add_epi16(b0, b2);
+ a1 = _mm_add_epi16(b1, b3);
+ a2 = _mm_sub_epi16(b0, b2);
+ a3 = _mm_sub_epi16(b1, b3);
+ a4 = _mm_add_epi16(b4, b6);
+ a5 = _mm_add_epi16(b5, b7);
+ a6 = _mm_sub_epi16(b4, b6);
+ a7 = _mm_sub_epi16(b5, b7);
+
+ if (iter == 0) {
+ b0 = _mm_add_epi16(a0, a4);
+ b7 = _mm_add_epi16(a1, a5);
+ b3 = _mm_add_epi16(a2, a6);
+ b4 = _mm_add_epi16(a3, a7);
+ b2 = _mm_sub_epi16(a0, a4);
+ b6 = _mm_sub_epi16(a1, a5);
+ b1 = _mm_sub_epi16(a2, a6);
+ b5 = _mm_sub_epi16(a3, a7);
+
+ a0 = _mm_unpacklo_epi16(b0, b1);
+ a1 = _mm_unpacklo_epi16(b2, b3);
+ a2 = _mm_unpackhi_epi16(b0, b1);
+ a3 = _mm_unpackhi_epi16(b2, b3);
+ a4 = _mm_unpacklo_epi16(b4, b5);
+ a5 = _mm_unpacklo_epi16(b6, b7);
+ a6 = _mm_unpackhi_epi16(b4, b5);
+ a7 = _mm_unpackhi_epi16(b6, b7);
+
+ b0 = _mm_unpacklo_epi32(a0, a1);
+ b1 = _mm_unpacklo_epi32(a4, a5);
+ b2 = _mm_unpackhi_epi32(a0, a1);
+ b3 = _mm_unpackhi_epi32(a4, a5);
+ b4 = _mm_unpacklo_epi32(a2, a3);
+ b5 = _mm_unpacklo_epi32(a6, a7);
+ b6 = _mm_unpackhi_epi32(a2, a3);
+ b7 = _mm_unpackhi_epi32(a6, a7);
+
+ in[0] = _mm_unpacklo_epi64(b0, b1);
+ in[1] = _mm_unpackhi_epi64(b0, b1);
+ in[2] = _mm_unpacklo_epi64(b2, b3);
+ in[3] = _mm_unpackhi_epi64(b2, b3);
+ in[4] = _mm_unpacklo_epi64(b4, b5);
+ in[5] = _mm_unpackhi_epi64(b4, b5);
+ in[6] = _mm_unpacklo_epi64(b6, b7);
+ in[7] = _mm_unpackhi_epi64(b6, b7);
+ } else {
+ in[0] = _mm_add_epi16(a0, a4);
+ in[7] = _mm_add_epi16(a1, a5);
+ in[3] = _mm_add_epi16(a2, a6);
+ in[4] = _mm_add_epi16(a3, a7);
+ in[2] = _mm_sub_epi16(a0, a4);
+ in[6] = _mm_sub_epi16(a1, a5);
+ in[1] = _mm_sub_epi16(a2, a6);
+ in[5] = _mm_sub_epi16(a3, a7);
+ }
+}
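
For reference, the iter == 1 path above reduces to the following 8-point
butterfly, including its permuted output order; the iter == 0 path performs
the same arithmetic and additionally transposes the 8x8 block. This is a
sketch with illustrative names, using int intermediates instead of wrapping
16-bit lanes:

  static void hadamard8_ref(const int16_t a[8], int16_t out[8]) {
    int b[8], c[8];
    b[0] = a[0] + a[1]; b[1] = a[0] - a[1];
    b[2] = a[2] + a[3]; b[3] = a[2] - a[3];
    b[4] = a[4] + a[5]; b[5] = a[4] - a[5];
    b[6] = a[6] + a[7]; b[7] = a[6] - a[7];
    c[0] = b[0] + b[2]; c[1] = b[1] + b[3];
    c[2] = b[0] - b[2]; c[3] = b[1] - b[3];
    c[4] = b[4] + b[6]; c[5] = b[5] + b[7];
    c[6] = b[4] - b[6]; c[7] = b[5] - b[7];
    out[0] = (int16_t)(c[0] + c[4]); out[7] = (int16_t)(c[1] + c[5]);
    out[3] = (int16_t)(c[2] + c[6]); out[4] = (int16_t)(c[3] + c[7]);
    out[2] = (int16_t)(c[0] - c[4]); out[6] = (int16_t)(c[1] - c[5]);
    out[1] = (int16_t)(c[2] - c[6]); out[5] = (int16_t)(c[3] - c[7]);
  }
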
+
+static INLINE void hadamard_8x8_sse2(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff,
+ int is_final) {
+ __m128i src[8];
+ src[0] = _mm_load_si128((const __m128i *)src_diff);
+ src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[7] = _mm_load_si128((const __m128i *)(src_diff + src_stride));
+
+ hadamard_col8_sse2(src, 0);
+ hadamard_col8_sse2(src, 1);
+
+ if (is_final) {
+ store_tran_low(src[0], coeff);
+ coeff += 8;
+ store_tran_low(src[1], coeff);
+ coeff += 8;
+ store_tran_low(src[2], coeff);
+ coeff += 8;
+ store_tran_low(src[3], coeff);
+ coeff += 8;
+ store_tran_low(src[4], coeff);
+ coeff += 8;
+ store_tran_low(src[5], coeff);
+ coeff += 8;
+ store_tran_low(src[6], coeff);
+ coeff += 8;
+ store_tran_low(src[7], coeff);
+ } else {
+ int16_t *coeff16 = (int16_t *)coeff;
+ _mm_store_si128((__m128i *)coeff16, src[0]);
+ coeff16 += 8;
+ _mm_store_si128((__m128i *)coeff16, src[1]);
+ coeff16 += 8;
+ _mm_store_si128((__m128i *)coeff16, src[2]);
+ coeff16 += 8;
+ _mm_store_si128((__m128i *)coeff16, src[3]);
+ coeff16 += 8;
+ _mm_store_si128((__m128i *)coeff16, src[4]);
+ coeff16 += 8;
+ _mm_store_si128((__m128i *)coeff16, src[5]);
+ coeff16 += 8;
+ _mm_store_si128((__m128i *)coeff16, src[6]);
+ coeff16 += 8;
+ _mm_store_si128((__m128i *)coeff16, src[7]);
+ }
+}
+
+void aom_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ hadamard_8x8_sse2(src_diff, src_stride, coeff, 1);
+}
+
+static INLINE void hadamard_lp_8x8_sse2(const int16_t *src_diff,
+ ptrdiff_t src_stride, int16_t *coeff) {
+ __m128i src[8];
+ src[0] = _mm_load_si128((const __m128i *)src_diff);
+ src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[7] = _mm_load_si128((const __m128i *)(src_diff + src_stride));
+
+ hadamard_col8_sse2(src, 0);
+ hadamard_col8_sse2(src, 1);
+
+ _mm_store_si128((__m128i *)coeff, src[0]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[1]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[2]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[3]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[4]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[5]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[6]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[7]);
+}
+
+void aom_hadamard_lp_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+ int16_t *coeff) {
+ hadamard_lp_8x8_sse2(src_diff, src_stride, coeff);
+}
+
+void aom_hadamard_lp_8x8_dual_sse2(const int16_t *src_diff,
+ ptrdiff_t src_stride, int16_t *coeff) {
+ for (int i = 0; i < 2; i++) {
+ hadamard_lp_8x8_sse2(src_diff + (i * 8), src_stride, coeff + (i * 64));
+ }
+}
+
+void aom_hadamard_lp_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+ int16_t *coeff) {
+ for (int idx = 0; idx < 4; ++idx) {
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+ hadamard_lp_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
+ }
+
+ int16_t *t_coeff = coeff;
+ for (int idx = 0; idx < 64; idx += 8) {
+ __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
+ __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64));
+ __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128));
+ __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192));
+
+ __m128i b0 = _mm_add_epi16(coeff0, coeff1);
+ __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
+ __m128i b2 = _mm_add_epi16(coeff2, coeff3);
+ __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
+
+ b0 = _mm_srai_epi16(b0, 1);
+ b1 = _mm_srai_epi16(b1, 1);
+ b2 = _mm_srai_epi16(b2, 1);
+ b3 = _mm_srai_epi16(b3, 1);
+
+ coeff0 = _mm_add_epi16(b0, b2);
+ coeff1 = _mm_add_epi16(b1, b3);
+ coeff2 = _mm_sub_epi16(b0, b2);
+ coeff3 = _mm_sub_epi16(b1, b3);
+
+ _mm_store_si128((__m128i *)t_coeff, coeff0);
+ _mm_store_si128((__m128i *)(t_coeff + 64), coeff1);
+ _mm_store_si128((__m128i *)(t_coeff + 128), coeff2);
+ _mm_store_si128((__m128i *)(t_coeff + 192), coeff3);
+
+ t_coeff += 8;
+ }
+}
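
The second loop above combines coefficient i of the four 8x8 sub-blocks with
one more butterfly stage and a >>1 normalization. Per coefficient it is
equivalent to this scalar step over the 16x16 low-precision buffer (a sketch
only):

  for (int i = 0; i < 64; ++i) {
    const int c0 = coeff[i], c1 = coeff[i + 64];
    const int c2 = coeff[i + 128], c3 = coeff[i + 192];
    const int b0 = (c0 + c1) >> 1, b1 = (c0 - c1) >> 1;
    const int b2 = (c2 + c3) >> 1, b3 = (c2 - c3) >> 1;
    coeff[i] = (int16_t)(b0 + b2);
    coeff[i + 64] = (int16_t)(b1 + b3);
    coeff[i + 128] = (int16_t)(b0 - b2);
    coeff[i + 192] = (int16_t)(b1 - b3);
  }
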
+
+static INLINE void hadamard_16x16_sse2(const int16_t *src_diff,
+ ptrdiff_t src_stride, tran_low_t *coeff,
+ int is_final) {
+ // For high bitdepths, it is unnecessary to store_tran_low
+ // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
+ // next stage. Output to an intermediate buffer first, then store_tran_low()
+ // in the final stage.
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
+ int16_t *t_coeff = temp_coeff;
+ int16_t *coeff16 = (int16_t *)coeff;
+ int idx;
+ for (idx = 0; idx < 4; ++idx) {
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
+ hadamard_8x8_sse2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 64),
+ 0);
+ }
+
+ for (idx = 0; idx < 64; idx += 8) {
+ __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
+ __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64));
+ __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128));
+ __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192));
+
+ __m128i b0 = _mm_add_epi16(coeff0, coeff1);
+ __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
+ __m128i b2 = _mm_add_epi16(coeff2, coeff3);
+ __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
+
+ b0 = _mm_srai_epi16(b0, 1);
+ b1 = _mm_srai_epi16(b1, 1);
+ b2 = _mm_srai_epi16(b2, 1);
+ b3 = _mm_srai_epi16(b3, 1);
+
+ coeff0 = _mm_add_epi16(b0, b2);
+ coeff1 = _mm_add_epi16(b1, b3);
+ coeff2 = _mm_sub_epi16(b0, b2);
+ coeff3 = _mm_sub_epi16(b1, b3);
+
+ if (is_final) {
+ store_tran_low_offset_4(coeff0, coeff);
+ store_tran_low_offset_4(coeff1, coeff + 64);
+ store_tran_low_offset_4(coeff2, coeff + 128);
+ store_tran_low_offset_4(coeff3, coeff + 192);
+ coeff += 4;
+ } else {
+ _mm_store_si128((__m128i *)coeff16, coeff0);
+ _mm_store_si128((__m128i *)(coeff16 + 64), coeff1);
+ _mm_store_si128((__m128i *)(coeff16 + 128), coeff2);
+ _mm_store_si128((__m128i *)(coeff16 + 192), coeff3);
+ coeff16 += 8;
+ }
+
+ t_coeff += 8;
+ // Advance the pointer by an extra 0 or 8 in alternate iterations (instead
+ // of a fixed 8) to stay coherent with the output layout of
+ // store_tran_low_offset_4().
+ coeff += (((idx >> 3) & 1) << 3);
+ }
+}
+
+void aom_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ hadamard_16x16_sse2(src_diff, src_stride, coeff, 1);
+}
+
+void aom_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
+ tran_low_t *coeff) {
+ // For high bitdepths, it is unnecessary to store_tran_low
+ // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
+ // next stage. Output to an intermediate buffer first, then store_tran_low()
+ // in the final stage.
+ DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
+ int16_t *t_coeff = temp_coeff;
+ int idx;
+ __m128i coeff0_lo, coeff1_lo, coeff2_lo, coeff3_lo, b0_lo, b1_lo, b2_lo,
+ b3_lo;
+ __m128i coeff0_hi, coeff1_hi, coeff2_hi, coeff3_hi, b0_hi, b1_hi, b2_hi,
+ b3_hi;
+ __m128i b0, b1, b2, b3;
+ const __m128i zero = _mm_setzero_si128();
+ for (idx = 0; idx < 4; ++idx) {
+ const int16_t *src_ptr =
+ src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
+ hadamard_16x16_sse2(src_ptr, src_stride,
+ (tran_low_t *)(t_coeff + idx * 256), 0);
+ }
+
+ for (idx = 0; idx < 256; idx += 8) {
+ __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
+ __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 256));
+ __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512));
+ __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768));
+
+ // Sign extend 16 bit to 32 bit.
+ sign_extend_16bit_to_32bit_sse2(coeff0, zero, &coeff0_lo, &coeff0_hi);
+ sign_extend_16bit_to_32bit_sse2(coeff1, zero, &coeff1_lo, &coeff1_hi);
+ sign_extend_16bit_to_32bit_sse2(coeff2, zero, &coeff2_lo, &coeff2_hi);
+ sign_extend_16bit_to_32bit_sse2(coeff3, zero, &coeff3_lo, &coeff3_hi);
+
+ b0_lo = _mm_add_epi32(coeff0_lo, coeff1_lo);
+ b0_hi = _mm_add_epi32(coeff0_hi, coeff1_hi);
+
+ b1_lo = _mm_sub_epi32(coeff0_lo, coeff1_lo);
+ b1_hi = _mm_sub_epi32(coeff0_hi, coeff1_hi);
+
+ b2_lo = _mm_add_epi32(coeff2_lo, coeff3_lo);
+ b2_hi = _mm_add_epi32(coeff2_hi, coeff3_hi);
+
+ b3_lo = _mm_sub_epi32(coeff2_lo, coeff3_lo);
+ b3_hi = _mm_sub_epi32(coeff2_hi, coeff3_hi);
+
+ b0_lo = _mm_srai_epi32(b0_lo, 2);
+ b1_lo = _mm_srai_epi32(b1_lo, 2);
+ b2_lo = _mm_srai_epi32(b2_lo, 2);
+ b3_lo = _mm_srai_epi32(b3_lo, 2);
+
+ b0_hi = _mm_srai_epi32(b0_hi, 2);
+ b1_hi = _mm_srai_epi32(b1_hi, 2);
+ b2_hi = _mm_srai_epi32(b2_hi, 2);
+ b3_hi = _mm_srai_epi32(b3_hi, 2);
+
+ b0 = _mm_packs_epi32(b0_lo, b0_hi);
+ b1 = _mm_packs_epi32(b1_lo, b1_hi);
+ b2 = _mm_packs_epi32(b2_lo, b2_hi);
+ b3 = _mm_packs_epi32(b3_lo, b3_hi);
+
+ coeff0 = _mm_add_epi16(b0, b2);
+ coeff1 = _mm_add_epi16(b1, b3);
+ store_tran_low_offset_4(coeff0, coeff);
+ store_tran_low_offset_4(coeff1, coeff + 256);
+
+ coeff2 = _mm_sub_epi16(b0, b2);
+ coeff3 = _mm_sub_epi16(b1, b3);
+ store_tran_low_offset_4(coeff2, coeff + 512);
+ store_tran_low_offset_4(coeff3, coeff + 768);
+
+ // Advance the pointer by 4 and 12 in alternate iterations (instead of a
+ // fixed 8) to stay coherent with the output layout of
+ // store_tran_low_offset_4().
+ coeff += (4 + (((idx >> 3) & 1) << 3));
+ t_coeff += 8;
+ }
+}
+
+int aom_satd_sse2(const tran_low_t *coeff, int length) {
+ int i;
+ const __m128i zero = _mm_setzero_si128();
+ __m128i accum = zero;
+
+ for (i = 0; i < length; i += 4) {
+ const __m128i src_line = _mm_load_si128((const __m128i *)coeff);
+ const __m128i coeff_sign = _mm_srai_epi32(src_line, 31);
+ const __m128i abs_coeff = invert_sign_32_sse2(src_line, coeff_sign);
+ accum = _mm_add_epi32(accum, abs_coeff);
+ coeff += 4;
+ }
+
+ { // cascading summation of accum
+ __m128i hi = _mm_srli_si128(accum, 8);
+ accum = _mm_add_epi32(accum, hi);
+ hi = _mm_srli_epi64(accum, 32);
+ accum = _mm_add_epi32(accum, hi);
+ }
+
+ return _mm_cvtsi128_si32(accum);
+}
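
aom_satd_sse2() is the sum of absolute Hadamard coefficients; a scalar sketch,
assuming tran_low_t is a 32-bit integer type as elsewhere in this tree (the
helper name is illustrative and abs() needs <stdlib.h>):

  static int satd_ref(const tran_low_t *coeff, int length) {
    int sum = 0;
    for (int i = 0; i < length; ++i) sum += abs(coeff[i]);
    return sum;
  }
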
+
+int aom_satd_lp_sse2(const int16_t *coeff, int length) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i accum = zero;
+
+ for (int i = 0; i < length; i += 16) {
+ const __m128i src_line0 = _mm_loadu_si128((const __m128i *)coeff);
+ const __m128i src_line1 = _mm_loadu_si128((const __m128i *)(coeff + 8));
+ const __m128i inv0 = _mm_sub_epi16(zero, src_line0);
+ const __m128i inv1 = _mm_sub_epi16(zero, src_line1);
+ const __m128i abs0 = _mm_max_epi16(src_line0, inv0); // abs(src_line)
+ const __m128i abs1 = _mm_max_epi16(src_line1, inv1); // abs(src_line)
+ const __m128i sum0 = _mm_madd_epi16(abs0, one);
+ const __m128i sum1 = _mm_madd_epi16(abs1, one);
+ accum = _mm_add_epi32(accum, sum0);
+ accum = _mm_add_epi32(accum, sum1);
+ coeff += 16;
+ }
+
+ { // cascading summation of accum
+ __m128i hi = _mm_srli_si128(accum, 8);
+ accum = _mm_add_epi32(accum, hi);
+ hi = _mm_srli_epi64(accum, 32);
+ accum = _mm_add_epi32(accum, hi);
+ }
+
+ return _mm_cvtsi128_si32(accum);
+}
+
+void aom_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor) {
+ // The SIMD implementation assumes width and height to be multiples of 16
+ // and 2, respectively. For other widths or heights, SIMD support would need
+ // to be added.
+ assert(width % 16 == 0 && height % 2 == 0);
+ __m128i zero = _mm_setzero_si128();
+
+ for (int wd = 0; wd < width; wd += 16) {
+ const uint8_t *ref_tmp = ref + wd;
+ int16_t *hbuf_tmp = hbuf + wd;
+ __m128i s0 = zero;
+ __m128i s1 = zero;
+ int idx = 0;
+ do {
+ __m128i src_line = _mm_loadu_si128((const __m128i *)ref_tmp);
+ __m128i t0 = _mm_unpacklo_epi8(src_line, zero);
+ __m128i t1 = _mm_unpackhi_epi8(src_line, zero);
+ s0 = _mm_add_epi16(s0, t0);
+ s1 = _mm_add_epi16(s1, t1);
+ ref_tmp += ref_stride;
+
+ src_line = _mm_loadu_si128((const __m128i *)ref_tmp);
+ t0 = _mm_unpacklo_epi8(src_line, zero);
+ t1 = _mm_unpackhi_epi8(src_line, zero);
+ s0 = _mm_add_epi16(s0, t0);
+ s1 = _mm_add_epi16(s1, t1);
+ ref_tmp += ref_stride;
+ idx += 2;
+ } while (idx < height);
+
+ s0 = _mm_srai_epi16(s0, norm_factor);
+ s1 = _mm_srai_epi16(s1, norm_factor);
+ _mm_storeu_si128((__m128i *)(hbuf_tmp), s0);
+ _mm_storeu_si128((__m128i *)(hbuf_tmp + 8), s1);
+ }
+}
+
+void aom_int_pro_col_sse2(int16_t *vbuf, const uint8_t *ref,
+ const int ref_stride, const int width,
+ const int height, int norm_factor) {
+ // SIMD implementation assumes width to be multiple of 16.
+ assert(width % 16 == 0);
+
+ for (int ht = 0; ht < height; ht++) {
+ const uint8_t *ref_tmp = ref + (ht * ref_stride);
+ __m128i zero = _mm_setzero_si128();
+ __m128i s0 = zero;
+ __m128i s1, src_line;
+ for (int i = 0; i < width; i += 16) {
+ src_line = _mm_loadu_si128((const __m128i *)ref_tmp);
+ s1 = _mm_sad_epu8(src_line, zero);
+ s0 = _mm_add_epi16(s0, s1);
+ ref_tmp += 16;
+ }
+
+ s1 = _mm_srli_si128(s0, 8);
+ s0 = _mm_add_epi16(s0, s1);
+ vbuf[ht] = _mm_cvtsi128_si32(s0) >> norm_factor;
+ }
+}
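
The two projection helpers above reduce to per-column and per-row sums of the
reference block, normalized by an arithmetic shift. Scalar sketches follow;
the names are illustrative, and the sketches accumulate in int whereas the
SIMD row version accumulates in 16-bit lanes:

  static void int_pro_row_ref(int16_t *hbuf, const uint8_t *ref, int stride,
                              int width, int height, int norm_factor) {
    for (int c = 0; c < width; ++c) {
      int sum = 0;
      for (int r = 0; r < height; ++r) sum += ref[r * stride + c];
      hbuf[c] = (int16_t)(sum >> norm_factor);
    }
  }

  static void int_pro_col_ref(int16_t *vbuf, const uint8_t *ref, int stride,
                              int width, int height, int norm_factor) {
    for (int r = 0; r < height; ++r) {
      int sum = 0;
      for (int c = 0; c < width; ++c) sum += ref[r * stride + c];
      vbuf[r] = (int16_t)(sum >> norm_factor);
    }
  }
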
diff --git a/third_party/aom/aom_dsp/x86/avg_intrin_sse4.c b/third_party/aom/aom_dsp/x86/avg_intrin_sse4.c
new file mode 100644
index 0000000000..b83b43122a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/avg_intrin_sse4.c
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+// ref: [0 - 510]
+// src: [0 - 510]
+// bwl: {2, 3, 4, 5}
+int aom_vector_var_sse4_1(const int16_t *ref, const int16_t *src, int bwl) {
+ const int width = 4 << bwl;
+ assert(width % 16 == 0);
+
+ const __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1);
+ __m128i mean = _mm_setzero_si128();
+ __m128i sse = _mm_setzero_si128();
+
+ for (int i = 0; i < width; i += 16) {
+ const __m128i src_line = _mm_loadu_si128((const __m128i *)src);
+ const __m128i ref_line = _mm_loadu_si128((const __m128i *)ref);
+ const __m128i src_line2 = _mm_loadu_si128((const __m128i *)(src + 8));
+ const __m128i ref_line2 = _mm_loadu_si128((const __m128i *)(ref + 8));
+ __m128i diff = _mm_sub_epi16(ref_line, src_line);
+ const __m128i diff2 = _mm_sub_epi16(ref_line2, src_line2);
+ __m128i diff_sqr = _mm_madd_epi16(diff, diff);
+ const __m128i diff_sqr2 = _mm_madd_epi16(diff2, diff2);
+
+ diff = _mm_add_epi16(diff, diff2);
+ diff_sqr = _mm_add_epi32(diff_sqr, diff_sqr2);
+ sse = _mm_add_epi32(sse, diff_sqr);
+ mean = _mm_add_epi16(mean, diff);
+
+ src += 16;
+ ref += 16;
+ }
+
+ // m0 m1 m2 m3
+ mean = _mm_madd_epi16(mean, k_one_epi16);
+ // m0+m1 m2+m3 s0+s1 s2+s3
+ __m128i result = _mm_hadd_epi32(mean, sse);
+ // m0+m1+m2+m3 s0+s1+s2+s3 x x
+ result = _mm_add_epi32(result, _mm_bsrli_si128(result, 4));
+
+ // (mean * mean): dynamic range 31 bits.
+ const int mean_int = _mm_extract_epi32(result, 0);
+ const int sse_int = _mm_extract_epi32(result, 2);
+ const unsigned int mean_abs = abs(mean_int);
+ const int var = sse_int - ((mean_abs * mean_abs) >> (bwl + 2));
+ return var;
+}
diff --git a/third_party/aom/aom_dsp/x86/bitdepth_conversion_avx2.h b/third_party/aom/aom_dsp/x86/bitdepth_conversion_avx2.h
new file mode 100644
index 0000000000..85896e2768
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/bitdepth_conversion_avx2.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+static INLINE __m256i load_tran_low(const tran_low_t *a) {
+ const __m256i a_low = _mm256_loadu_si256((const __m256i *)a);
+ const __m256i a_high = _mm256_loadu_si256((const __m256i *)(a + 8));
+ return _mm256_packs_epi32(a_low, a_high);
+}
+
+static INLINE void store_tran_low(__m256i a, tran_low_t *b) {
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i a_hi = _mm256_mulhi_epi16(a, one);
+ const __m256i a_lo = _mm256_mullo_epi16(a, one);
+ const __m256i a_1 = _mm256_unpacklo_epi16(a_lo, a_hi);
+ const __m256i a_2 = _mm256_unpackhi_epi16(a_lo, a_hi);
+ _mm256_storeu_si256((__m256i *)b, a_1);
+ _mm256_storeu_si256((__m256i *)(b + 8), a_2);
+}
diff --git a/third_party/aom/aom_dsp/x86/bitdepth_conversion_sse2.h b/third_party/aom/aom_dsp/x86/bitdepth_conversion_sse2.h
new file mode 100644
index 0000000000..ff77760b6f
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/bitdepth_conversion_sse2.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <xmmintrin.h>
+
+#include "config/aom_config.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+// Load 8 16 bit values. If the source is 32 bits then pack down with
+// saturation.
+static INLINE __m128i load_tran_low(const tran_low_t *a) {
+ const __m128i a_low = _mm_load_si128((const __m128i *)a);
+ return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4));
+}
+
+static INLINE void unpack_trans(__m128i a, __m128i *a_1, __m128i *a_2) {
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i a_hi = _mm_mulhi_epi16(a, one);
+ const __m128i a_lo = _mm_mullo_epi16(a, one);
+ *a_1 = _mm_unpacklo_epi16(a_lo, a_hi);
+ *a_2 = _mm_unpackhi_epi16(a_lo, a_hi);
+}
+
+// Store 8 16 bit values. If the destination is 32 bits then sign extend the
+// values by multiplying by 1.
+static INLINE void store_tran_low(__m128i a, tran_low_t *b) {
+ __m128i a_1, a_2;
+ unpack_trans(a, &a_1, &a_2);
+ _mm_store_si128((__m128i *)(b), a_1);
+ _mm_store_si128((__m128i *)(b + 4), a_2);
+}
+// Same as store_tran_low(), but stores the second half at an offset of 8
+// (instead of 4) so that the output layout matches that of the AVX2
+// implementation.
+static INLINE void store_tran_low_offset_4(__m128i a, tran_low_t *b) {
+ __m128i a_1, a_2;
+ unpack_trans(a, &a_1, &a_2);
+ _mm_store_si128((__m128i *)(b), a_1);
+ _mm_store_si128((__m128i *)(b + 8), a_2);
+}
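
The mulhi/mullo-by-one pair used in unpack_trans() is a sign-extension trick:
the low 16 bits of the 32-bit product v * 1 are v itself and the high 16 bits
are its sign, so interleaving them yields (int32_t)v per lane. A scalar sketch
of one lane (illustrative only):

  static int32_t sign_extend_16_ref(int16_t v) {
    const int32_t prod = (int32_t)v * 1;
    const uint16_t lo = (uint16_t)(prod & 0xffff);           // _mm_mullo_epi16
    const uint16_t hi = (uint16_t)((prod >> 16) & 0xffff);   // _mm_mulhi_epi16
    return (int32_t)(((uint32_t)hi << 16) | lo);             // unpacklo/hi_epi16
  }
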
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c
new file mode 100644
index 0000000000..e0289abe12
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_a64_hmask_sse4.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "aom/aom_integer.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+// To start out, just dispatch to the function using the 2D mask and
+// pass mask stride as 0. This can be improved upon if necessary.
+
+void aom_blend_a64_hmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ aom_blend_a64_mask_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, 0, w, h, 0, 0);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+void aom_highbd_blend_a64_hmask_sse4_1(
+ uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
+ uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h, int bd) {
+ aom_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride, src0_8, src0_stride,
+ src1_8, src1_stride, mask, 0, w, h, 0, 0,
+ bd);
+}
+#endif
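
All of the blend_a64 kernels in the files that follow implement the same
per-pixel operation; a scalar sketch, assuming AOM_BLEND_A64_MAX_ALPHA == 64
and AOM_BLEND_A64_ROUND_BITS == 6 as defined elsewhere in this tree (the
helper name is illustrative only):

  static uint8_t blend_a64_ref(uint8_t s0, uint8_t s1, int m) {
    // m is the mask value in [0, 64].
    return (uint8_t)((m * s0 + (64 - m) * s1 + 32) >> 6);
  }
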
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c
new file mode 100644
index 0000000000..dfbab324d0
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_a64_mask_avx2.c
@@ -0,0 +1,1374 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h> // SSE4.1
+#include <immintrin.h> // AVX2
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_dsp/x86/blend_sse4.h"
+#include "aom_dsp/x86/blend_mask_sse4.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void blend_a64_d16_mask_w16_avx2(
+ uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+ const __m256i *m0, const __m256i *v_round_offset, const __m256i *v_maxval,
+ int shift) {
+ const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
+ const __m256i s0_0 = yy_loadu_256(src0);
+ const __m256i s1_0 = yy_loadu_256(src1);
+ __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
+ _mm256_unpacklo_epi16(*m0, max_minus_m0));
+ __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
+ _mm256_unpackhi_epi16(*m0, max_minus_m0));
+ res0_lo =
+ _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
+ res0_hi =
+ _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
+ const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
+ __m256i res = _mm256_packus_epi16(res0, res0);
+ res = _mm256_permute4x64_epi64(res, 0xd8);
+ _mm_storeu_si128((__m128i *)(dst), _mm256_castsi256_si128(res));
+}
+
+static INLINE void blend_a64_d16_mask_w32_avx2(
+ uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+ const __m256i *m0, const __m256i *m1, const __m256i *v_round_offset,
+ const __m256i *v_maxval, int shift) {
+ const __m256i max_minus_m0 = _mm256_sub_epi16(*v_maxval, *m0);
+ const __m256i max_minus_m1 = _mm256_sub_epi16(*v_maxval, *m1);
+ const __m256i s0_0 = yy_loadu_256(src0);
+ const __m256i s0_1 = yy_loadu_256(src0 + 16);
+ const __m256i s1_0 = yy_loadu_256(src1);
+ const __m256i s1_1 = yy_loadu_256(src1 + 16);
+ __m256i res0_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_0, s1_0),
+ _mm256_unpacklo_epi16(*m0, max_minus_m0));
+ __m256i res0_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_0, s1_0),
+ _mm256_unpackhi_epi16(*m0, max_minus_m0));
+ __m256i res1_lo = _mm256_madd_epi16(_mm256_unpacklo_epi16(s0_1, s1_1),
+ _mm256_unpacklo_epi16(*m1, max_minus_m1));
+ __m256i res1_hi = _mm256_madd_epi16(_mm256_unpackhi_epi16(s0_1, s1_1),
+ _mm256_unpackhi_epi16(*m1, max_minus_m1));
+ res0_lo =
+ _mm256_srai_epi32(_mm256_sub_epi32(res0_lo, *v_round_offset), shift);
+ res0_hi =
+ _mm256_srai_epi32(_mm256_sub_epi32(res0_hi, *v_round_offset), shift);
+ res1_lo =
+ _mm256_srai_epi32(_mm256_sub_epi32(res1_lo, *v_round_offset), shift);
+ res1_hi =
+ _mm256_srai_epi32(_mm256_sub_epi32(res1_hi, *v_round_offset), shift);
+ const __m256i res0 = _mm256_packs_epi32(res0_lo, res0_hi);
+ const __m256i res1 = _mm256_packs_epi32(res1_lo, res1_hi);
+ __m256i res = _mm256_packus_epi16(res0, res1);
+ res = _mm256_permute4x64_epi64(res, 0xd8);
+ _mm256_storeu_si256((__m256i *)(dst), res);
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ for (int i = 0; i < h; ++i) {
+ const __m128i m = xx_loadu_128(mask);
+ const __m256i m0 = _mm256_cvtepu8_epi16(m);
+
+ blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 32) {
+ const __m256i m = yy_loadu_256(mask + j);
+ const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m));
+ const __m256i m1 = _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m, 1));
+
+ blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i one_b = _mm256_set1_epi8(1);
+ const __m256i two_w = _mm256_set1_epi16(2);
+ for (int i = 0; i < h; ++i) {
+ const __m256i m_i00 = yy_loadu_256(mask);
+ const __m256i m_i10 = yy_loadu_256(mask + mask_stride);
+
+ const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10);
+ const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b);
+ const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2);
+
+ blend_a64_d16_mask_w16_avx2(dst, src0, src1, &m0, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i one_b = _mm256_set1_epi8(1);
+ const __m256i two_w = _mm256_set1_epi16(2);
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 32) {
+ const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
+ const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32);
+ const __m256i m_i10 = yy_loadu_256(mask + mask_stride + 2 * j);
+ const __m256i m_i11 = yy_loadu_256(mask + mask_stride + 2 * j + 32);
+
+ const __m256i m0_ac = _mm256_adds_epu8(m_i00, m_i10);
+ const __m256i m1_ac = _mm256_adds_epu8(m_i01, m_i11);
+ const __m256i m0_acbd = _mm256_maddubs_epi16(m0_ac, one_b);
+ const __m256i m1_acbd = _mm256_maddubs_epi16(m1_ac, one_b);
+ const __m256i m0 = _mm256_srli_epi16(_mm256_add_epi16(m0_acbd, two_w), 2);
+ const __m256i m1 = _mm256_srli_epi16(_mm256_add_epi16(m1_acbd, two_w), 2);
+
+ blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i one_b = _mm256_set1_epi8(1);
+ const __m256i zeros = _mm256_setzero_si256();
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
+ const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b);
+ const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros);
+
+ blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i one_b = _mm256_set1_epi8(1);
+ const __m256i zeros = _mm256_setzero_si256();
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 32) {
+ const __m256i m_i00 = yy_loadu_256(mask + 2 * j);
+ const __m256i m_i01 = yy_loadu_256(mask + 2 * j + 32);
+ const __m256i m0_ac = _mm256_maddubs_epi16(m_i00, one_b);
+ const __m256i m1_ac = _mm256_maddubs_epi16(m_i01, one_b);
+ const __m256i m0 = _mm256_avg_epu16(m0_ac, zeros);
+ const __m256i m1 = _mm256_avg_epu16(m1_ac, zeros);
+
+ blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i zeros = _mm_setzero_si128();
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const __m128i m_i00 = xx_loadu_128(mask + j);
+ const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j);
+
+ const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros);
+ const __m256i m0 = _mm256_cvtepu8_epi16(m_ac);
+
+ blend_a64_d16_mask_w16_avx2(dst + j, src0 + j, src1 + j, &m0,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m256i *round_offset, int shift) {
+ const __m256i v_maxval = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i zeros = _mm256_setzero_si256();
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 32) {
+ const __m256i m_i00 = yy_loadu_256(mask + j);
+ const __m256i m_i10 = yy_loadu_256(mask + mask_stride + j);
+
+ const __m256i m_ac =
+ _mm256_avg_epu8(_mm256_adds_epu8(m_i00, m_i10), zeros);
+ const __m256i m0 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(m_ac));
+ const __m256i m1 =
+ _mm256_cvtepu8_epi16(_mm256_extracti128_si256(m_ac, 1));
+
+ blend_a64_d16_mask_w32_avx2(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+void aom_lowbd_blend_a64_d16_mask_avx2(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+
+ const int round_offset =
+ ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
+ (1 << (round_bits - 1)))
+ << AOM_BLEND_A64_ROUND_BITS;
+
+ const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
+ assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 4);
+ assert(w >= 4);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+ const __m128i v_round_offset = _mm_set1_epi32(round_offset);
+ const __m256i y_round_offset = _mm256_set1_epi32(round_offset);
+
+ if (subw == 0 && subh == 0) {
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 16:
+ lowbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &y_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw0_subh0_w32_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &y_round_offset, shift);
+ break;
+ }
+ } else if (subw == 1 && subh == 1) {
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 16:
+ lowbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &y_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw1_subh1_w32_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &y_round_offset, shift);
+ break;
+ }
+ } else if (subw == 1 && subh == 0) {
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 16:
+ lowbd_blend_a64_d16_mask_subw1_subh0_w16_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &y_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw1_subh0_w32_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &y_round_offset, shift);
+ break;
+ }
+ } else {
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 16:
+ lowbd_blend_a64_d16_mask_subw0_subh1_w16_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &y_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw0_subh1_w32_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &y_round_offset, shift);
+ break;
+ }
+ }
+}
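
In the d16 (compound prediction, CONV_BUF_TYPE) path above, each pixel is
blended in the intermediate-precision domain, the precomputed round_offset is
subtracted, and the result is shifted and clipped to 8 bits. A scalar sketch
with illustrative names, using the round_offset and shift derived in the
function above:

  static uint8_t blend_a64_d16_ref(int s0, int s1, int m, int round_offset,
                                   int shift) {
    int v = (m * s0 + (64 - m) * s1 - round_offset) >> shift;
    if (v < 0) v = 0;
    if (v > 255) v = 255;
    return (uint8_t)v;
  }
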
+
+static INLINE __m256i blend_16_u8_avx2(const uint8_t *src0, const uint8_t *src1,
+ const __m256i *v_m0_b,
+ const __m256i *v_m1_b,
+ const int32_t bits) {
+ const __m256i v_s0_b = _mm256_castsi128_si256(xx_loadu_128(src0));
+ const __m256i v_s1_b = _mm256_castsi128_si256(xx_loadu_128(src1));
+ const __m256i v_s0_s_b = _mm256_permute4x64_epi64(v_s0_b, 0xd8);
+ const __m256i v_s1_s_b = _mm256_permute4x64_epi64(v_s1_b, 0xd8);
+
+ const __m256i v_p0_w =
+ _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_s_b, v_s1_s_b),
+ _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
+
+ const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits);
+ const __m256i v_res_b = _mm256_packus_epi16(v_res0_w, v_res0_w);
+ const __m256i v_res = _mm256_permute4x64_epi64(v_res_b, 0xd8);
+ return v_res;
+}
+
+static INLINE __m256i blend_32_u8_avx2(const uint8_t *src0, const uint8_t *src1,
+ const __m256i *v_m0_b,
+ const __m256i *v_m1_b,
+ const int32_t bits) {
+ const __m256i v_s0_b = yy_loadu_256(src0);
+ const __m256i v_s1_b = yy_loadu_256(src1);
+
+ const __m256i v_p0_w =
+ _mm256_maddubs_epi16(_mm256_unpacklo_epi8(v_s0_b, v_s1_b),
+ _mm256_unpacklo_epi8(*v_m0_b, *v_m1_b));
+ const __m256i v_p1_w =
+ _mm256_maddubs_epi16(_mm256_unpackhi_epi8(v_s0_b, v_s1_b),
+ _mm256_unpackhi_epi8(*v_m0_b, *v_m1_b));
+
+ const __m256i v_res0_w = yy_roundn_epu16(v_p0_w, bits);
+ const __m256i v_res1_w = yy_roundn_epu16(v_p1_w, bits);
+ const __m256i v_res = _mm256_packus_epi16(v_res0_w, v_res1_w);
+ return v_res;
+}
+
+static INLINE void blend_a64_mask_sx_sy_w16_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h) {
+ const __m256i v_zmask_b = _mm256_set1_epi16(0xFF);
+ const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ do {
+ const __m256i v_ral_b = yy_loadu_256(mask);
+ const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride);
+ const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b);
+ const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b);
+ const __m256i v_rvsbl_w =
+ _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b);
+ const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w);
+
+ const __m256i v_m0_w = yy_roundn_epu16(v_rsl_w, 2);
+ const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, v_m0_w);
+ const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m256i y_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b,
+ AOM_BLEND_A64_ROUND_BITS);
+
+ xx_storeu_128(dst, _mm256_castsi256_si128(y_res_b));
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static INLINE void blend_a64_mask_sx_sy_w32n_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i v_zmask_b = _mm256_set1_epi16(0xFF);
+ do {
+ int c;
+ for (c = 0; c < w; c += 32) {
+ const __m256i v_ral_b = yy_loadu_256(mask + 2 * c);
+ const __m256i v_rah_b = yy_loadu_256(mask + 2 * c + 32);
+ const __m256i v_rbl_b = yy_loadu_256(mask + mask_stride + 2 * c);
+ const __m256i v_rbh_b = yy_loadu_256(mask + mask_stride + 2 * c + 32);
+ const __m256i v_rvsl_b = _mm256_add_epi8(v_ral_b, v_rbl_b);
+ const __m256i v_rvsh_b = _mm256_add_epi8(v_rah_b, v_rbh_b);
+ const __m256i v_rvsal_w = _mm256_and_si256(v_rvsl_b, v_zmask_b);
+ const __m256i v_rvsah_w = _mm256_and_si256(v_rvsh_b, v_zmask_b);
+ const __m256i v_rvsbl_w =
+ _mm256_and_si256(_mm256_srli_si256(v_rvsl_b, 1), v_zmask_b);
+ const __m256i v_rvsbh_w =
+ _mm256_and_si256(_mm256_srli_si256(v_rvsh_b, 1), v_zmask_b);
+ const __m256i v_rsl_w = _mm256_add_epi16(v_rvsal_w, v_rvsbl_w);
+ const __m256i v_rsh_w = _mm256_add_epi16(v_rvsah_w, v_rvsbh_w);
+
+ const __m256i v_m0l_w = yy_roundn_epu16(v_rsl_w, 2);
+ const __m256i v_m0h_w = yy_roundn_epu16(v_rsh_w, 2);
+ const __m256i v_m0_b =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(v_m0l_w, v_m0h_w), 0xd8);
+ const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m256i v_res_b = blend_32_u8_avx2(
+ src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
+
+ yy_storeu_256(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static INLINE void blend_a64_mask_sx_sy_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ switch (w) {
+ case 4:
+ do {
+ const __m128i v_ra_b = xx_loadl_64(mask);
+ const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
+ const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
+ const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
+ const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+ break;
+ case 8:
+ do {
+ const __m128i v_ra_b = xx_loadu_128(mask);
+ const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
+ const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
+ const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
+ const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+ break;
+ case 16:
+ blend_a64_mask_sx_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h);
+ break;
+ default:
+ blend_a64_mask_sx_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ break;
+ }
+}
+
+static INLINE void blend_a64_mask_sx_w16_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h) {
+ const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m256i v_zmask_b = _mm256_set1_epi16(0xff);
+ do {
+ const __m256i v_rl_b = yy_loadu_256(mask);
+ const __m256i v_al_b =
+ _mm256_avg_epu8(v_rl_b, _mm256_srli_si256(v_rl_b, 1));
+
+ const __m256i v_m0_w = _mm256_and_si256(v_al_b, v_zmask_b);
+ const __m256i v_m0_b = _mm256_packus_epi16(v_m0_w, _mm256_setzero_si256());
+ const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m256i v_res_b = blend_16_u8_avx2(src0, src1, &v_m0_b, &v_m1_b,
+ AOM_BLEND_A64_ROUND_BITS);
+
+ xx_storeu_128(dst, _mm256_castsi256_si128(v_res_b));
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static INLINE void blend_a64_mask_sx_w32n_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m256i v_shuffle_b = yy_loadu_256(g_blend_a64_mask_shuffle);
+ const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ do {
+ int c;
+ for (c = 0; c < w; c += 32) {
+ const __m256i v_r0_b = yy_loadu_256(mask + 2 * c);
+ const __m256i v_r1_b = yy_loadu_256(mask + 2 * c + 32);
+ const __m256i v_r0_s_b = _mm256_shuffle_epi8(v_r0_b, v_shuffle_b);
+ const __m256i v_r1_s_b = _mm256_shuffle_epi8(v_r1_b, v_shuffle_b);
+ const __m256i v_al_b =
+ _mm256_avg_epu8(v_r0_s_b, _mm256_srli_si256(v_r0_s_b, 8));
+ const __m256i v_ah_b =
+ _mm256_avg_epu8(v_r1_s_b, _mm256_srli_si256(v_r1_s_b, 8));
+
+ const __m256i v_m0_b =
+ _mm256_permute4x64_epi64(_mm256_unpacklo_epi64(v_al_b, v_ah_b), 0xd8);
+ const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m256i v_res_b = blend_32_u8_avx2(
+ src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
+
+ yy_storeu_256(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static INLINE void blend_a64_mask_sx_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
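+  // The blend_*_u8() helpers in blend_sse4.h multiply by this constant with
+  // _mm_mulhrs_epi16, which amounts to a rounded right shift by
+  // AOM_BLEND_A64_ROUND_BITS.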
+ switch (w) {
+ case 4:
+ do {
+ const __m128i v_r_b = xx_loadl_64(mask);
+ const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
+ const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+ break;
+ case 8:
+ do {
+ const __m128i v_r_b = xx_loadu_128(mask);
+ const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
+ const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+ break;
+ case 16:
+ blend_a64_mask_sx_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h);
+ break;
+ default:
+ blend_a64_mask_sx_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ break;
+ }
+}
+
+static INLINE void blend_a64_mask_sy_w16_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h) {
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ do {
+ const __m128i v_ra_b = xx_loadu_128(mask);
+ const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
+ const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
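+    // Per-word subtraction is safe here: every mask byte is at most
+    // AOM_BLEND_A64_MAX_ALPHA (64), so no borrow can cross a byte boundary.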
+    const __m128i v_m1_b = _mm_sub_epi16(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storeu_128(dst, v_res_b);
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static INLINE void blend_a64_mask_sy_w32n_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ do {
+ int c;
+ for (c = 0; c < w; c += 32) {
+ const __m256i v_ra_b = yy_loadu_256(mask + c);
+ const __m256i v_rb_b = yy_loadu_256(mask + c + mask_stride);
+ const __m256i v_m0_b = _mm256_avg_epu8(v_ra_b, v_rb_b);
+ const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+ const __m256i v_res_b = blend_32_u8_avx2(
+ src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
+
+ yy_storeu_256(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static INLINE void blend_a64_mask_sy_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ switch (w) {
+ case 4:
+ do {
+ const __m128i v_ra_b = xx_loadl_32(mask);
+ const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
+ const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+ break;
+ case 8:
+ do {
+ const __m128i v_ra_b = xx_loadl_64(mask);
+ const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
+ const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+ break;
+ case 16:
+ blend_a64_mask_sy_w16_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h);
+ break;
+ default:
+ blend_a64_mask_sy_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ }
+}
+
+static INLINE void blend_a64_mask_w32n_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m256i v_maxval_b = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ do {
+ int c;
+ for (c = 0; c < w; c += 32) {
+ const __m256i v_m0_b = yy_loadu_256(mask + c);
+ const __m256i v_m1_b = _mm256_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m256i v_res_b = blend_32_u8_avx2(
+ src0 + c, src1 + c, &v_m0_b, &v_m1_b, AOM_BLEND_A64_ROUND_BITS);
+
+ yy_storeu_256(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static INLINE void blend_a64_mask_avx2(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ switch (w) {
+ case 4:
+ do {
+ const __m128i v_m0_b = xx_loadl_32(mask);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+ break;
+ case 8:
+ do {
+ const __m128i v_m0_b = xx_loadl_64(mask);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+ break;
+ case 16:
+ do {
+ const __m128i v_m0_b = xx_loadu_128(mask);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_16_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storeu_128(dst, v_res_b);
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+ break;
+ default:
+ blend_a64_mask_w32n_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ }
+}
+
+void aom_blend_a64_mask_avx2(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w,
+ int h, int subw, int subh) {
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
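+  // Since w and h are powers of two, (h | w) & 3 is non-zero exactly when
+  // w <= 2 or h <= 2; those small sizes take the generic C path.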
+ if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
+ aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+ mask, mask_stride, w, h, subw, subh);
+ } else {
+ if (subw & subh) {
+ blend_a64_mask_sx_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ } else if (subw) {
+ blend_a64_mask_sx_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ } else if (subh) {
+ blend_a64_mask_sy_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h);
+ } else {
+ blend_a64_mask_avx2(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+ mask, mask_stride, w, h);
+ }
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// aom_highbd_blend_a64_d16_mask_avx2()
+//////////////////////////////////////////////////////////////////////////////
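+//
+// Blends two buffers of CONV_BUF_TYPE (16-bit convolve intermediates carrying
+// the compound-prediction offset) into a 16-bit pixel destination, weighted by
+// an alpha mask with values in [0, AOM_BLEND_A64_MAX_ALPHA].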
+
+static INLINE void highbd_blend_a64_d16_mask_w4_avx2(
+ uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0,
+ const __m256i *round_offset, int shift, const __m256i *clip_low,
+ const __m256i *clip_high, const __m256i *mask_max) {
+ // Load 4x u16 pixels from each of 4 rows from each source
+ const __m256i s0 = _mm256_set_epi64x(*(int64_t *)(src0 + 3 * src0_stride),
+ *(int64_t *)(src0 + 2 * src0_stride),
+ *(int64_t *)(src0 + 1 * src0_stride),
+ *(int64_t *)(src0 + 0 * src0_stride));
+ const __m256i s1 = _mm256_set_epi64x(*(int64_t *)(src1 + 3 * src1_stride),
+ *(int64_t *)(src1 + 2 * src1_stride),
+ *(int64_t *)(src1 + 1 * src1_stride),
+ *(int64_t *)(src1 + 0 * src1_stride));
+ // Generate the inverse mask
+ const __m256i mask1 = _mm256_sub_epi16(*mask_max, *mask0);
+
+ // Multiply each mask by the respective source
+ const __m256i mul0_highs = _mm256_mulhi_epu16(*mask0, s0);
+ const __m256i mul0_lows = _mm256_mullo_epi16(*mask0, s0);
+ const __m256i mul0h = _mm256_unpackhi_epi16(mul0_lows, mul0_highs);
+ const __m256i mul0l = _mm256_unpacklo_epi16(mul0_lows, mul0_highs);
+  // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within
+  // lanes. Later, packs does the same again, which cancels this out with no
+  // need for a permute. The intermediate values being reordered makes no
+  // difference.
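+  // For example, the *l products hold 64-bit quarters [2 0] of the result
+  // (elements 0-3 and 8-11) while the *h products hold quarters [3 1]
+  // (elements 4-7 and 12-15); packs_epi32 below re-interleaves within lanes,
+  // leaving the packed output in element order 0..15.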
+
+ const __m256i mul1_highs = _mm256_mulhi_epu16(mask1, s1);
+ const __m256i mul1_lows = _mm256_mullo_epi16(mask1, s1);
+ const __m256i mul1h = _mm256_unpackhi_epi16(mul1_lows, mul1_highs);
+ const __m256i mul1l = _mm256_unpacklo_epi16(mul1_lows, mul1_highs);
+
+ const __m256i sumh = _mm256_add_epi32(mul0h, mul1h);
+ const __m256i suml = _mm256_add_epi32(mul0l, mul1l);
+
+ const __m256i roundh =
+ _mm256_srai_epi32(_mm256_sub_epi32(sumh, *round_offset), shift);
+ const __m256i roundl =
+ _mm256_srai_epi32(_mm256_sub_epi32(suml, *round_offset), shift);
+
+ const __m256i pack = _mm256_packs_epi32(roundl, roundh);
+ const __m256i clip =
+ _mm256_min_epi16(_mm256_max_epi16(pack, *clip_low), *clip_high);
+
+  // _mm256_extract_epi64 is unavailable when building for 32-bit x86, so do
+  // it the old-fashioned way:
+ const __m128i cliph = _mm256_extracti128_si256(clip, 1);
+ xx_storel_64(dst + 3 * dst_stride, _mm_srli_si128(cliph, 8));
+ xx_storel_64(dst + 2 * dst_stride, cliph);
+ const __m128i clipl = _mm256_castsi256_si128(clip);
+ xx_storel_64(dst + 1 * dst_stride, _mm_srli_si128(clipl, 8));
+ xx_storel_64(dst + 0 * dst_stride, clipl);
+}
+
+static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w4_avx2(
+ uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m256i *round_offset, int shift, const __m256i *clip_low,
+ const __m256i *clip_high, const __m256i *mask_max) {
+ do {
+    // Load 4x u8 pixels from each of 4 rows of the mask, pad each to u16
+ const __m128i mask08 = _mm_set_epi32(*(int32_t *)(mask + 3 * mask_stride),
+ *(int32_t *)(mask + 2 * mask_stride),
+ *(int32_t *)(mask + 1 * mask_stride),
+ *(int32_t *)(mask + 0 * mask_stride));
+ const __m256i mask0 = _mm256_cvtepu8_epi16(mask08);
+
+ highbd_blend_a64_d16_mask_w4_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, &mask0, round_offset, shift,
+ clip_low, clip_high, mask_max);
+
+ dst += dst_stride * 4;
+ src0 += src0_stride * 4;
+ src1 += src1_stride * 4;
+ mask += mask_stride * 4;
+ } while (h -= 4);
+}
+
+static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w4_avx2(
+ uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m256i *round_offset, int shift, const __m256i *clip_low,
+ const __m256i *clip_high, const __m256i *mask_max) {
+ const __m256i one_b = _mm256_set1_epi8(1);
+ const __m256i two_w = _mm256_set1_epi16(2);
+ do {
+ // Load 8 pixels from each of 8 rows of mask,
+ // (saturating) add together rows then use madd to add adjacent pixels
+ // Finally, divide each value by 4 (with rounding)
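+    // i.e. each output mask value is
+    // (m[2i][2j] + m[2i][2j+1] + m[2i+1][2j] + m[2i+1][2j+1] + 2) >> 2.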
+ const __m256i m0246 =
+ _mm256_set_epi64x(*(int64_t *)(mask + 6 * mask_stride),
+ *(int64_t *)(mask + 4 * mask_stride),
+ *(int64_t *)(mask + 2 * mask_stride),
+ *(int64_t *)(mask + 0 * mask_stride));
+ const __m256i m1357 =
+ _mm256_set_epi64x(*(int64_t *)(mask + 7 * mask_stride),
+ *(int64_t *)(mask + 5 * mask_stride),
+ *(int64_t *)(mask + 3 * mask_stride),
+ *(int64_t *)(mask + 1 * mask_stride));
+ const __m256i addrows = _mm256_adds_epu8(m0246, m1357);
+ const __m256i adjacent = _mm256_maddubs_epi16(addrows, one_b);
+ const __m256i mask0 =
+ _mm256_srli_epi16(_mm256_add_epi16(adjacent, two_w), 2);
+
+ highbd_blend_a64_d16_mask_w4_avx2(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, &mask0, round_offset, shift,
+ clip_low, clip_high, mask_max);
+
+ dst += dst_stride * 4;
+ src0 += src0_stride * 4;
+ src1 += src1_stride * 4;
+ mask += mask_stride * 8;
+ } while (h -= 4);
+}
+
+static INLINE void highbd_blend_a64_d16_mask_w8_avx2(
+ uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0a,
+ const __m256i *mask0b, const __m256i *round_offset, int shift,
+ const __m256i *clip_low, const __m256i *clip_high,
+ const __m256i *mask_max) {
+ // Load 8x u16 pixels from each of 4 rows from each source
+ const __m256i s0a =
+ yy_loadu2_128(src0 + 0 * src0_stride, src0 + 1 * src0_stride);
+ const __m256i s0b =
+ yy_loadu2_128(src0 + 2 * src0_stride, src0 + 3 * src0_stride);
+ const __m256i s1a =
+ yy_loadu2_128(src1 + 0 * src1_stride, src1 + 1 * src1_stride);
+ const __m256i s1b =
+ yy_loadu2_128(src1 + 2 * src1_stride, src1 + 3 * src1_stride);
+
+ // Generate inverse masks
+ const __m256i mask1a = _mm256_sub_epi16(*mask_max, *mask0a);
+ const __m256i mask1b = _mm256_sub_epi16(*mask_max, *mask0b);
+
+ // Multiply sources by respective masks
+ const __m256i mul0a_highs = _mm256_mulhi_epu16(*mask0a, s0a);
+ const __m256i mul0a_lows = _mm256_mullo_epi16(*mask0a, s0a);
+ const __m256i mul0ah = _mm256_unpackhi_epi16(mul0a_lows, mul0a_highs);
+ const __m256i mul0al = _mm256_unpacklo_epi16(mul0a_lows, mul0a_highs);
+  // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within
+  // lanes. Later, packs does the same again, which cancels this out with no
+  // need for a permute. The intermediate values being reordered makes no
+  // difference.
+
+ const __m256i mul1a_highs = _mm256_mulhi_epu16(mask1a, s1a);
+ const __m256i mul1a_lows = _mm256_mullo_epi16(mask1a, s1a);
+ const __m256i mul1ah = _mm256_unpackhi_epi16(mul1a_lows, mul1a_highs);
+ const __m256i mul1al = _mm256_unpacklo_epi16(mul1a_lows, mul1a_highs);
+
+ const __m256i sumah = _mm256_add_epi32(mul0ah, mul1ah);
+ const __m256i sumal = _mm256_add_epi32(mul0al, mul1al);
+
+ const __m256i mul0b_highs = _mm256_mulhi_epu16(*mask0b, s0b);
+ const __m256i mul0b_lows = _mm256_mullo_epi16(*mask0b, s0b);
+ const __m256i mul0bh = _mm256_unpackhi_epi16(mul0b_lows, mul0b_highs);
+ const __m256i mul0bl = _mm256_unpacklo_epi16(mul0b_lows, mul0b_highs);
+
+ const __m256i mul1b_highs = _mm256_mulhi_epu16(mask1b, s1b);
+ const __m256i mul1b_lows = _mm256_mullo_epi16(mask1b, s1b);
+ const __m256i mul1bh = _mm256_unpackhi_epi16(mul1b_lows, mul1b_highs);
+ const __m256i mul1bl = _mm256_unpacklo_epi16(mul1b_lows, mul1b_highs);
+
+ const __m256i sumbh = _mm256_add_epi32(mul0bh, mul1bh);
+ const __m256i sumbl = _mm256_add_epi32(mul0bl, mul1bl);
+
+ // Divide down each result, with rounding
+ const __m256i roundah =
+ _mm256_srai_epi32(_mm256_sub_epi32(sumah, *round_offset), shift);
+ const __m256i roundal =
+ _mm256_srai_epi32(_mm256_sub_epi32(sumal, *round_offset), shift);
+ const __m256i roundbh =
+ _mm256_srai_epi32(_mm256_sub_epi32(sumbh, *round_offset), shift);
+ const __m256i roundbl =
+ _mm256_srai_epi32(_mm256_sub_epi32(sumbl, *round_offset), shift);
+
+ // Pack each i32 down to an i16 with saturation, then clip to valid range
+ const __m256i packa = _mm256_packs_epi32(roundal, roundah);
+ const __m256i clipa =
+ _mm256_min_epi16(_mm256_max_epi16(packa, *clip_low), *clip_high);
+ const __m256i packb = _mm256_packs_epi32(roundbl, roundbh);
+ const __m256i clipb =
+ _mm256_min_epi16(_mm256_max_epi16(packb, *clip_low), *clip_high);
+
+ // Store 8x u16 pixels to each of 4 rows in the destination
+ yy_storeu2_128(dst + 0 * dst_stride, dst + 1 * dst_stride, clipa);
+ yy_storeu2_128(dst + 2 * dst_stride, dst + 3 * dst_stride, clipb);
+}
+
+static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w8_avx2(
+ uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask,
+ int mask_stride, int h, const __m256i *round_offset, int shift,
+ const __m256i *clip_low, const __m256i *clip_high,
+ const __m256i *mask_max) {
+ do {
+ // Load 8x u8 pixels from each of 4 rows in the mask
+ const __m128i mask0a8 =
+        _mm_set_epi64x(*(int64_t *)mask, *(int64_t *)(mask + mask_stride));
+ const __m128i mask0b8 =
+ _mm_set_epi64x(*(int64_t *)(mask + 2 * mask_stride),
+ *(int64_t *)(mask + 3 * mask_stride));
+ const __m256i mask0a = _mm256_cvtepu8_epi16(mask0a8);
+ const __m256i mask0b = _mm256_cvtepu8_epi16(mask0b8);
+
+ highbd_blend_a64_d16_mask_w8_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b,
+ round_offset, shift, clip_low, clip_high, mask_max);
+
+ dst += dst_stride * 4;
+ src0 += src0_stride * 4;
+ src1 += src1_stride * 4;
+ mask += mask_stride * 4;
+ } while (h -= 4);
+}
+
+static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w8_avx2(
+ uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask,
+ int mask_stride, int h, const __m256i *round_offset, int shift,
+ const __m256i *clip_low, const __m256i *clip_high,
+ const __m256i *mask_max) {
+ const __m256i one_b = _mm256_set1_epi8(1);
+ const __m256i two_w = _mm256_set1_epi16(2);
+ do {
+ // Load 16x u8 pixels from each of 8 rows in the mask,
+ // (saturating) add together rows then use madd to add adjacent pixels
+ // Finally, divide each value by 4 (with rounding)
+ const __m256i m02 =
+ yy_loadu2_128(mask + 0 * mask_stride, mask + 2 * mask_stride);
+ const __m256i m13 =
+ yy_loadu2_128(mask + 1 * mask_stride, mask + 3 * mask_stride);
+ const __m256i m0123 =
+ _mm256_maddubs_epi16(_mm256_adds_epu8(m02, m13), one_b);
+ const __m256i mask_0a =
+ _mm256_srli_epi16(_mm256_add_epi16(m0123, two_w), 2);
+ const __m256i m46 =
+ yy_loadu2_128(mask + 4 * mask_stride, mask + 6 * mask_stride);
+ const __m256i m57 =
+ yy_loadu2_128(mask + 5 * mask_stride, mask + 7 * mask_stride);
+ const __m256i m4567 =
+ _mm256_maddubs_epi16(_mm256_adds_epu8(m46, m57), one_b);
+ const __m256i mask_0b =
+ _mm256_srli_epi16(_mm256_add_epi16(m4567, two_w), 2);
+
+ highbd_blend_a64_d16_mask_w8_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_0a,
+ &mask_0b, round_offset, shift, clip_low, clip_high, mask_max);
+
+ dst += dst_stride * 4;
+ src0 += src0_stride * 4;
+ src1 += src1_stride * 4;
+ mask += mask_stride * 8;
+ } while (h -= 4);
+}
+
+static INLINE void highbd_blend_a64_d16_mask_w16_avx2(
+ uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride, const __m256i *mask0a,
+ const __m256i *mask0b, const __m256i *round_offset, int shift,
+ const __m256i *clip_low, const __m256i *clip_high,
+ const __m256i *mask_max) {
+  // Load 16x u16 pixels from each of 2 rows from each source
+ const __m256i s0a = yy_loadu_256(src0);
+ const __m256i s0b = yy_loadu_256(src0 + src0_stride);
+ const __m256i s1a = yy_loadu_256(src1);
+ const __m256i s1b = yy_loadu_256(src1 + src1_stride);
+
+ // Calculate inverse masks
+ const __m256i mask1a = _mm256_sub_epi16(*mask_max, *mask0a);
+ const __m256i mask1b = _mm256_sub_epi16(*mask_max, *mask0b);
+
+ // Multiply each source by appropriate mask
+ const __m256i mul0a_highs = _mm256_mulhi_epu16(*mask0a, s0a);
+ const __m256i mul0a_lows = _mm256_mullo_epi16(*mask0a, s0a);
+ const __m256i mul0ah = _mm256_unpackhi_epi16(mul0a_lows, mul0a_highs);
+ const __m256i mul0al = _mm256_unpacklo_epi16(mul0a_lows, mul0a_highs);
+  // Note that AVX2 unpack orders 64-bit words as [3 1] [2 0] to keep within
+  // lanes. Later, packs does the same again, which cancels this out with no
+  // need for a permute. The intermediate values being reordered makes no
+  // difference.
+
+ const __m256i mul1a_highs = _mm256_mulhi_epu16(mask1a, s1a);
+ const __m256i mul1a_lows = _mm256_mullo_epi16(mask1a, s1a);
+ const __m256i mul1ah = _mm256_unpackhi_epi16(mul1a_lows, mul1a_highs);
+ const __m256i mul1al = _mm256_unpacklo_epi16(mul1a_lows, mul1a_highs);
+
+ const __m256i mulah = _mm256_add_epi32(mul0ah, mul1ah);
+ const __m256i mulal = _mm256_add_epi32(mul0al, mul1al);
+
+ const __m256i mul0b_highs = _mm256_mulhi_epu16(*mask0b, s0b);
+ const __m256i mul0b_lows = _mm256_mullo_epi16(*mask0b, s0b);
+ const __m256i mul0bh = _mm256_unpackhi_epi16(mul0b_lows, mul0b_highs);
+ const __m256i mul0bl = _mm256_unpacklo_epi16(mul0b_lows, mul0b_highs);
+
+ const __m256i mul1b_highs = _mm256_mulhi_epu16(mask1b, s1b);
+ const __m256i mul1b_lows = _mm256_mullo_epi16(mask1b, s1b);
+ const __m256i mul1bh = _mm256_unpackhi_epi16(mul1b_lows, mul1b_highs);
+ const __m256i mul1bl = _mm256_unpacklo_epi16(mul1b_lows, mul1b_highs);
+
+ const __m256i mulbh = _mm256_add_epi32(mul0bh, mul1bh);
+ const __m256i mulbl = _mm256_add_epi32(mul0bl, mul1bl);
+
+ const __m256i resah =
+ _mm256_srai_epi32(_mm256_sub_epi32(mulah, *round_offset), shift);
+ const __m256i resal =
+ _mm256_srai_epi32(_mm256_sub_epi32(mulal, *round_offset), shift);
+ const __m256i resbh =
+ _mm256_srai_epi32(_mm256_sub_epi32(mulbh, *round_offset), shift);
+ const __m256i resbl =
+ _mm256_srai_epi32(_mm256_sub_epi32(mulbl, *round_offset), shift);
+
+ // Signed saturating pack from i32 to i16:
+ const __m256i packa = _mm256_packs_epi32(resal, resah);
+ const __m256i packb = _mm256_packs_epi32(resbl, resbh);
+
+ // Clip the values to the valid range
+ const __m256i clipa =
+ _mm256_min_epi16(_mm256_max_epi16(packa, *clip_low), *clip_high);
+ const __m256i clipb =
+ _mm256_min_epi16(_mm256_max_epi16(packb, *clip_low), *clip_high);
+
+ // Store 16 pixels
+ yy_storeu_256(dst, clipa);
+ yy_storeu_256(dst + dst_stride, clipb);
+}
+
+static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
+ uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask,
+ int mask_stride, int h, int w, const __m256i *round_offset, int shift,
+ const __m256i *clip_low, const __m256i *clip_high,
+ const __m256i *mask_max) {
+ for (int i = 0; i < h; i += 2) {
+ for (int j = 0; j < w; j += 16) {
+ // Load 16x u8 alpha-mask values from each of two rows and pad to u16
+ const __m128i masks_a8 = xx_loadu_128(mask + j);
+ const __m128i masks_b8 = xx_loadu_128(mask + mask_stride + j);
+ const __m256i mask0a = _mm256_cvtepu8_epi16(masks_a8);
+ const __m256i mask0b = _mm256_cvtepu8_epi16(masks_b8);
+
+ highbd_blend_a64_d16_mask_w16_avx2(
+ dst + j, dst_stride, src0 + j, src0_stride, src1 + j, src1_stride,
+ &mask0a, &mask0b, round_offset, shift, clip_low, clip_high, mask_max);
+ }
+ dst += dst_stride * 2;
+ src0 += src0_stride * 2;
+ src1 += src1_stride * 2;
+ mask += mask_stride * 2;
+ }
+}
+
+static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
+ uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride, const uint8_t *mask,
+ int mask_stride, int h, int w, const __m256i *round_offset, int shift,
+ const __m256i *clip_low, const __m256i *clip_high,
+ const __m256i *mask_max) {
+ const __m256i one_b = _mm256_set1_epi8(1);
+ const __m256i two_w = _mm256_set1_epi16(2);
+ for (int i = 0; i < h; i += 2) {
+ for (int j = 0; j < w; j += 16) {
+ // Load 32x u8 alpha-mask values from each of four rows
+ // (saturating) add pairs of rows, then use madd to add adjacent values
+ // Finally, divide down each result with rounding
+ const __m256i m0 = yy_loadu_256(mask + 0 * mask_stride + 2 * j);
+ const __m256i m1 = yy_loadu_256(mask + 1 * mask_stride + 2 * j);
+ const __m256i m2 = yy_loadu_256(mask + 2 * mask_stride + 2 * j);
+ const __m256i m3 = yy_loadu_256(mask + 3 * mask_stride + 2 * j);
+
+ const __m256i m01_8 = _mm256_adds_epu8(m0, m1);
+ const __m256i m23_8 = _mm256_adds_epu8(m2, m3);
+
+ const __m256i m01 = _mm256_maddubs_epi16(m01_8, one_b);
+ const __m256i m23 = _mm256_maddubs_epi16(m23_8, one_b);
+
+ const __m256i mask0a = _mm256_srli_epi16(_mm256_add_epi16(m01, two_w), 2);
+ const __m256i mask0b = _mm256_srli_epi16(_mm256_add_epi16(m23, two_w), 2);
+
+ highbd_blend_a64_d16_mask_w16_avx2(
+ dst + j, dst_stride, src0 + j, src0_stride, src1 + j, src1_stride,
+ &mask0a, &mask0b, round_offset, shift, clip_low, clip_high, mask_max);
+ }
+ dst += dst_stride * 2;
+ src0 += src0_stride * 2;
+ src1 += src1_stride * 2;
+ mask += mask_stride * 4;
+ }
+}
+
+void aom_highbd_blend_a64_d16_mask_avx2(
+ uint8_t *dst8, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+ ConvolveParams *conv_params, const int bd) {
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int32_t round_offset =
+ ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
+ (1 << (round_bits - 1)))
+ << AOM_BLEND_A64_ROUND_BITS;
+ const __m256i v_round_offset = _mm256_set1_epi32(round_offset);
+ const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
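+  // The d16 inputs carry an offset of
+  // (1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)); blending scales
+  // it by AOM_BLEND_A64_MAX_ALPHA (== 1 << AOM_BLEND_A64_ROUND_BITS), so
+  // subtracting round_offset before the arithmetic shift both removes that
+  // offset and rounds the result to nearest.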
+
+ const __m256i clip_low = _mm256_setzero_si256();
+ const __m256i clip_high = _mm256_set1_epi16((1 << bd) - 1);
+ const __m256i mask_max = _mm256_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 4);
+ assert(w >= 4);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ if (subw == 0 && subh == 0) {
+ switch (w) {
+ case 4:
+ highbd_blend_a64_d16_mask_subw0_subh0_w4_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
+ &mask_max);
+ break;
+ case 8:
+ highbd_blend_a64_d16_mask_subw0_subh0_w8_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
+ &mask_max);
+ break;
+ default: // >= 16
+ highbd_blend_a64_d16_mask_subw0_subh0_w16_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high,
+ &mask_max);
+ break;
+ }
+
+ } else if (subw == 1 && subh == 1) {
+ switch (w) {
+ case 4:
+ highbd_blend_a64_d16_mask_subw1_subh1_w4_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
+ &mask_max);
+ break;
+ case 8:
+ highbd_blend_a64_d16_mask_subw1_subh1_w8_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
+ &mask_max);
+ break;
+ default: // >= 16
+ highbd_blend_a64_d16_mask_subw1_subh1_w16_avx2(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high,
+ &mask_max);
+ break;
+ }
+ } else {
+ // Sub-sampling in only one axis doesn't seem to happen very much, so fall
+ // back to the vanilla C implementation instead of having all the optimised
+ // code for these.
+ aom_highbd_blend_a64_d16_mask_c(dst8, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h, subw,
+ subh, conv_params, bd);
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
new file mode 100644
index 0000000000..58a7345ec2
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_a64_mask_sse4.c
@@ -0,0 +1,1560 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h> // SSE4.1
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/blend_sse4.h"
+#include "aom_dsp/x86/blend_mask_sse4.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+//////////////////////////////////////////////////////////////////////////////
+// No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int w, int h) {
+ (void)w;
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ do {
+ const __m128i v_m0_b = xx_loadl_32(mask);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int w, int h) {
+ (void)w;
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ do {
+ const __m128i v_m0_b = xx_loadl_64(mask);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_w16n_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_m0_b = xx_loadu_128(mask + c);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b =
+ blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static void blend_a64_mask_sx_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ do {
+ const __m128i v_r_b = xx_loadl_64(mask);
+ const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
+ const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_sx_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ do {
+ const __m128i v_r_b = xx_loadu_128(mask);
+ const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b);
+ const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b);
+ const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_sx_w16n_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_r0_b = xx_loadu_128(mask + 2 * c);
+ const __m128i v_r1_b = xx_loadu_128(mask + 2 * c + 16);
+ const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r0_b, v_shuffle_b);
+ const __m128i v_r1_s_b = _mm_shuffle_epi8(v_r1_b, v_shuffle_b);
+ const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r1_s_b);
+ const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r1_s_b);
+ const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b =
+ blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static void blend_a64_mask_sy_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+
+ do {
+ const __m128i v_ra_b = xx_loadl_32(mask);
+ const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
+ const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_sy_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ do {
+ const __m128i v_ra_b = xx_loadl_64(mask);
+ const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
+ const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_sy_w16n_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ do {
+ int c;
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_ra_b = xx_loadu_128(mask + c);
+ const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride);
+ const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b =
+ blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal and Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static void blend_a64_mask_sx_sy_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ (void)w;
+
+ do {
+ const __m128i v_ra_b = xx_loadl_64(mask);
+ const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
+ const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
+ const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
+ const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_sx_sy_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle);
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ (void)w;
+
+ do {
+ const __m128i v_ra_b = xx_loadu_128(mask);
+ const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
+
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b);
+ const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b);
+ const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8));
+ const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w);
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_sx_sy_w16n_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ const __m128i v_zmask_b =
+ _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
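+  // 0x00ff in every 16-bit word: used below to isolate the even- and
+  // odd-column halves of the vertically summed mask rows before averaging.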
+ const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ do {
+ int c;
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_ral_b = xx_loadu_128(mask + 2 * c);
+ const __m128i v_rah_b = xx_loadu_128(mask + 2 * c + 16);
+ const __m128i v_rbl_b = xx_loadu_128(mask + mask_stride + 2 * c);
+ const __m128i v_rbh_b = xx_loadu_128(mask + mask_stride + 2 * c + 16);
+ const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b);
+ const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b);
+ const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b);
+ const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b);
+ const __m128i v_rvsbl_w =
+ _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1), v_zmask_b);
+ const __m128i v_rvsbh_w =
+ _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1), v_zmask_b);
+ const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w);
+ const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w);
+
+ const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2);
+ const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2);
+ const __m128i v_m0_b = _mm_packus_epi16(v_m0l_w, v_m0h_w);
+ const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b);
+
+ const __m128i v_res_b =
+ blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r);
+
+ xx_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w,
+ int h, int subw, int subh) {
+ typedef void (*blend_fn)(
+ uint8_t * dst, uint32_t dst_stride, const uint8_t *src0,
+ uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h);
+
+ // Dimensions are: width_index X subx X suby
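+  // (w >> 2) & 3 maps w == 4 -> 1, w == 8 -> 2, and any other valid width
+  // (a power of two >= 16, so (w >> 2) is a multiple of 4) -> 0.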
+ static const blend_fn blend[3][2][2] = {
+ { // w % 16 == 0
+ { blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1 },
+ { blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1 } },
+ { // w == 4
+ { blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1 },
+ { blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1 } },
+ { // w == 8
+ { blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1 },
+ { blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1 } }
+ };
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
+ aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride,
+ mask, mask_stride, w, h, subw, subh);
+ } else {
+ blend[(w >> 2) & 3][subw != 0][subh != 0](dst, dst_stride, src0,
+ src0_stride, src1, src1_stride,
+ mask, mask_stride, w, h);
+ }
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_a64_mask_bn_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ const __m128i v_m0_b = xx_loadl_32(mask);
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ xx_storel_64(dst, v_res_w);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+ blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, blend_4_b10);
+}
+
+static void blend_a64_mask_b12_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+ blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, blend_4_b12);
+}
+
+static INLINE void blend_a64_mask_bn_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h,
+ blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_m0_b = xx_loadl_64(mask + c);
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ xx_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ blend_8_b10);
+}
+
+static void blend_a64_mask_b12_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_a64_mask_bn_sx_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
+ const __m128i v_zmask_b =
+ _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ const __m128i v_r_b = xx_loadl_64(mask);
+ const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
+
+ const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ xx_storel_64(dst, v_res_w);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_sx_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+ blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b10);
+}
+
+static void blend_a64_mask_b12_sx_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+ blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b12);
+}
+
+static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h,
+ blend_unit_fn blend) {
+ const __m128i v_zmask_b =
+ _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_r_b = xx_loadu_128(mask + 2 * c);
+ const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
+
+ const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ xx_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_sx_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ blend_8_b10);
+}
+
+static void blend_a64_mask_b12_sx_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_a64_mask_bn_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ const __m128i v_ra_b = xx_loadl_32(mask);
+ const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
+ const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ xx_storel_64(dst, v_res_w);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+ blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b10);
+}
+
+static void blend_a64_mask_b12_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+ blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b12);
+}
+
+static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h,
+ blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_ra_b = xx_loadl_64(mask + c);
+ const __m128i v_rb_b = xx_loadl_64(mask + c + mask_stride);
+ const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ xx_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ blend_8_b10);
+}
+
+static void blend_a64_mask_b12_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal and Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) {
+ const __m128i v_zmask_b =
+ _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ const __m128i v_ra_b = xx_loadl_64(mask);
+ const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
+ const __m128i v_rvsb_w =
+ _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
+ const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ xx_storel_64(dst, v_res_w);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_sx_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+ blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b10);
+}
+
+static void blend_a64_mask_b12_sx_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ (void)w;
+ blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b12);
+}
+
+static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h,
+ blend_unit_fn blend) {
+ const __m128i v_zmask_b =
+ _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1);
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_ra_b = xx_loadu_128(mask + 2 * c);
+ const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride);
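+      // Same rounded 2x2 mask average as in the w4 kernel above.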
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
+ const __m128i v_rvsb_w =
+ _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b);
+ const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+
+ const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ xx_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_a64_mask_b10_sx_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ blend_8_b10);
+}
+
+static void blend_a64_mask_b12_sx_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h) {
+ blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h,
+ blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8,
+ uint32_t src0_stride,
+ const uint8_t *src1_8,
+ uint32_t src1_stride, const uint8_t *mask,
+ uint32_t mask_stride, int w, int h,
+ int subw, int subh, int bd) {
+ typedef void (*blend_fn)(
+ uint16_t * dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h);
+
+ // Dimensions are: bd_index X width_index X subw X subh
+ static const blend_fn blend[2][2][2][2] = {
+ { // bd == 8 or 10
+ { // w % 8 == 0
+ { blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1 },
+ { blend_a64_mask_b10_sx_w8n_sse4_1,
+ blend_a64_mask_b10_sx_sy_w8n_sse4_1 } },
+ { // w == 4
+ { blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1 },
+ { blend_a64_mask_b10_sx_w4_sse4_1,
+ blend_a64_mask_b10_sx_sy_w4_sse4_1 } } },
+ { // bd == 12
+ { // w % 8 == 0
+ { blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1 },
+ { blend_a64_mask_b12_sx_w8n_sse4_1,
+ blend_a64_mask_b12_sx_sy_w8n_sse4_1 } },
+ { // w == 4
+ { blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1 },
+ { blend_a64_mask_b12_sx_w4_sse4_1,
+ blend_a64_mask_b12_sx_sy_w4_sse4_1 } } }
+ };
+
+ assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
+ assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+ if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
+ aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+ src1_stride, mask, mask_stride, w, h, subw,
+ subh, bd);
+ } else {
+ uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
+ const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
+
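+    // (w >> 2) & 1 is 1 only when w == 4; wider (multiple-of-8) widths
+    // select the w8n kernels.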
+ blend[bd == 12][(w >> 2) & 1][subw != 0][subh != 0](
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, w, h);
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+static INLINE void blend_a64_d16_mask_w16_sse41(
+ uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+ const __m128i *m0, const __m128i *m1, const __m128i *v_round_offset,
+ const __m128i *v_maxval, int shift) {
+ const __m128i max_minus_m0 = _mm_sub_epi16(*v_maxval, *m0);
+ const __m128i max_minus_m1 = _mm_sub_epi16(*v_maxval, *m1);
+ const __m128i s0_0 = xx_loadu_128(src0);
+ const __m128i s0_1 = xx_loadu_128(src0 + 8);
+ const __m128i s1_0 = xx_loadu_128(src1);
+ const __m128i s1_1 = xx_loadu_128(src1 + 8);
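+  // Interleaving src0/src1 with m/(max - m) lets madd compute
+  // src0 * m + src1 * (max - m) for each pixel as a 32-bit sum.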
+ __m128i res0_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_0, s1_0),
+ _mm_unpacklo_epi16(*m0, max_minus_m0));
+ __m128i res0_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_0, s1_0),
+ _mm_unpackhi_epi16(*m0, max_minus_m0));
+ __m128i res1_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_1, s1_1),
+ _mm_unpacklo_epi16(*m1, max_minus_m1));
+ __m128i res1_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_1, s1_1),
+ _mm_unpackhi_epi16(*m1, max_minus_m1));
+ res0_lo = _mm_srai_epi32(_mm_sub_epi32(res0_lo, *v_round_offset), shift);
+ res0_hi = _mm_srai_epi32(_mm_sub_epi32(res0_hi, *v_round_offset), shift);
+ res1_lo = _mm_srai_epi32(_mm_sub_epi32(res1_lo, *v_round_offset), shift);
+ res1_hi = _mm_srai_epi32(_mm_sub_epi32(res1_hi, *v_round_offset), shift);
+ const __m128i res0 = _mm_packs_epi32(res0_lo, res0_hi);
+ const __m128i res1 = _mm_packs_epi32(res1_lo, res1_hi);
+ const __m128i res = _mm_packus_epi16(res0, res1);
+
+ _mm_storeu_si128((__m128i *)(dst), res);
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const __m128i m = xx_loadu_128(mask + j);
+ const __m128i m0 = _mm_cvtepu8_epi16(m);
+ const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m, 8));
+
+ blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i two_w = _mm_set1_epi16(2);
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
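+      // 2x2 mask average: saturating-add the two rows, sum adjacent pairs
+      // with maddubs, then (x + 2) >> 2 to divide by 4 with rounding.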
+ const __m128i m_i00 = xx_loadu_128(mask + 2 * j);
+ const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
+ const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j);
+ const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16);
+
+ const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10);
+ const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11);
+ const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b);
+ const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b);
+ const __m128i m0 = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2);
+ const __m128i m1 = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2);
+
+ blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i zeros = _mm_setzero_si128();
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const __m128i m_i00 = xx_loadu_128(mask + 2 * j);
+ const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
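+      // Horizontal sub-sampling only: maddubs sums adjacent mask pairs,
+      // then averaging with zero halves the result with rounding.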
+ const __m128i m0_ac = _mm_maddubs_epi16(m_i00, one_b);
+ const __m128i m1_ac = _mm_maddubs_epi16(m_i01, one_b);
+ const __m128i m0 = _mm_avg_epu16(m0_ac, zeros);
+ const __m128i m1 = _mm_avg_epu16(m1_ac, zeros);
+
+ blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i zeros = _mm_setzero_si128();
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < w; j += 16) {
+ const __m128i m_i00 = xx_loadu_128(mask + j);
+ const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j);
+
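+      // Vertical sub-sampling only: saturating-add the two rows, then
+      // average with zero to halve with rounding.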
+ const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros);
+ const __m128i m0 = _mm_cvtepu8_epi16(m_ac);
+ const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m_ac, 8));
+
+ blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1,
+ round_offset, &v_maxval, shift);
+ }
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+void aom_lowbd_blend_a64_d16_mask_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+ ConvolveParams *conv_params) {
+ const int bd = 8;
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+
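+  // Removes the offset carried by the intermediate convolution buffers and
+  // folds in the rounding term for the final shift; pre-scaled by
+  // AOM_BLEND_A64_ROUND_BITS because the blend weights add that many bits.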
+ const int round_offset =
+ ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
+ (1 << (round_bits - 1)))
+ << AOM_BLEND_A64_ROUND_BITS;
+
+ const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
+ assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 4);
+ assert(w >= 4);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ const __m128i v_round_offset = _mm_set1_epi32(round_offset);
+
+ if (subw == 0 && subh == 0) {
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift);
+ break;
+ }
+
+ } else if (subw == 1 && subh == 1) {
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift);
+ break;
+ }
+ } else if (subw == 1 && subh == 0) {
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift);
+ break;
+ }
+ } else {
+ switch (w) {
+ case 4:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ case 8:
+ aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift);
+ break;
+ default:
+ lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift);
+ break;
+ }
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// aom_highbd_blend_a64_d16_mask_sse4_1()
+//////////////////////////////////////////////////////////////////////////////
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void highbd_blend_a64_d16_mask_w4_sse4_1(
+ uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride, const __m128i *mask0a,
+ const __m128i *mask0b, const __m128i *round_offset, int shift,
+ const __m128i *clip_low, const __m128i *clip_high,
+ const __m128i *mask_max) {
+  // Load 4 pixels from each of 4 rows of each source
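+  // (_mm_set_epi64x places its first argument in the upper 64 bits, so each
+  // register holds rows (N, N + 1) as (high, low); the stores at the end of
+  // this function account for that ordering.)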
+ const __m128i s0a =
+ _mm_set_epi64x(*(int64_t *)src0, *(int64_t *)(src0 + src0_stride));
+ const __m128i s0b = _mm_set_epi64x(*(int64_t *)(src0 + 2 * src0_stride),
+ *(int64_t *)(src0 + 3 * src0_stride));
+ const __m128i s1a =
+ _mm_set_epi64x(*(int64_t *)(src1), *(int64_t *)(src1 + src1_stride));
+ const __m128i s1b = _mm_set_epi64x(*(int64_t *)(src1 + 2 * src1_stride),
+ *(int64_t *)(src1 + 3 * src1_stride));
+
+ // Generate the inverse masks
+ const __m128i mask1a = _mm_sub_epi16(*mask_max, *mask0a);
+ const __m128i mask1b = _mm_sub_epi16(*mask_max, *mask0b);
+
+ // Multiply each mask by the respective source
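+  // (mulhi/mullo give the high and low 16 bits of each 16x16-bit product;
+  // interleaving them reconstructs the full 32-bit products.)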
+ const __m128i mul0a_highs = _mm_mulhi_epu16(*mask0a, s0a);
+ const __m128i mul0a_lows = _mm_mullo_epi16(*mask0a, s0a);
+ const __m128i mul0ah = _mm_unpackhi_epi16(mul0a_lows, mul0a_highs);
+ const __m128i mul0al = _mm_unpacklo_epi16(mul0a_lows, mul0a_highs);
+ const __m128i mul1a_highs = _mm_mulhi_epu16(mask1a, s1a);
+ const __m128i mul1a_lows = _mm_mullo_epi16(mask1a, s1a);
+ const __m128i mul1ah = _mm_unpackhi_epi16(mul1a_lows, mul1a_highs);
+ const __m128i mul1al = _mm_unpacklo_epi16(mul1a_lows, mul1a_highs);
+
+ const __m128i mul0b_highs = _mm_mulhi_epu16(*mask0b, s0b);
+ const __m128i mul0b_lows = _mm_mullo_epi16(*mask0b, s0b);
+ const __m128i mul0bh = _mm_unpackhi_epi16(mul0b_lows, mul0b_highs);
+ const __m128i mul0bl = _mm_unpacklo_epi16(mul0b_lows, mul0b_highs);
+ const __m128i mul1b_highs = _mm_mulhi_epu16(mask1b, s1b);
+ const __m128i mul1b_lows = _mm_mullo_epi16(mask1b, s1b);
+ const __m128i mul1bh = _mm_unpackhi_epi16(mul1b_lows, mul1b_highs);
+ const __m128i mul1bl = _mm_unpacklo_epi16(mul1b_lows, mul1b_highs);
+
+ const __m128i sumah = _mm_add_epi32(mul0ah, mul1ah);
+ const __m128i sumal = _mm_add_epi32(mul0al, mul1al);
+ const __m128i sumbh = _mm_add_epi32(mul0bh, mul1bh);
+ const __m128i sumbl = _mm_add_epi32(mul0bl, mul1bl);
+
+ const __m128i roundah =
+ _mm_srai_epi32(_mm_sub_epi32(sumah, *round_offset), shift);
+ const __m128i roundbh =
+ _mm_srai_epi32(_mm_sub_epi32(sumbh, *round_offset), shift);
+ const __m128i roundal =
+ _mm_srai_epi32(_mm_sub_epi32(sumal, *round_offset), shift);
+ const __m128i roundbl =
+ _mm_srai_epi32(_mm_sub_epi32(sumbl, *round_offset), shift);
+
+ const __m128i packa = _mm_packs_epi32(roundal, roundah);
+ const __m128i packb = _mm_packs_epi32(roundbl, roundbh);
+
+ const __m128i clipa =
+ _mm_min_epi16(_mm_max_epi16(packa, *clip_low), *clip_high);
+ const __m128i clipb =
+ _mm_min_epi16(_mm_max_epi16(packb, *clip_low), *clip_high);
+
+ xx_storel_64(dst, _mm_srli_si128(clipa, 8));
+ xx_storel_64(dst + dst_stride, clipa);
+ xx_storel_64(dst + 2 * dst_stride, _mm_srli_si128(clipb, 8));
+ xx_storel_64(dst + 3 * dst_stride, clipb);
+}
+
+static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift, const __m128i *clip_low,
+ const __m128i *clip_high, const __m128i *mask_max) {
+ do {
+ const __m128i mask0a8 =
+ _mm_set_epi32(0, 0, *(int32_t *)mask, *(int32_t *)(mask + mask_stride));
+ const __m128i mask0b8 =
+ _mm_set_epi32(0, 0, *(int32_t *)(mask + 2 * mask_stride),
+ *(int32_t *)(mask + 3 * mask_stride));
+ const __m128i mask0a = _mm_cvtepu8_epi16(mask0a8);
+ const __m128i mask0b = _mm_cvtepu8_epi16(mask0b8);
+
+ highbd_blend_a64_d16_mask_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b,
+ round_offset, shift, clip_low, clip_high, mask_max);
+
+ dst += dst_stride * 4;
+ src0 += src0_stride * 4;
+ src1 += src1_stride * 4;
+ mask += mask_stride * 4;
+ } while (h -= 4);
+}
+
+static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift, const __m128i *clip_low,
+ const __m128i *clip_high, const __m128i *mask_max) {
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i two_w = _mm_set1_epi16(2);
+ do {
+    // Load 8 mask pixels from each of 8 rows, saturating-add vertically
+    // adjacent rows, then use madd to add horizontally adjacent pixels.
+    // Finally, divide each value by 4 (with rounding).
+ const __m128i m02 = _mm_set_epi64x(*(int64_t *)(mask),
+ *(int64_t *)(mask + 2 * mask_stride));
+ const __m128i m13 = _mm_set_epi64x(*(int64_t *)(mask + mask_stride),
+ *(int64_t *)(mask + 3 * mask_stride));
+ const __m128i m0123 = _mm_maddubs_epi16(_mm_adds_epu8(m02, m13), one_b);
+ const __m128i mask_0a = _mm_srli_epi16(_mm_add_epi16(m0123, two_w), 2);
+ const __m128i m46 = _mm_set_epi64x(*(int64_t *)(mask + 4 * mask_stride),
+ *(int64_t *)(mask + 6 * mask_stride));
+ const __m128i m57 = _mm_set_epi64x(*(int64_t *)(mask + 5 * mask_stride),
+ *(int64_t *)(mask + 7 * mask_stride));
+ const __m128i m4567 = _mm_maddubs_epi16(_mm_adds_epu8(m46, m57), one_b);
+ const __m128i mask_0b = _mm_srli_epi16(_mm_add_epi16(m4567, two_w), 2);
+
+ highbd_blend_a64_d16_mask_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_0a,
+ &mask_0b, round_offset, shift, clip_low, clip_high, mask_max);
+
+ dst += dst_stride * 4;
+ src0 += src0_stride * 4;
+ src1 += src1_stride * 4;
+ mask += mask_stride * 8;
+ } while (h -= 4);
+}
+
+static INLINE void highbd_blend_a64_d16_mask_w8_sse4_1(
+ uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride,
+ const CONV_BUF_TYPE *src1, int src1_stride, const __m128i *mask0a,
+ const __m128i *mask0b, const __m128i *round_offset, int shift,
+ const __m128i *clip_low, const __m128i *clip_high,
+ const __m128i *max_mask) {
+  // Load 8 pixels from each of 2 rows of each source
+ const __m128i s0a = xx_loadu_128(src0);
+ const __m128i s0b = xx_loadu_128(src0 + src0_stride);
+ const __m128i s1a = xx_loadu_128(src1);
+ const __m128i s1b = xx_loadu_128(src1 + src1_stride);
+
+ // Generate inverse masks
+ const __m128i mask1a = _mm_sub_epi16(*max_mask, *mask0a);
+ const __m128i mask1b = _mm_sub_epi16(*max_mask, *mask0b);
+
+ // Multiply sources by respective masks
+ const __m128i mul0a_highs = _mm_mulhi_epu16(*mask0a, s0a);
+ const __m128i mul0a_lows = _mm_mullo_epi16(*mask0a, s0a);
+ const __m128i mul0ah = _mm_unpackhi_epi16(mul0a_lows, mul0a_highs);
+ const __m128i mul0al = _mm_unpacklo_epi16(mul0a_lows, mul0a_highs);
+
+ const __m128i mul1a_highs = _mm_mulhi_epu16(mask1a, s1a);
+ const __m128i mul1a_lows = _mm_mullo_epi16(mask1a, s1a);
+ const __m128i mul1ah = _mm_unpackhi_epi16(mul1a_lows, mul1a_highs);
+ const __m128i mul1al = _mm_unpacklo_epi16(mul1a_lows, mul1a_highs);
+
+ const __m128i sumah = _mm_add_epi32(mul0ah, mul1ah);
+ const __m128i sumal = _mm_add_epi32(mul0al, mul1al);
+
+ const __m128i mul0b_highs = _mm_mulhi_epu16(*mask0b, s0b);
+ const __m128i mul0b_lows = _mm_mullo_epi16(*mask0b, s0b);
+ const __m128i mul0bh = _mm_unpackhi_epi16(mul0b_lows, mul0b_highs);
+ const __m128i mul0bl = _mm_unpacklo_epi16(mul0b_lows, mul0b_highs);
+
+ const __m128i mul1b_highs = _mm_mulhi_epu16(mask1b, s1b);
+ const __m128i mul1b_lows = _mm_mullo_epi16(mask1b, s1b);
+ const __m128i mul1bh = _mm_unpackhi_epi16(mul1b_lows, mul1b_highs);
+ const __m128i mul1bl = _mm_unpacklo_epi16(mul1b_lows, mul1b_highs);
+
+ const __m128i sumbh = _mm_add_epi32(mul0bh, mul1bh);
+ const __m128i sumbl = _mm_add_epi32(mul0bl, mul1bl);
+
+ const __m128i roundah =
+ _mm_srai_epi32(_mm_sub_epi32(sumah, *round_offset), shift);
+ const __m128i roundal =
+ _mm_srai_epi32(_mm_sub_epi32(sumal, *round_offset), shift);
+ const __m128i roundbh =
+ _mm_srai_epi32(_mm_sub_epi32(sumbh, *round_offset), shift);
+ const __m128i roundbl =
+ _mm_srai_epi32(_mm_sub_epi32(sumbl, *round_offset), shift);
+
+ const __m128i packa = _mm_packs_epi32(roundal, roundah);
+ const __m128i clipa =
+ _mm_min_epi16(_mm_max_epi16(packa, *clip_low), *clip_high);
+ const __m128i packb = _mm_packs_epi32(roundbl, roundbh);
+ const __m128i clipb =
+ _mm_min_epi16(_mm_max_epi16(packb, *clip_low), *clip_high);
+
+ xx_storeu_128(dst, clipa);
+ xx_storeu_128(dst + dst_stride, clipb);
+}
+
+static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift, const __m128i *clip_low,
+ const __m128i *clip_high, const __m128i *max_mask) {
+ do {
+ const __m128i mask0a = _mm_cvtepu8_epi16(xx_loadl_64(mask));
+ const __m128i mask0b = _mm_cvtepu8_epi16(xx_loadl_64(mask + mask_stride));
+ highbd_blend_a64_d16_mask_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b,
+ round_offset, shift, clip_low, clip_high, max_mask);
+
+ dst += dst_stride * 2;
+ src0 += src0_stride * 2;
+ src1 += src1_stride * 2;
+ mask += mask_stride * 2;
+ } while (h -= 2);
+}
+
+static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift, const __m128i *clip_low,
+ const __m128i *clip_high, const __m128i *max_mask) {
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i two_w = _mm_set1_epi16(2);
+ do {
+ const __m128i mask_thisrowa = xx_loadu_128(mask);
+ const __m128i mask_nextrowa = xx_loadu_128(mask + mask_stride);
+ const __m128i mask_thisrowb = xx_loadu_128(mask + 2 * mask_stride);
+ const __m128i mask_nextrowb = xx_loadu_128(mask + 3 * mask_stride);
+ const __m128i mask_bothrowsa = _mm_adds_epu8(mask_thisrowa, mask_nextrowa);
+ const __m128i mask_bothrowsb = _mm_adds_epu8(mask_thisrowb, mask_nextrowb);
+ const __m128i mask_16a = _mm_maddubs_epi16(mask_bothrowsa, one_b);
+ const __m128i mask_16b = _mm_maddubs_epi16(mask_bothrowsb, one_b);
+ const __m128i mask_sa = _mm_srli_epi16(_mm_add_epi16(mask_16a, two_w), 2);
+ const __m128i mask_sb = _mm_srli_epi16(_mm_add_epi16(mask_16b, two_w), 2);
+
+ highbd_blend_a64_d16_mask_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_sa,
+ &mask_sb, round_offset, shift, clip_low, clip_high, max_mask);
+
+ dst += dst_stride * 2;
+ src0 += src0_stride * 2;
+ src1 += src1_stride * 2;
+ mask += mask_stride * 4;
+ } while (h -= 2);
+}
+
+static INLINE void highbd_blend_a64_d16_mask_w16_sse4_1(
+ uint16_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+ const __m128i *round_offset, int shift, const __m128i *mask0l,
+ const __m128i *mask0h, const __m128i *clip_low, const __m128i *clip_high,
+ const __m128i *mask_max) {
+ // Load 16x u16 pixels for this row from each src
+ const __m128i s0l = xx_loadu_128(src0);
+ const __m128i s0h = xx_loadu_128(src0 + 8);
+ const __m128i s1l = xx_loadu_128(src1);
+ const __m128i s1h = xx_loadu_128(src1 + 8);
+
+ // Calculate inverse masks
+ const __m128i mask1h = _mm_sub_epi16(*mask_max, *mask0h);
+ const __m128i mask1l = _mm_sub_epi16(*mask_max, *mask0l);
+
+ const __m128i mul0_highs = _mm_mulhi_epu16(*mask0h, s0h);
+ const __m128i mul0_lows = _mm_mullo_epi16(*mask0h, s0h);
+ const __m128i mul0h = _mm_unpackhi_epi16(mul0_lows, mul0_highs);
+ const __m128i mul0l = _mm_unpacklo_epi16(mul0_lows, mul0_highs);
+
+ const __m128i mul1_highs = _mm_mulhi_epu16(mask1h, s1h);
+ const __m128i mul1_lows = _mm_mullo_epi16(mask1h, s1h);
+ const __m128i mul1h = _mm_unpackhi_epi16(mul1_lows, mul1_highs);
+ const __m128i mul1l = _mm_unpacklo_epi16(mul1_lows, mul1_highs);
+
+ const __m128i mulhh = _mm_add_epi32(mul0h, mul1h);
+ const __m128i mulhl = _mm_add_epi32(mul0l, mul1l);
+
+ const __m128i mul2_highs = _mm_mulhi_epu16(*mask0l, s0l);
+ const __m128i mul2_lows = _mm_mullo_epi16(*mask0l, s0l);
+ const __m128i mul2h = _mm_unpackhi_epi16(mul2_lows, mul2_highs);
+ const __m128i mul2l = _mm_unpacklo_epi16(mul2_lows, mul2_highs);
+
+ const __m128i mul3_highs = _mm_mulhi_epu16(mask1l, s1l);
+ const __m128i mul3_lows = _mm_mullo_epi16(mask1l, s1l);
+ const __m128i mul3h = _mm_unpackhi_epi16(mul3_lows, mul3_highs);
+ const __m128i mul3l = _mm_unpacklo_epi16(mul3_lows, mul3_highs);
+
+ const __m128i mullh = _mm_add_epi32(mul2h, mul3h);
+ const __m128i mulll = _mm_add_epi32(mul2l, mul3l);
+
+ const __m128i reshh =
+ _mm_srai_epi32(_mm_sub_epi32(mulhh, *round_offset), shift);
+ const __m128i reshl =
+ _mm_srai_epi32(_mm_sub_epi32(mulhl, *round_offset), shift);
+ const __m128i reslh =
+ _mm_srai_epi32(_mm_sub_epi32(mullh, *round_offset), shift);
+ const __m128i resll =
+ _mm_srai_epi32(_mm_sub_epi32(mulll, *round_offset), shift);
+
+ // Signed saturating pack from i32 to i16:
+ const __m128i packh = _mm_packs_epi32(reshl, reshh);
+ const __m128i packl = _mm_packs_epi32(resll, reslh);
+
+ // Clip the values to the valid range
+ const __m128i cliph =
+ _mm_min_epi16(_mm_max_epi16(packh, *clip_low), *clip_high);
+ const __m128i clipl =
+ _mm_min_epi16(_mm_max_epi16(packl, *clip_low), *clip_high);
+
+ // Store 16 pixels
+ xx_storeu_128(dst, clipl);
+ xx_storeu_128(dst + 8, cliph);
+}
+
+static INLINE void highbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m128i *round_offset, int shift, const __m128i *clip_low,
+ const __m128i *clip_high, const __m128i *mask_max) {
+ for (int i = 0; i < h; i++) {
+ for (int j = 0; j < w; j += 16) {
+ // Load 16x u8 alpha-mask values and pad to u16
+ const __m128i masks_u8 = xx_loadu_128(mask + j);
+ const __m128i mask0l = _mm_cvtepu8_epi16(masks_u8);
+ const __m128i mask0h = _mm_cvtepu8_epi16(_mm_srli_si128(masks_u8, 8));
+
+ highbd_blend_a64_d16_mask_w16_sse4_1(
+ dst + j, src0 + j, src1 + j, round_offset, shift, &mask0l, &mask0h,
+ clip_low, clip_high, mask_max);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ }
+}
+
+static INLINE void highbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h, int w,
+ const __m128i *round_offset, int shift, const __m128i *clip_low,
+ const __m128i *clip_high, const __m128i *mask_max) {
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i two_w = _mm_set1_epi16(2);
+ for (int i = 0; i < h; i++) {
+ for (int j = 0; j < w; j += 16) {
+ const __m128i m_i00 = xx_loadu_128(mask + 2 * j);
+ const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16);
+ const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j);
+ const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16);
+
+ const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10);
+ const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11);
+ const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b);
+ const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b);
+ const __m128i mask_l = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2);
+ const __m128i mask_h = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2);
+
+ highbd_blend_a64_d16_mask_w16_sse4_1(
+ dst + j, src0 + j, src1 + j, round_offset, shift, &mask_l, &mask_h,
+ clip_low, clip_high, mask_max);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride * 2;
+ }
+}
+
+void aom_highbd_blend_a64_d16_mask_sse4_1(
+ uint8_t *dst8, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh,
+ ConvolveParams *conv_params, const int bd) {
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+ const int round_bits =
+ 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
+ const int32_t round_offset =
+ ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) -
+ (1 << (round_bits - 1)))
+ << AOM_BLEND_A64_ROUND_BITS;
+ const __m128i v_round_offset = _mm_set1_epi32(round_offset);
+ const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS;
+
+ const __m128i clip_low = _mm_setzero_si128();
+ const __m128i clip_high = _mm_set1_epi16((1 << bd) - 1);
+ const __m128i mask_max = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 4);
+ assert(w >= 4);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ if (subw == 0 && subh == 0) {
+ switch (w) {
+ case 4:
+ highbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
+ &mask_max);
+ break;
+ case 8:
+ highbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
+ &mask_max);
+ break;
+ default: // >=16
+ highbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high,
+ &mask_max);
+ break;
+ }
+
+ } else if (subw == 1 && subh == 1) {
+ switch (w) {
+ case 4:
+ highbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
+ &mask_max);
+ break;
+ case 8:
+ highbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high,
+ &mask_max);
+ break;
+ default: // >=16
+ highbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1(
+ dst, dst_stride, src0, src0_stride, src1, src1_stride, mask,
+ mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high,
+ &mask_max);
+ break;
+ }
+ } else {
+    // Sub-sampling in only one axis doesn't seem to happen very often, so
+    // fall back to the vanilla C implementation instead of providing
+    // optimised kernels for these cases.
+ aom_highbd_blend_a64_d16_mask_c(dst8, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, w, h, subw,
+ subh, conv_params, bd);
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
new file mode 100644
index 0000000000..75fb1c5a94
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_a64_vmask_sse4.c
@@ -0,0 +1,285 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h> // SSE4.1
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/blend_sse4.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+//////////////////////////////////////////////////////////////////////////////
+// Implementation - No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ (void)w;
+
+ do {
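+    // A vertical mask has one alpha value per row; broadcast it to all lanes.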
+ const __m128i v_m0_w = _mm_set1_epi16(*mask);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_4(src0, src1, &v_m0_w, &v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ xx_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 1;
+ } while (--h);
+}
+
+static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ (void)w;
+
+ do {
+ const __m128i v_m0_w = _mm_set1_epi16(*mask);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_8(src0, src1, &v_m0_w, &v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ xx_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 1;
+ } while (--h);
+}
+
+static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0,
+ uint32_t src0_stride,
+ const uint8_t *src1,
+ uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ const __m128i v_m0_w = _mm_set1_epi16(*mask);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_resl_w = blend_8(src0 + c, src1 + c, &v_m0_w, &v_m1_w);
+ const __m128i v_resh_w =
+ blend_8(src0 + c + 8, src1 + c + 8, &v_m0_w, &v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+
+ xx_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 1;
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ typedef void (*blend_fn)(uint8_t * dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h);
+
+ // Dimension: width_index
+ static const blend_fn blend[9] = {
+ blend_a64_vmask_w16n_sse4_1, // w % 16 == 0
+ aom_blend_a64_vmask_c, // w == 1
+ aom_blend_a64_vmask_c, // w == 2
+ NULL, // INVALID
+ blend_a64_vmask_w4_sse4_1, // w == 4
+ NULL, // INVALID
+ NULL, // INVALID
+ NULL, // INVALID
+ blend_a64_vmask_w8_sse4_1, // w == 8
+ };
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
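+  // w & 0xf selects the kernel: w == 1 and 2 fall back to C, w == 4 and 8 use
+  // the fixed-width kernels, and any multiple of 16 maps to index 0 (w16n).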
+ blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, w,
+ h);
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// Implementation - No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_a64_vmask_bn_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ const __m128i v_m0_w = _mm_set1_epi16(*mask);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ xx_storel_64(dst, v_res_w);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 1;
+ } while (--h);
+}
+
+static void blend_a64_vmask_b10_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
+ const uint16_t *src0,
+ uint32_t src0_stride,
+ const uint16_t *src1,
+ uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ (void)w;
+ blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, h, blend_4_b10);
+}
+
+static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride,
+ const uint16_t *src0,
+ uint32_t src0_stride,
+ const uint16_t *src1,
+ uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ (void)w;
+ blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, h, blend_4_b12);
+}
+
+static INLINE void blend_a64_vmask_bn_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride, const uint16_t *src0,
+ uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h, blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+
+ do {
+ int c;
+ const __m128i v_m0_w = _mm_set1_epi16(*mask);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ xx_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 1;
+ } while (--h);
+}
+
+static void blend_a64_vmask_b10_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
+ const uint16_t *src0,
+ uint32_t src0_stride,
+ const uint16_t *src1,
+ uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, w, h, blend_8_b10);
+}
+
+static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride,
+ const uint16_t *src0,
+ uint32_t src0_stride,
+ const uint16_t *src1,
+ uint32_t src1_stride,
+ const uint8_t *mask, int w, int h) {
+ blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, w, h, blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+void aom_highbd_blend_a64_vmask_sse4_1(
+ uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8,
+ uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h, int bd) {
+ typedef void (*blend_fn)(uint16_t * dst, uint32_t dst_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int w, int h);
+
+ // Dimensions are: bd_index X width_index
+ static const blend_fn blend[2][2] = {
+ {
+ // bd == 8 or 10
+ blend_a64_vmask_b10_w8n_sse4_1, // w % 8 == 0
+ blend_a64_vmask_b10_w4_sse4_1, // w == 4
+ },
+ {
+ // bd == 12
+ blend_a64_vmask_b12_w8n_sse4_1, // w % 8 == 0
+ blend_a64_vmask_b12_w4_sse4_1, // w == 4
+ }
+ };
+
+ assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
+ assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
+ aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8,
+ src1_stride, mask, w, h, bd);
+ } else {
+ uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
+ const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
+
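+    // (w >> 2) & 1 is 1 only when w == 4; multiples of 8 select the w8n
+    // kernels.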
+ blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, w, h);
+ }
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/blend_mask_sse4.h b/third_party/aom/aom_dsp/x86/blend_mask_sse4.h
new file mode 100644
index 0000000000..c071fdcfc4
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_mask_sse4.h
@@ -0,0 +1,237 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_
+#define AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_
+#include <smmintrin.h> // SSE4.1
+
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/blend.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void blend_a64_d16_mask_w4_sse41(
+ uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+ const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval,
+ int shift) {
+ const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);
+ const __m128i s0 = xx_loadl_64(src0);
+ const __m128i s1 = xx_loadl_64(src1);
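+  // Interleave src0/src1 and m/(max - m) so that madd yields
+  // src0 * m + src1 * (max - m) per pixel as a 32-bit value.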
+ const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1);
+ const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m);
+ const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m);
+ const __m128i res_c = _mm_sub_epi32(res_a, *v_round_offset);
+ const __m128i res_d = _mm_srai_epi32(res_c, shift);
+ const __m128i res_e = _mm_packs_epi32(res_d, res_d);
+ const __m128i res = _mm_packus_epi16(res_e, res_e);
+
+ xx_storel_32(dst, res);
+}
+
+static INLINE void blend_a64_d16_mask_w8_sse41(
+ uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1,
+ const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval,
+ int shift) {
+ const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m);
+ const __m128i s0 = xx_loadu_128(src0);
+ const __m128i s1 = xx_loadu_128(src1);
+ __m128i res_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0, s1),
+ _mm_unpacklo_epi16(*m, max_minus_m));
+ __m128i res_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0, s1),
+ _mm_unpackhi_epi16(*m, max_minus_m));
+ res_lo = _mm_srai_epi32(_mm_sub_epi32(res_lo, *v_round_offset), shift);
+ res_hi = _mm_srai_epi32(_mm_sub_epi32(res_hi, *v_round_offset), shift);
+ const __m128i res_e = _mm_packs_epi32(res_lo, res_hi);
+ const __m128i res = _mm_packus_epi16(res_e, res_e);
+
+ _mm_storel_epi64((__m128i *)(dst), res);
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ for (int i = 0; i < h; ++i) {
+ const __m128i m0 = xx_loadl_32(mask);
+ const __m128i m = _mm_cvtepu8_epi16(m0);
+
+ blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ for (int i = 0; i < h; ++i) {
+ const __m128i m0 = xx_loadl_64(mask);
+ const __m128i m = _mm_cvtepu8_epi16(m0);
+ blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i two_w = _mm_set1_epi16(2);
+ for (int i = 0; i < h; ++i) {
+ const __m128i m_i0 = xx_loadl_64(mask);
+ const __m128i m_i1 = xx_loadl_64(mask + mask_stride);
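+    // 2x2 mask average: saturating row add, maddubs pair sum, then
+    // (x + 2) >> 2.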
+ const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
+ const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b);
+ const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);
+ const __m128i m = _mm_srli_epi16(m_acbd_2, 2);
+
+ blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i two_w = _mm_set1_epi16(2);
+ for (int i = 0; i < h; ++i) {
+ const __m128i m_i0 = xx_loadu_128(mask);
+ const __m128i m_i1 = xx_loadu_128(mask + mask_stride);
+ const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
+ const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b);
+ const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w);
+ const __m128i m = _mm_srli_epi16(m_acbd_2, 2);
+
+ blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i zeros = _mm_setzero_si128();
+ for (int i = 0; i < h; ++i) {
+ const __m128i m_i0 = xx_loadl_64(mask);
+ const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
+ const __m128i m = _mm_avg_epu16(m_ac, zeros);
+
+ blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i one_b = _mm_set1_epi8(1);
+ const __m128i zeros = _mm_setzero_si128();
+ for (int i = 0; i < h; ++i) {
+ const __m128i m_i0 = xx_loadu_128(mask);
+ const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b);
+ const __m128i m = _mm_avg_epu16(m_ac, zeros);
+
+ blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i zeros = _mm_setzero_si128();
+ for (int i = 0; i < h; ++i) {
+ const __m128i m_i0 = xx_loadl_64(mask);
+ const __m128i m_i1 = xx_loadl_64(mask + mask_stride);
+ const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
+ const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros));
+
+ blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+
+static INLINE void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0,
+ uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride, int h,
+ const __m128i *round_offset, int shift) {
+ const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i zeros = _mm_setzero_si128();
+ for (int i = 0; i < h; ++i) {
+ const __m128i m_i0 = xx_loadl_64(mask);
+ const __m128i m_i1 = xx_loadl_64(mask + mask_stride);
+ const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1);
+ const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros));
+
+ blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval,
+ shift);
+ mask += mask_stride << 1;
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ }
+}
+#endif // AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_
diff --git a/third_party/aom/aom_dsp/x86/blend_sse4.h b/third_party/aom/aom_dsp/x86/blend_sse4.h
new file mode 100644
index 0000000000..8d9b325101
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blend_sse4.h
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_BLEND_SSE4_H_
+#define AOM_AOM_DSP_X86_BLEND_SSE4_H_
+
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/synonyms.h"
+static const uint8_t g_blend_a64_mask_shuffle[32] = {
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+ 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15,
+};
+
+//////////////////////////////////////////////////////////////////////////////
+// Common kernels
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1,
+ const __m128i *v_m0_w, const __m128i *v_m1_w) {
+ const __m128i v_s0_b = xx_loadl_32(src0);
+ const __m128i v_s1_b = xx_loadl_32(src1);
+ const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
+ const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
+
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
+ const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+ const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
+
+ return v_res_w;
+}
+
+static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
+ const __m128i *v_m0_w, const __m128i *v_m1_w) {
+ const __m128i v_s0_b = xx_loadl_64(src0);
+ const __m128i v_s1_b = xx_loadl_64(src1);
+ const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
+ const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
+
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, *v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, *v_m1_w);
+
+ const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+ const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
+
+ return v_res_w;
+}
+
+static INLINE __m128i blend_4_u8(const uint8_t *src0, const uint8_t *src1,
+ const __m128i *v_m0_b, const __m128i *v_m1_b,
+ const __m128i *rounding) {
+ const __m128i v_s0_b = xx_loadl_32(src0);
+ const __m128i v_s1_b = xx_loadl_32(src1);
+
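+  // maddubs forms src0 * m0 + src1 * m1 per pixel; mulhrs with the
+  // caller-supplied rounding constant applies the scaled rounding shift.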
+ const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
+ _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
+
+ const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
+ const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
+ return v_res;
+}
+
+static INLINE __m128i blend_8_u8(const uint8_t *src0, const uint8_t *src1,
+ const __m128i *v_m0_b, const __m128i *v_m1_b,
+ const __m128i *rounding) {
+ const __m128i v_s0_b = xx_loadl_64(src0);
+ const __m128i v_s1_b = xx_loadl_64(src1);
+
+ const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
+ _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
+
+ const __m128i v_res_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
+ const __m128i v_res = _mm_packus_epi16(v_res_w, v_res_w);
+ return v_res;
+}
+
+static INLINE __m128i blend_16_u8(const uint8_t *src0, const uint8_t *src1,
+ const __m128i *v_m0_b, const __m128i *v_m1_b,
+ const __m128i *rounding) {
+ const __m128i v_s0_b = xx_loadu_128(src0);
+ const __m128i v_s1_b = xx_loadu_128(src1);
+
+ const __m128i v_p0_w = _mm_maddubs_epi16(_mm_unpacklo_epi8(v_s0_b, v_s1_b),
+ _mm_unpacklo_epi8(*v_m0_b, *v_m1_b));
+ const __m128i v_p1_w = _mm_maddubs_epi16(_mm_unpackhi_epi8(v_s0_b, v_s1_b),
+ _mm_unpackhi_epi8(*v_m0_b, *v_m1_b));
+
+ const __m128i v_res0_w = _mm_mulhrs_epi16(v_p0_w, *rounding);
+ const __m128i v_res1_w = _mm_mulhrs_epi16(v_p1_w, *rounding);
+ const __m128i v_res = _mm_packus_epi16(v_res0_w, v_res1_w);
+ return v_res;
+}
+
+typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w);
+
+static INLINE __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_w = xx_loadl_64(src0);
+ const __m128i v_s1_w = xx_loadl_64(src1);
+
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+ const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+ const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
+
+ return v_res_w;
+}
+
+static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_w = xx_loadu_128(src0);
+ const __m128i v_s1_w = xx_loadu_128(src1);
+
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+ const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+ const __m128i v_res_w = xx_roundn_epu16(v_sum_w, AOM_BLEND_A64_ROUND_BITS);
+
+ return v_res_w;
+}
+
+static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_w = xx_loadl_64(src0);
+ const __m128i v_s1_w = xx_loadl_64(src1);
+
+ // Interleave
+ const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
+ const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
+
+ // Multiply-Add
+ const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
+
+ // Scale
+ const __m128i v_ssum_d =
+ _mm_srli_epi32(v_sum_d, AOM_BLEND_A64_ROUND_BITS - 1);
+
+ // Pack
+ const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
+
+ // Round
+ const __m128i v_res_w = xx_round_epu16(v_pssum_d);
+
+ return v_res_w;
+}
+
+static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_w = xx_loadu_128(src0);
+ const __m128i v_s1_w = xx_loadu_128(src1);
+
+ // Interleave
+ const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
+ const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
+ const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
+ const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
+
+ // Multiply-Add
+ const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
+ const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
+
+ // Scale
+ const __m128i v_ssuml_d =
+ _mm_srli_epi32(v_suml_d, AOM_BLEND_A64_ROUND_BITS - 1);
+ const __m128i v_ssumh_d =
+ _mm_srli_epi32(v_sumh_d, AOM_BLEND_A64_ROUND_BITS - 1);
+
+ // Pack
+ const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
+
+ // Round
+ const __m128i v_res_w = xx_round_epu16(v_pssum_d);
+
+ return v_res_w;
+}
+
+#endif // AOM_AOM_DSP_X86_BLEND_SSE4_H_
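Every helper in this header evaluates the same alpha-64 blend, (s0 * m0 + s1 * m1 + rounding) >> AOM_BLEND_A64_ROUND_BITS, with the two masks summing to the maximum alpha. The blend_*_u8 variants fold the multiply/add into _mm_maddubs_epi16 on interleaved pixel/mask bytes and round with _mm_mulhrs_epi16; the b10/b12 variants multiply with _mm_mullo_epi16 or _mm_madd_epi16 and round explicitly. A minimal scalar sketch of the formula (illustrative only, assuming AOM_BLEND_A64_ROUND_BITS == 6, i.e. masks summing to 64, as defined in aom_dsp/blend.h):

#include <stdint.h>

static uint8_t blend_a64_scalar(uint8_t s0, uint8_t s1, int m0) {
  const int round_bits = 6;              /* AOM_BLEND_A64_ROUND_BITS */
  const int m1 = (1 << round_bits) - m0; /* masks sum to 64 */
  /* blend_*_u8 obtain the same result via _mm_maddubs_epi16 followed by
   * _mm_mulhrs_epi16 with a rounding multiplier of 1 << (15 - round_bits). */
  return (uint8_t)((s0 * m0 + s1 * m1 + (1 << (round_bits - 1))) >> round_bits);
}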
diff --git a/third_party/aom/aom_dsp/x86/blk_sse_sum_avx2.c b/third_party/aom/aom_dsp/x86/blk_sse_sum_avx2.c
new file mode 100644
index 0000000000..fdf7de3f4c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blk_sse_sum_avx2.c
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void accumulate_sse_sum(__m256i regx_sum, __m256i regx2_sum,
+ int *x_sum, int64_t *x2_sum) {
+ __m256i sum_buffer, sse_buffer;
+ __m128i out_buffer;
+
+  // Accumulate the various elements of each register into its first element.
+ sum_buffer = _mm256_permute2f128_si256(regx_sum, regx_sum, 1);
+ regx_sum = _mm256_add_epi32(sum_buffer, regx_sum);
+ regx_sum = _mm256_add_epi32(regx_sum, _mm256_srli_si256(regx_sum, 8));
+ regx_sum = _mm256_add_epi32(regx_sum, _mm256_srli_si256(regx_sum, 4));
+
+ sse_buffer = _mm256_permute2f128_si256(regx2_sum, regx2_sum, 1);
+ regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum);
+ regx2_sum = _mm256_add_epi64(regx2_sum, _mm256_srli_si256(regx2_sum, 8));
+
+ out_buffer = _mm256_castsi256_si128(regx_sum);
+ *x_sum += _mm_cvtsi128_si32(out_buffer);
+ out_buffer = _mm256_castsi256_si128(regx2_sum);
+#if AOM_ARCH_X86_64
+ *x2_sum += _mm_cvtsi128_si64(out_buffer);
+#else
+ {
+ int64_t tmp;
+ _mm_storel_epi64((__m128i *)&tmp, out_buffer);
+ *x2_sum += tmp;
+ }
+#endif
+}
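accumulate_sse_sum() is a horizontal reduction: the eight 32-bit lanes of regx_sum and the four 64-bit lanes of regx2_sum are folded into the scalar outputs. A scalar equivalent, shown only to document the lane layout assumed here:

#include <stdint.h>

static void accumulate_sse_sum_scalar(const int32_t sum_lanes[8],
                                      const int64_t sse_lanes[4], int *x_sum,
                                      int64_t *x2_sum) {
  for (int i = 0; i < 8; ++i) *x_sum += sum_lanes[i];   // 32-bit sum lanes
  for (int i = 0; i < 4; ++i) *x2_sum += sse_lanes[i];  // 64-bit SSE lanes
}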
+
+static INLINE void sse_sum_wd4_avx2(const int16_t *data, int stride, int bh,
+ int *x_sum, int64_t *x2_sum) {
+ __m128i row1, row2, row3;
+ __m256i regx_sum, regx2_sum, load_pixels, sum_buffer, sse_buffer,
+ temp_buffer1, temp_buffer2, row_sum_buffer, row_sse_buffer;
+ const int16_t *data_tmp = data;
+ __m256i one = _mm256_set1_epi16(1);
+ regx_sum = _mm256_setzero_si256();
+ regx2_sum = regx_sum;
+ sum_buffer = _mm256_setzero_si256();
+ sse_buffer = sum_buffer;
+
+ for (int j = 0; j < (bh >> 2); ++j) {
+ // Load 4 rows at a time.
+ row1 = _mm_loadl_epi64((__m128i const *)(data_tmp));
+ row2 = _mm_loadl_epi64((__m128i const *)(data_tmp + stride));
+ row1 = _mm_unpacklo_epi64(row1, row2);
+ row2 = _mm_loadl_epi64((__m128i const *)(data_tmp + 2 * stride));
+ row3 = _mm_loadl_epi64((__m128i const *)(data_tmp + 3 * stride));
+ row2 = _mm_unpacklo_epi64(row2, row3);
+ load_pixels =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(row1), row2, 1);
+
+ row_sum_buffer = _mm256_madd_epi16(load_pixels, one);
+ row_sse_buffer = _mm256_madd_epi16(load_pixels, load_pixels);
+ sum_buffer = _mm256_add_epi32(row_sum_buffer, sum_buffer);
+ sse_buffer = _mm256_add_epi32(row_sse_buffer, sse_buffer);
+ data_tmp += 4 * stride;
+ }
+
+ // To prevent 32-bit variable overflow, unpack the elements to 64-bit.
+ temp_buffer1 = _mm256_unpacklo_epi32(sse_buffer, _mm256_setzero_si256());
+ temp_buffer2 = _mm256_unpackhi_epi32(sse_buffer, _mm256_setzero_si256());
+ sse_buffer = _mm256_add_epi64(temp_buffer1, temp_buffer2);
+ regx_sum = _mm256_add_epi32(sum_buffer, regx_sum);
+ regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum);
+
+ accumulate_sse_sum(regx_sum, regx2_sum, x_sum, x2_sum);
+}
+
+static INLINE void sse_sum_wd8_avx2(const int16_t *data, int stride, int bh,
+ int *x_sum, int64_t *x2_sum) {
+ __m128i load_128bit, load_next_128bit;
+ __m256i regx_sum, regx2_sum, load_pixels, sum_buffer, sse_buffer,
+ temp_buffer1, temp_buffer2, row_sum_buffer, row_sse_buffer;
+ const int16_t *data_tmp = data;
+ __m256i one = _mm256_set1_epi16(1);
+ regx_sum = _mm256_setzero_si256();
+ regx2_sum = regx_sum;
+ sum_buffer = _mm256_setzero_si256();
+ sse_buffer = sum_buffer;
+
+ for (int j = 0; j < (bh >> 1); ++j) {
+ // Load 2 rows at a time.
+ load_128bit = _mm_loadu_si128((__m128i const *)(data_tmp));
+ load_next_128bit = _mm_loadu_si128((__m128i const *)(data_tmp + stride));
+ load_pixels = _mm256_insertf128_si256(_mm256_castsi128_si256(load_128bit),
+ load_next_128bit, 1);
+
+ row_sum_buffer = _mm256_madd_epi16(load_pixels, one);
+ row_sse_buffer = _mm256_madd_epi16(load_pixels, load_pixels);
+ sum_buffer = _mm256_add_epi32(row_sum_buffer, sum_buffer);
+ sse_buffer = _mm256_add_epi32(row_sse_buffer, sse_buffer);
+ data_tmp += 2 * stride;
+ }
+
+ temp_buffer1 = _mm256_unpacklo_epi32(sse_buffer, _mm256_setzero_si256());
+ temp_buffer2 = _mm256_unpackhi_epi32(sse_buffer, _mm256_setzero_si256());
+ sse_buffer = _mm256_add_epi64(temp_buffer1, temp_buffer2);
+ regx_sum = _mm256_add_epi32(sum_buffer, regx_sum);
+ regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum);
+
+ accumulate_sse_sum(regx_sum, regx2_sum, x_sum, x2_sum);
+}
+
+static INLINE void sse_sum_wd16_avx2(const int16_t *data, int stride, int bh,
+ int *x_sum, int64_t *x2_sum,
+ int loop_count) {
+ __m256i regx_sum, regx2_sum, load_pixels, sum_buffer, sse_buffer,
+ temp_buffer1, temp_buffer2, row_sum_buffer, row_sse_buffer;
+ const int16_t *data_tmp = data;
+ __m256i one = _mm256_set1_epi16(1);
+ regx_sum = _mm256_setzero_si256();
+ regx2_sum = regx_sum;
+ sum_buffer = _mm256_setzero_si256();
+ sse_buffer = sum_buffer;
+
+ for (int i = 0; i < loop_count; ++i) {
+ data_tmp = data + 16 * i;
+ for (int j = 0; j < bh; ++j) {
+ load_pixels = _mm256_lddqu_si256((__m256i const *)(data_tmp));
+
+ row_sum_buffer = _mm256_madd_epi16(load_pixels, one);
+ row_sse_buffer = _mm256_madd_epi16(load_pixels, load_pixels);
+ sum_buffer = _mm256_add_epi32(row_sum_buffer, sum_buffer);
+ sse_buffer = _mm256_add_epi32(row_sse_buffer, sse_buffer);
+ data_tmp += stride;
+ }
+ }
+
+ temp_buffer1 = _mm256_unpacklo_epi32(sse_buffer, _mm256_setzero_si256());
+ temp_buffer2 = _mm256_unpackhi_epi32(sse_buffer, _mm256_setzero_si256());
+ sse_buffer = _mm256_add_epi64(temp_buffer1, temp_buffer2);
+ regx_sum = _mm256_add_epi32(sum_buffer, regx_sum);
+ regx2_sum = _mm256_add_epi64(sse_buffer, regx2_sum);
+
+ accumulate_sse_sum(regx_sum, regx2_sum, x_sum, x2_sum);
+}
+
+void aom_get_blk_sse_sum_avx2(const int16_t *data, int stride, int bw, int bh,
+ int *x_sum, int64_t *x2_sum) {
+ *x_sum = 0;
+ *x2_sum = 0;
+
+ if ((bh & 3) == 0) {
+ switch (bw) {
+ // For smaller block widths, compute multiple rows simultaneously.
+ case 4: sse_sum_wd4_avx2(data, stride, bh, x_sum, x2_sum); break;
+ case 8: sse_sum_wd8_avx2(data, stride, bh, x_sum, x2_sum); break;
+ case 16:
+ case 32:
+ sse_sum_wd16_avx2(data, stride, bh, x_sum, x2_sum, bw >> 4);
+ break;
+ case 64:
+        // 32-bit accumulators would overflow if all 64 rows were processed
+        // at once, so compute 32 rows at a time.
+ if (bh <= 32) {
+ sse_sum_wd16_avx2(data, stride, bh, x_sum, x2_sum, bw >> 4);
+ } else {
+ sse_sum_wd16_avx2(data, stride, 32, x_sum, x2_sum, bw >> 4);
+ sse_sum_wd16_avx2(data + 32 * stride, stride, 32, x_sum, x2_sum,
+ bw >> 4);
+ }
+ break;
+
+ default: aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum);
+ }
+ } else {
+ aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum);
+ }
+}
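The quantity vectorized above is simply the sum and the sum of squares of a bw x bh block of 16-bit values. A plain scalar sketch for reference (illustrative, not the actual aom_get_blk_sse_sum_c source):

#include <stdint.h>

static void blk_sse_sum_scalar(const int16_t *data, int stride, int bw, int bh,
                               int *x_sum, int64_t *x2_sum) {
  int sum = 0;
  int64_t sse = 0;
  for (int r = 0; r < bh; ++r) {
    for (int c = 0; c < bw; ++c) {
      const int v = data[r * stride + c];
      sum += v;               // vectorized above with _mm256_madd_epi16(v, 1)
      sse += (int64_t)v * v;  // vectorized with _mm256_madd_epi16(v, v), then
                              // widened to 64 bits before it can overflow
    }
  }
  *x_sum = sum;
  *x2_sum = sse;
}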
diff --git a/third_party/aom/aom_dsp/x86/blk_sse_sum_sse2.c b/third_party/aom/aom_dsp/x86/blk_sse_sum_sse2.c
new file mode 100644
index 0000000000..bf89427872
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/blk_sse_sum_sse2.c
@@ -0,0 +1,138 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void sse_sum_wd4_sse2(const int16_t *data, int stride, int bh,
+ int *x_sum, int64_t *x2_sum) {
+ const int16_t *data_tmp = data;
+ __m128i temp_buffer1, temp_buffer2;
+ __m128i load_pixels_low, load_pixels_hi, sum_buffer, sse_buffer;
+ __m128i one = _mm_set1_epi16(1);
+ __m128i regx_sum = _mm_setzero_si128();
+ __m128i regx2_sum = regx_sum;
+
+ for (int j = 0; j < (bh >> 1); ++j) {
+ // Load 2 rows (8 pixels) at a time.
+ load_pixels_low = _mm_loadl_epi64((__m128i const *)(data_tmp));
+ load_pixels_hi = _mm_loadl_epi64((__m128i const *)(data_tmp + stride));
+ load_pixels_low = _mm_unpacklo_epi64(load_pixels_low, load_pixels_hi);
+ sum_buffer = _mm_madd_epi16(load_pixels_low, one);
+ sse_buffer = _mm_madd_epi16(load_pixels_low, load_pixels_low);
+ regx_sum = _mm_add_epi32(sum_buffer, regx_sum);
+ regx2_sum = _mm_add_epi32(sse_buffer, regx2_sum);
+ data_tmp += 2 * stride;
+ }
+
+ regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 8));
+ regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 4));
+ *x_sum = _mm_cvtsi128_si32(regx_sum);
+ temp_buffer1 = _mm_unpacklo_epi32(regx2_sum, _mm_setzero_si128());
+ temp_buffer2 = _mm_unpackhi_epi32(regx2_sum, _mm_setzero_si128());
+ regx2_sum = _mm_add_epi64(temp_buffer1, temp_buffer2);
+ regx2_sum = _mm_add_epi64(regx2_sum, _mm_srli_si128(regx2_sum, 8));
+#if AOM_ARCH_X86_64
+ *x2_sum += _mm_cvtsi128_si64(regx2_sum);
+#else
+ {
+ int64_t tmp;
+ _mm_storel_epi64((__m128i *)&tmp, regx2_sum);
+ *x2_sum += tmp;
+ }
+#endif
+}
+
+static INLINE void sse_sum_wd8_sse2(const int16_t *data, int stride, int bh,
+ int *x_sum, int64_t *x2_sum,
+ int loop_cycles) {
+ const int16_t *data_tmp;
+ __m128i temp_buffer1, temp_buffer2;
+ __m128i one = _mm_set1_epi16(1);
+ __m128i regx_sum = _mm_setzero_si128();
+ __m128i regx2_sum = regx_sum;
+ __m128i load_pixels, sum_buffer, sse_buffer;
+
+ for (int i = 0; i < loop_cycles; ++i) {
+ data_tmp = data + (8 * i);
+ for (int j = 0; j < bh; ++j) {
+      // Load 1 row (8 pixels) at a time.
+ load_pixels = _mm_loadu_si128((__m128i const *)(data_tmp));
+ sum_buffer = _mm_madd_epi16(load_pixels, one);
+ sse_buffer = _mm_madd_epi16(load_pixels, load_pixels);
+ regx_sum = _mm_add_epi32(sum_buffer, regx_sum);
+ regx2_sum = _mm_add_epi32(sse_buffer, regx2_sum);
+ data_tmp += stride;
+ }
+ }
+
+ regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 8));
+ regx_sum = _mm_add_epi32(regx_sum, _mm_srli_si128(regx_sum, 4));
+ *x_sum += _mm_cvtsi128_si32(regx_sum);
+ temp_buffer1 = _mm_unpacklo_epi32(regx2_sum, _mm_setzero_si128());
+ temp_buffer2 = _mm_unpackhi_epi32(regx2_sum, _mm_setzero_si128());
+ regx2_sum = _mm_add_epi64(temp_buffer1, temp_buffer2);
+ regx2_sum = _mm_add_epi64(regx2_sum, _mm_srli_si128(regx2_sum, 8));
+#if AOM_ARCH_X86_64
+ *x2_sum += _mm_cvtsi128_si64(regx2_sum);
+#else
+ {
+ int64_t tmp;
+ _mm_storel_epi64((__m128i *)&tmp, regx2_sum);
+ *x2_sum += tmp;
+ }
+#endif
+}
+
+// This function adds SSE2 support for aom_get_blk_sse_sum_c().
+void aom_get_blk_sse_sum_sse2(const int16_t *data, int stride, int bw, int bh,
+ int *x_sum, int64_t *x2_sum) {
+ *x_sum = 0;
+ *x2_sum = 0;
+
+ if ((bh & 3) == 0) {
+ switch (bw) {
+ case 4: sse_sum_wd4_sse2(data, stride, bh, x_sum, x2_sum); break;
+ case 8:
+ case 16:
+ sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3);
+ break;
+      // For widths 32 and 64, the 32-bit accumulators may overflow, so
+      // process the rows in smaller chunks.
+ case 32:
+ if (bh <= 32) {
+ sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3);
+ break;
+ } else {
+ sse_sum_wd8_sse2(data, stride, 32, x_sum, x2_sum, bw >> 3);
+ sse_sum_wd8_sse2(data + 32 * stride, stride, 32, x_sum, x2_sum,
+ bw >> 3);
+ break;
+ }
+
+ case 64:
+ if (bh <= 16) {
+ sse_sum_wd8_sse2(data, stride, bh, x_sum, x2_sum, bw >> 3);
+ break;
+ } else {
+ for (int i = 0; i < bh; i += 16)
+ sse_sum_wd8_sse2(data + i * stride, stride, 16, x_sum, x2_sum,
+ bw >> 3);
+ break;
+ }
+
+ default: aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum);
+ }
+ } else {
+ aom_get_blk_sse_sum_c(data, stride, bw, bh, x_sum, x2_sum);
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/common_avx2.h b/third_party/aom/aom_dsp/x86/common_avx2.h
new file mode 100644
index 0000000000..96fe4ebb67
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/common_avx2.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_COMMON_AVX2_H_
+#define AOM_AOM_DSP_X86_COMMON_AVX2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+// Note: in and out could have the same value
+static INLINE void mm256_transpose_16x16(const __m256i *in, __m256i *out) {
+ __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
+ __m256i tr0_1 = _mm256_unpackhi_epi16(in[0], in[1]);
+ __m256i tr0_2 = _mm256_unpacklo_epi16(in[2], in[3]);
+ __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
+ __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
+ __m256i tr0_5 = _mm256_unpackhi_epi16(in[4], in[5]);
+ __m256i tr0_6 = _mm256_unpacklo_epi16(in[6], in[7]);
+ __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);
+
+ __m256i tr0_8 = _mm256_unpacklo_epi16(in[8], in[9]);
+ __m256i tr0_9 = _mm256_unpackhi_epi16(in[8], in[9]);
+ __m256i tr0_a = _mm256_unpacklo_epi16(in[10], in[11]);
+ __m256i tr0_b = _mm256_unpackhi_epi16(in[10], in[11]);
+ __m256i tr0_c = _mm256_unpacklo_epi16(in[12], in[13]);
+ __m256i tr0_d = _mm256_unpackhi_epi16(in[12], in[13]);
+ __m256i tr0_e = _mm256_unpacklo_epi16(in[14], in[15]);
+ __m256i tr0_f = _mm256_unpackhi_epi16(in[14], in[15]);
+
+ // 00 10 01 11 02 12 03 13 08 18 09 19 0a 1a 0b 1b
+ // 04 14 05 15 06 16 07 17 0c 1c 0d 1d 0e 1e 0f 1f
+ // 20 30 21 31 22 32 23 33 28 38 29 39 2a 3a 2b 3b
+ // 24 34 25 35 26 36 27 37 2c 3c 2d 3d 2e 3e 2f 3f
+ // 40 50 41 51 42 52 43 53 48 58 49 59 4a 5a 4b 5b
+ // 44 54 45 55 46 56 47 57 4c 5c 4d 5d 4e 5e 4f 5f
+ // 60 70 61 71 62 72 63 73 68 78 69 79 6a 7a 6b 7b
+ // 64 74 65 75 66 76 67 77 6c 7c 6d 7d 6e 7e 6f 7f
+
+ // 80 90 81 91 82 92 83 93 88 98 89 99 8a 9a 8b 9b
+ // 84 94 85 95 86 96 87 97 8c 9c 8d 9d 8e 9e 8f 9f
+ // a0 b0 a1 b1 a2 b2 a3 b3 a8 b8 a9 b9 aa ba ab bb
+ // a4 b4 a5 b5 a6 b6 a7 b7 ac bc ad bd ae be af bf
+ // c0 d0 c1 d1 c2 d2 c3 d3 c8 d8 c9 d9 ca da cb db
+ // c4 d4 c5 d5 c6 d6 c7 d7 cc dc cd dd ce de cf df
+ // e0 f0 e1 f1 e2 f2 e3 f3 e8 f8 e9 f9 ea fa eb fb
+ // e4 f4 e5 f5 e6 f6 e7 f7 ec fc ed fd ee fe ef ff
+
+ __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_2);
+ __m256i tr1_1 = _mm256_unpackhi_epi32(tr0_0, tr0_2);
+ __m256i tr1_2 = _mm256_unpacklo_epi32(tr0_1, tr0_3);
+ __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_1, tr0_3);
+ __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_4, tr0_6);
+ __m256i tr1_5 = _mm256_unpackhi_epi32(tr0_4, tr0_6);
+ __m256i tr1_6 = _mm256_unpacklo_epi32(tr0_5, tr0_7);
+ __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_5, tr0_7);
+
+ __m256i tr1_8 = _mm256_unpacklo_epi32(tr0_8, tr0_a);
+ __m256i tr1_9 = _mm256_unpackhi_epi32(tr0_8, tr0_a);
+ __m256i tr1_a = _mm256_unpacklo_epi32(tr0_9, tr0_b);
+ __m256i tr1_b = _mm256_unpackhi_epi32(tr0_9, tr0_b);
+ __m256i tr1_c = _mm256_unpacklo_epi32(tr0_c, tr0_e);
+ __m256i tr1_d = _mm256_unpackhi_epi32(tr0_c, tr0_e);
+ __m256i tr1_e = _mm256_unpacklo_epi32(tr0_d, tr0_f);
+ __m256i tr1_f = _mm256_unpackhi_epi32(tr0_d, tr0_f);
+
+ // 00 10 20 30 01 11 21 31 08 18 28 38 09 19 29 39
+ // 02 12 22 32 03 13 23 33 0a 1a 2a 3a 0b 1b 2b 3b
+ // 04 14 24 34 05 15 25 35 0c 1c 2c 3c 0d 1d 2d 3d
+ // 06 16 26 36 07 17 27 37 0e 1e 2e 3e 0f 1f 2f 3f
+ // 40 50 60 70 41 51 61 71 48 58 68 78 49 59 69 79
+ // 42 52 62 72 43 53 63 73 4a 5a 6a 7a 4b 5b 6b 7b
+ // 44 54 64 74 45 55 65 75 4c 5c 6c 7c 4d 5d 6d 7d
+ // 46 56 66 76 47 57 67 77 4e 5e 6e 7e 4f 5f 6f 7f
+
+ // 80 90 a0 b0 81 91 a1 b1 88 98 a8 b8 89 99 a9 b9
+ // 82 92 a2 b2 83 93 a3 b3 8a 9a aa ba 8b 9b ab bb
+ // 84 94 a4 b4 85 95 a5 b5 8c 9c ac bc 8d 9d ad bd
+  // 86 96 a6 b6 87 97 a7 b7 8e 9e ae be 8f 9f af bf
+ // c0 d0 e0 f0 c1 d1 e1 f1 c8 d8 e8 f8 c9 d9 e9 f9
+ // c2 d2 e2 f2 c3 d3 e3 f3 ca da ea fa cb db eb fb
+  // c4 d4 e4 f4 c5 d5 e5 f5 cc dc ec fc cd dd ed fd
+ // c6 d6 e6 f6 c7 d7 e7 f7 ce de ee fe cf df ef ff
+
+ tr0_0 = _mm256_unpacklo_epi64(tr1_0, tr1_4);
+ tr0_1 = _mm256_unpackhi_epi64(tr1_0, tr1_4);
+ tr0_2 = _mm256_unpacklo_epi64(tr1_1, tr1_5);
+ tr0_3 = _mm256_unpackhi_epi64(tr1_1, tr1_5);
+ tr0_4 = _mm256_unpacklo_epi64(tr1_2, tr1_6);
+ tr0_5 = _mm256_unpackhi_epi64(tr1_2, tr1_6);
+ tr0_6 = _mm256_unpacklo_epi64(tr1_3, tr1_7);
+ tr0_7 = _mm256_unpackhi_epi64(tr1_3, tr1_7);
+
+ tr0_8 = _mm256_unpacklo_epi64(tr1_8, tr1_c);
+ tr0_9 = _mm256_unpackhi_epi64(tr1_8, tr1_c);
+ tr0_a = _mm256_unpacklo_epi64(tr1_9, tr1_d);
+ tr0_b = _mm256_unpackhi_epi64(tr1_9, tr1_d);
+ tr0_c = _mm256_unpacklo_epi64(tr1_a, tr1_e);
+ tr0_d = _mm256_unpackhi_epi64(tr1_a, tr1_e);
+ tr0_e = _mm256_unpacklo_epi64(tr1_b, tr1_f);
+ tr0_f = _mm256_unpackhi_epi64(tr1_b, tr1_f);
+
+ // 00 10 20 30 40 50 60 70 08 18 28 38 48 58 68 78
+ // 01 11 21 31 41 51 61 71 09 19 29 39 49 59 69 79
+ // 02 12 22 32 42 52 62 72 0a 1a 2a 3a 4a 5a 6a 7a
+ // 03 13 23 33 43 53 63 73 0b 1b 2b 3b 4b 5b 6b 7b
+ // 04 14 24 34 44 54 64 74 0c 1c 2c 3c 4c 5c 6c 7c
+ // 05 15 25 35 45 55 65 75 0d 1d 2d 3d 4d 5d 6d 7d
+ // 06 16 26 36 46 56 66 76 0e 1e 2e 3e 4e 5e 6e 7e
+ // 07 17 27 37 47 57 67 77 0f 1f 2f 3f 4f 5f 6f 7f
+
+ // 80 90 a0 b0 c0 d0 e0 f0 88 98 a8 b8 c8 d8 e8 f8
+ // 81 91 a1 b1 c1 d1 e1 f1 89 99 a9 b9 c9 d9 e9 f9
+ // 82 92 a2 b2 c2 d2 e2 f2 8a 9a aa ba ca da ea fa
+ // 83 93 a3 b3 c3 d3 e3 f3 8b 9b ab bb cb db eb fb
+  // 84 94 a4 b4 c4 d4 e4 f4 8c 9c ac bc cc dc ec fc
+ // 85 95 a5 b5 c5 d5 e5 f5 8d 9d ad bd cd dd ed fd
+  // 86 96 a6 b6 c6 d6 e6 f6 8e 9e ae be ce de ee fe
+ // 87 97 a7 b7 c7 d7 e7 f7 8f 9f af bf cf df ef ff
+
+ out[0] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x20); // 0010 0000
+ out[8] = _mm256_permute2x128_si256(tr0_0, tr0_8, 0x31); // 0011 0001
+ out[1] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x20);
+ out[9] = _mm256_permute2x128_si256(tr0_1, tr0_9, 0x31);
+ out[2] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x20);
+ out[10] = _mm256_permute2x128_si256(tr0_2, tr0_a, 0x31);
+ out[3] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x20);
+ out[11] = _mm256_permute2x128_si256(tr0_3, tr0_b, 0x31);
+
+ out[4] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x20);
+ out[12] = _mm256_permute2x128_si256(tr0_4, tr0_c, 0x31);
+ out[5] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x20);
+ out[13] = _mm256_permute2x128_si256(tr0_5, tr0_d, 0x31);
+ out[6] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x20);
+ out[14] = _mm256_permute2x128_si256(tr0_6, tr0_e, 0x31);
+ out[7] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x20);
+ out[15] = _mm256_permute2x128_si256(tr0_7, tr0_f, 0x31);
+}
+#endif // AOM_AOM_DSP_X86_COMMON_AVX2_H_
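As the note above says, mm256_transpose_16x16() may be called with in == out; it transposes a 16x16 matrix of 16-bit values, one row per __m256i, through three unpack stages and a final 128-bit permute. A small usage sketch (illustrative only, assuming common_avx2.h and hence <immintrin.h> are included), relying on the fact that transposing twice restores the original matrix:

#include <stdint.h>
#include <string.h>

static int transpose_16x16_roundtrip_ok(const int16_t mat[16][16]) {
  __m256i rows[16];
  memcpy(rows, mat, sizeof(rows));    // one 16-element row per register
  mm256_transpose_16x16(rows, rows);  // in == out is allowed
  mm256_transpose_16x16(rows, rows);  // transposing twice is the identity
  return memcmp(rows, mat, sizeof(rows)) == 0;
}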
diff --git a/third_party/aom/aom_dsp/x86/convolve.h b/third_party/aom/aom_dsp/x86/convolve.h
new file mode 100644
index 0000000000..4ca214f469
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve.h
@@ -0,0 +1,204 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_H_
+
+#include <assert.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_ports/mem.h"
+
+typedef void filter8_1dfunction(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+ uint8_t *output_ptr, ptrdiff_t out_pitch,
+ uint32_t output_height, const int16_t *filter);
+
+#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+ void aom_convolve8_##name##_##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
+ const int16_t *filter_y, int y_step_q4, int w, int h) { \
+ (void)filter_x; \
+ (void)x_step_q4; \
+ (void)filter_y; \
+ (void)y_step_q4; \
+ assert((-128 <= filter[3]) && (filter[3] <= 127)); \
+ assert(step_q4 == 16); \
+ if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) && \
+ (filter[2] | filter[5])) { \
+ while (w >= 16) { \
+ aom_filter_block1d16_##dir##4_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ aom_filter_block1d8_##dir##4_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ aom_filter_block1d4_##dir##4_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } else if (filter[0] | filter[1] | filter[2]) { \
+ while (w >= 16) { \
+ aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } else { \
+ while (w >= 16) { \
+ aom_filter_block1d16_##dir##2_##avg##opt(src, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ aom_filter_block1d8_##dir##2_##avg##opt(src, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ aom_filter_block1d4_##dir##2_##avg##opt(src, src_stride, dst, \
+ dst_stride, h, filter); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } \
+ if (w) { \
+ aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x, \
+ x_step_q4, filter_y, y_step_q4, w, h); \
+ } \
+ }
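FUN_CONV_1D picks a kernel width from the shape of the 8-tap filter: a zero outer quad together with non-zero taps 2/5 routes to the 4-tap blocks, any non-zero tap among 0..2 routes to the 8-tap blocks, and everything else falls through to the 2-tap (bilinear) blocks. A standalone sketch of just that selection logic (hypothetical helper, not part of the library):

#include <stdint.h>

static int filter_taps_needed(const int16_t *filter /* 8 taps */) {
  if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) &&
      (filter[2] | filter[5]))
    return 4;  // outer taps are zero; taps 2..5 carry the kernel
  if (filter[0] | filter[1] | filter[2]) return 8;  // full 8-tap kernel
  return 2;  // only the two central taps are non-zero (bilinear)
}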
+
+#if CONFIG_AV1_HIGHBITDEPTH
+typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
+ const ptrdiff_t src_pitch,
+ uint16_t *output_ptr,
+ ptrdiff_t out_pitch,
+ unsigned int output_height,
+ const int16_t *filter, int bd);
+
+#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+ void aom_highbd_convolve8_##name##_##opt( \
+ const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, \
+ ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
+ const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ if (step_q4 == 16 && filter[3] != 128) { \
+ if (((filter[0] | filter[1] | filter[6] | filter[7]) == 0) && \
+ (filter[2] | filter[5])) { \
+ while (w >= 16) { \
+ aom_highbd_filter_block1d16_##dir##4_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ aom_highbd_filter_block1d8_##dir##4_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ aom_highbd_filter_block1d4_##dir##4_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } else if (filter[0] | filter[1] | filter[2]) { \
+ while (w >= 16) { \
+ aom_highbd_filter_block1d16_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ aom_highbd_filter_block1d8_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ aom_highbd_filter_block1d4_##dir##8_##avg##opt( \
+ src_start, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } else { \
+ while (w >= 16) { \
+ aom_highbd_filter_block1d16_##dir##2_##avg##opt( \
+ src, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ aom_highbd_filter_block1d8_##dir##2_##avg##opt( \
+ src, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ aom_highbd_filter_block1d4_##dir##2_##avg##opt( \
+ src, src_stride, dst, dst_stride, h, filter, bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } \
+ } \
+ if (w) { \
+ aom_highbd_convolve8_##name##_c( \
+ CONVERT_TO_BYTEPTR(src), src_stride, CONVERT_TO_BYTEPTR(dst), \
+ dst_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd); \
+ } \
+ }
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+#endif // AOM_AOM_DSP_X86_CONVOLVE_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_avx2.h b/third_party/aom/aom_dsp/x86/convolve_avx2.h
new file mode 100644
index 0000000000..f5a382ce4e
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve_avx2.h
@@ -0,0 +1,922 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
+
+#include <immintrin.h>
+
+#include "aom_ports/mem.h"
+
+#include "av1/common/convolve.h"
+#include "av1/common/filter.h"
+
+// Byte-shuffle masks for the horizontal filters: four 32-byte patterns that
+// gather overlapping source byte pairs at offsets 0, 2, 4 and 6.
+DECLARE_ALIGNED(32, static const uint8_t, filt_global_avx2[]) = {
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1,
+ 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5,
+ 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6,
+ 7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
+ 10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11,
+ 12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7,
+ 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[]) = {
+ 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 0, 1, 2, 3, 1, 2,
+ 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9,
+ 7, 8, 9, 10, 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10,
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt4_d4_global_avx2[]) = {
+ 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
+ 2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt_center_global_avx2[32]) = {
+ 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255,
+ 3, 255, 4, 255, 5, 255, 6, 255, 7, 255, 8, 255, 9, 255, 10, 255
+};
+
+DECLARE_ALIGNED(32, static const uint8_t,
+ filt1_global_avx2[32]) = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,
+ 6, 6, 7, 7, 8, 0, 1, 1, 2, 2, 3,
+ 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+
+DECLARE_ALIGNED(32, static const uint8_t,
+ filt2_global_avx2[32]) = { 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,
+ 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5,
+ 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 };
+
+DECLARE_ALIGNED(32, static const uint8_t, filt3_global_avx2[32]) = {
+ 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,
+ 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
+};
+
+DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,
+ 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
+};
+
+#define CONVOLVE_SR_HORIZONTAL_FILTER_4TAP \
+ for (i = 0; i < (im_h - 2); i += 2) { \
+ __m256i data = _mm256_castsi128_si256( \
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \
+ data = _mm256_inserti128_si256( \
+ data, \
+ _mm_loadu_si128( \
+ (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), \
+ 1); \
+ __m256i res = convolve_lowbd_x_4tap(data, coeffs_h + 1, filt); \
+ res = \
+ _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \
+ } \
+ __m256i data_1 = _mm256_castsi128_si256( \
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \
+ __m256i res = convolve_lowbd_x_4tap(data_1, coeffs_h + 1, filt); \
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+
+#define CONVOLVE_SR_VERTICAL_FILTER_4TAP \
+ __m256i s[6]; \
+ __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \
+ __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \
+ __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \
+ __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \
+ \
+ s[0] = _mm256_unpacklo_epi16(src_0, src_1); \
+ s[1] = _mm256_unpacklo_epi16(src_2, src_3); \
+ s[3] = _mm256_unpackhi_epi16(src_0, src_1); \
+ s[4] = _mm256_unpackhi_epi16(src_2, src_3); \
+ \
+ for (i = 0; i < h; i += 2) { \
+ const int16_t *data = &im_block[i * im_stride]; \
+ const __m256i s4 = _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); \
+ const __m256i s5 = _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); \
+ s[2] = _mm256_unpacklo_epi16(s4, s5); \
+ s[5] = _mm256_unpackhi_epi16(s4, s5); \
+ \
+ __m256i res_a = convolve_4tap(s, coeffs_v + 1); \
+ __m256i res_b = convolve_4tap(s + 3, coeffs_v + 1); \
+ \
+ res_a = \
+ _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \
+ res_b = \
+ _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \
+ const __m256i res_a_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_a, round_const_v), round_shift_v); \
+ const __m256i res_b_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_b, round_const_v), round_shift_v); \
+ const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \
+ const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b); \
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \
+ \
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \
+ if (w - j > 4) { \
+ _mm_storel_epi64(p_0, res_0); \
+ _mm_storel_epi64(p_1, res_1); \
+ } else if (w == 4) { \
+ xx_storel_32(p_0, res_0); \
+ xx_storel_32(p_1, res_1); \
+ } else { \
+ *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); \
+ *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); \
+ } \
+ \
+ s[0] = s[1]; \
+ s[1] = s[2]; \
+ s[3] = s[4]; \
+ s[4] = s[5]; \
+ }
+
+#define CONVOLVE_SR_HORIZONTAL_FILTER_6TAP \
+ for (i = 0; i < (im_h - 2); i += 2) { \
+ __m256i data = _mm256_castsi128_si256( \
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \
+ data = _mm256_inserti128_si256( \
+ data, \
+ _mm_loadu_si128( \
+ (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), \
+ 1); \
+ \
+ __m256i res = convolve_lowbd_x_6tap(data, coeffs_h, filt); \
+ res = \
+ _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \
+ } \
+ \
+ __m256i data_1 = _mm256_castsi128_si256( \
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \
+ \
+ __m256i res = convolve_lowbd_x_6tap(data_1, coeffs_h, filt); \
+ \
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+ \
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+
+#define CONVOLVE_SR_VERTICAL_FILTER_6TAP \
+ __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \
+ __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \
+ __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \
+ __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \
+ \
+ __m256i s[8]; \
+ s[0] = _mm256_unpacklo_epi16(src_0, src_1); \
+ s[1] = _mm256_unpacklo_epi16(src_2, src_3); \
+ \
+ s[3] = _mm256_unpackhi_epi16(src_0, src_1); \
+ s[4] = _mm256_unpackhi_epi16(src_2, src_3); \
+ \
+ for (i = 0; i < h; i += 2) { \
+ const int16_t *data = &im_block[i * im_stride]; \
+ \
+ const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 4 * im_stride)); \
+ const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 5 * im_stride)); \
+ \
+ s[2] = _mm256_unpacklo_epi16(s6, s7); \
+ s[5] = _mm256_unpackhi_epi16(s6, s7); \
+ \
+ __m256i res_a = convolve_6tap(s, coeffs_v); \
+ __m256i res_b = convolve_6tap(s + 3, coeffs_v); \
+ \
+ res_a = \
+ _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \
+ res_b = \
+ _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \
+ \
+ const __m256i res_a_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_a, round_const_v), round_shift_v); \
+ const __m256i res_b_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_b, round_const_v), round_shift_v); \
+ \
+ const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \
+ const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \
+ \
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b); \
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \
+ \
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \
+ if (w - j > 4) { \
+ _mm_storel_epi64(p_0, res_0); \
+ _mm_storel_epi64(p_1, res_1); \
+ } else if (w == 4) { \
+ xx_storel_32(p_0, res_0); \
+ xx_storel_32(p_1, res_1); \
+ } else { \
+ *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); \
+ *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); \
+ } \
+ \
+ s[0] = s[1]; \
+ s[1] = s[2]; \
+ \
+ s[3] = s[4]; \
+ s[4] = s[5]; \
+ }
+
+#define CONVOLVE_SR_HORIZONTAL_FILTER_8TAP \
+ for (i = 0; i < (im_h - 2); i += 2) { \
+ __m256i data = _mm256_castsi128_si256( \
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \
+ data = _mm256_inserti128_si256( \
+ data, \
+ _mm_loadu_si128( \
+ (__m128i *)&src_ptr[(i * src_stride) + j + src_stride]), \
+ 1); \
+ \
+ __m256i res = convolve_lowbd_x(data, coeffs_h, filt); \
+ res = \
+ _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \
+ } \
+ \
+ __m256i data_1 = _mm256_castsi128_si256( \
+ _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + j])); \
+ \
+ __m256i res = convolve_lowbd_x(data_1, coeffs_h, filt); \
+ \
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), round_shift_h); \
+ \
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
+
+#define CONVOLVE_SR_VERTICAL_FILTER_8TAP \
+ __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \
+ __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \
+ __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \
+ __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \
+ __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \
+ __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \
+ \
+ __m256i s[8]; \
+ s[0] = _mm256_unpacklo_epi16(src_0, src_1); \
+ s[1] = _mm256_unpacklo_epi16(src_2, src_3); \
+ s[2] = _mm256_unpacklo_epi16(src_4, src_5); \
+ \
+ s[4] = _mm256_unpackhi_epi16(src_0, src_1); \
+ s[5] = _mm256_unpackhi_epi16(src_2, src_3); \
+ s[6] = _mm256_unpackhi_epi16(src_4, src_5); \
+ \
+ for (i = 0; i < h; i += 2) { \
+ const int16_t *data = &im_block[i * im_stride]; \
+ \
+ const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \
+ const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \
+ \
+ s[3] = _mm256_unpacklo_epi16(s6, s7); \
+ s[7] = _mm256_unpackhi_epi16(s6, s7); \
+ \
+ __m256i res_a = convolve(s, coeffs_v); \
+ __m256i res_b = convolve(s + 4, coeffs_v); \
+ \
+ res_a = \
+ _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \
+ res_b = \
+ _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \
+ \
+ const __m256i res_a_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_a, round_const_v), round_shift_v); \
+ const __m256i res_b_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_b, round_const_v), round_shift_v); \
+ \
+ const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \
+ const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \
+ \
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b); \
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \
+ \
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \
+ if (w - j > 4) { \
+ _mm_storel_epi64(p_0, res_0); \
+ _mm_storel_epi64(p_1, res_1); \
+ } else if (w == 4) { \
+ xx_storel_32(p_0, res_0); \
+ xx_storel_32(p_1, res_1); \
+ } else { \
+ *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); \
+ *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); \
+ } \
+ \
+ s[0] = s[1]; \
+ s[1] = s[2]; \
+ s[2] = s[3]; \
+ \
+ s[4] = s[5]; \
+ s[5] = s[6]; \
+ s[6] = s[7]; \
+ }
+
+#define CONVOLVE_SR_HORIZONTAL_FILTER_12TAP \
+ const __m256i v_zero = _mm256_setzero_si256(); \
+ __m256i s[12]; \
+ if (w <= 4) { \
+ for (i = 0; i < im_h; i += 2) { \
+ const __m256i data = _mm256_permute2x128_si256( \
+ _mm256_castsi128_si256( \
+ _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))), \
+ _mm256_castsi128_si256(_mm_loadu_si128( \
+ (__m128i *)(&src_ptr[i * src_stride + src_stride + j]))), \
+ 0x20); \
+ const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero); \
+ const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero); \
+ const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo); \
+ const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo); \
+ \
+ const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi); \
+ const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi); \
+ \
+ s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2); \
+ s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10); \
+ s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2); \
+ s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10); \
+ s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2); \
+ s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10); \
+ \
+ const __m256i res_lo = convolve_12taps(s, coeffs_h); \
+ \
+ __m256i res_32b_lo = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_lo, round_const_h12), round_shift_h12); \
+ __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); \
+ const __m128i res_0 = _mm256_extracti128_si256(res_16b_lo, 0); \
+ const __m128i res_1 = _mm256_extracti128_si256(res_16b_lo, 1); \
+ if (w > 2) { \
+ _mm_storel_epi64((__m128i *)&im_block[i * im_stride], res_0); \
+ _mm_storel_epi64((__m128i *)&im_block[i * im_stride + im_stride], \
+ res_1); \
+ } else { \
+ uint32_t horiz_2; \
+ horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_0); \
+ im_block[i * im_stride] = (uint16_t)horiz_2; \
+ im_block[i * im_stride + 1] = (uint16_t)(horiz_2 >> 16); \
+ horiz_2 = (uint32_t)_mm_cvtsi128_si32(res_1); \
+ im_block[i * im_stride + im_stride] = (uint16_t)horiz_2; \
+ im_block[i * im_stride + im_stride + 1] = (uint16_t)(horiz_2 >> 16); \
+ } \
+ } \
+ } else { \
+ for (i = 0; i < im_h; i++) { \
+ const __m256i data = _mm256_permute2x128_si256( \
+ _mm256_castsi128_si256( \
+ _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))), \
+ _mm256_castsi128_si256( \
+ _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j + 4]))), \
+ 0x20); \
+ const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero); \
+ const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero); \
+ \
+ const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo); \
+ const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo); \
+ \
+ const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi); \
+ const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi); \
+ \
+ s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2); \
+ s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10); \
+ s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2); \
+ s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10); \
+ s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2); \
+ s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10); \
+ \
+ const __m256i res_lo = convolve_12taps(s, coeffs_h); \
+ \
+ __m256i res_32b_lo = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_lo, round_const_h12), round_shift_h12); \
+ \
+ __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); \
+ _mm_store_si128((__m128i *)&im_block[i * im_stride], \
+ _mm256_extracti128_si256( \
+ _mm256_permute4x64_epi64(res_16b_lo, 0x88), 0)); \
+ } \
+ }
+
+#define CONVOLVE_SR_VERTICAL_FILTER_12TAP \
+ __m256i src_0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \
+ __m256i src_1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \
+ __m256i src_2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \
+ __m256i src_3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \
+ __m256i src_4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \
+ __m256i src_5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \
+ __m256i src_6 = _mm256_loadu_si256((__m256i *)(im_block + 6 * im_stride)); \
+ __m256i src_7 = _mm256_loadu_si256((__m256i *)(im_block + 7 * im_stride)); \
+ __m256i src_8 = _mm256_loadu_si256((__m256i *)(im_block + 8 * im_stride)); \
+ __m256i src_9 = _mm256_loadu_si256((__m256i *)(im_block + 9 * im_stride)); \
+ \
+ s[0] = _mm256_unpacklo_epi16(src_0, src_1); \
+ s[1] = _mm256_unpacklo_epi16(src_2, src_3); \
+ s[2] = _mm256_unpacklo_epi16(src_4, src_5); \
+ s[3] = _mm256_unpacklo_epi16(src_6, src_7); \
+ s[4] = _mm256_unpacklo_epi16(src_8, src_9); \
+ \
+ s[6] = _mm256_unpackhi_epi16(src_0, src_1); \
+ s[7] = _mm256_unpackhi_epi16(src_2, src_3); \
+ s[8] = _mm256_unpackhi_epi16(src_4, src_5); \
+ s[9] = _mm256_unpackhi_epi16(src_6, src_7); \
+ s[10] = _mm256_unpackhi_epi16(src_8, src_9); \
+ \
+ for (i = 0; i < h; i += 2) { \
+ const int16_t *data = &im_block[i * im_stride]; \
+ \
+ const __m256i s6 = _mm256_loadu_si256((__m256i *)(data + 10 * im_stride)); \
+ const __m256i s7 = _mm256_loadu_si256((__m256i *)(data + 11 * im_stride)); \
+ \
+ s[5] = _mm256_unpacklo_epi16(s6, s7); \
+ s[11] = _mm256_unpackhi_epi16(s6, s7); \
+ \
+ __m256i res_a = convolve_12taps(s, coeffs_v); \
+ __m256i res_b = convolve_12taps(s + 6, coeffs_v); \
+ \
+ res_a = \
+ _mm256_sra_epi32(_mm256_add_epi32(res_a, sum_round_v), sum_shift_v); \
+ res_b = \
+ _mm256_sra_epi32(_mm256_add_epi32(res_b, sum_round_v), sum_shift_v); \
+ \
+ const __m256i res_a_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_a, round_const_v), round_shift_v); \
+ const __m256i res_b_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_b, round_const_v), round_shift_v); \
+ \
+ const __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round); \
+ const __m256i res_8b = _mm256_packus_epi16(res_16bit, res_16bit); \
+ \
+ const __m128i res_0 = _mm256_castsi256_si128(res_8b); \
+ const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); \
+ \
+ __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; \
+ __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + j + dst_stride]; \
+ if (w - j > 4) { \
+ _mm_storel_epi64(p_0, res_0); \
+ _mm_storel_epi64(p_1, res_1); \
+ } else if (w == 4) { \
+ xx_storel_32(p_0, res_0); \
+ xx_storel_32(p_1, res_1); \
+ } else { \
+ *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); \
+ *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); \
+ } \
+ \
+ s[0] = s[1]; \
+ s[1] = s[2]; \
+ s[2] = s[3]; \
+ s[3] = s[4]; \
+ s[4] = s[5]; \
+ \
+ s[6] = s[7]; \
+ s[7] = s[8]; \
+ s[8] = s[9]; \
+ s[9] = s[10]; \
+ s[10] = s[11]; \
+ }
+
+#define DIST_WTD_CONVOLVE_HORIZONTAL_FILTER_8TAP \
+ do { \
+ for (i = 0; i < im_h; i += 2) { \
+ __m256i data = \
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)src_h)); \
+ if (i + 1 < im_h) \
+ data = _mm256_inserti128_si256( \
+ data, _mm_loadu_si128((__m128i *)(src_h + src_stride)), 1); \
+ src_h += (src_stride << 1); \
+ __m256i res = convolve_lowbd_x(data, coeffs_x, filt); \
+ \
+ res = _mm256_sra_epi16(_mm256_add_epi16(res, round_const_h), \
+ round_shift_h); \
+ \
+ _mm256_store_si256((__m256i *)&im_block[i * im_stride], res); \
+ } \
+ } while (0)
+
+#define DIST_WTD_CONVOLVE_VERTICAL_FILTER_8TAP \
+ do { \
+ __m256i s[8]; \
+ __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride)); \
+ __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride)); \
+ __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride)); \
+ __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride)); \
+ __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride)); \
+ __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride)); \
+ \
+ s[0] = _mm256_unpacklo_epi16(s0, s1); \
+ s[1] = _mm256_unpacklo_epi16(s2, s3); \
+ s[2] = _mm256_unpacklo_epi16(s4, s5); \
+ \
+ s[4] = _mm256_unpackhi_epi16(s0, s1); \
+ s[5] = _mm256_unpackhi_epi16(s2, s3); \
+ s[6] = _mm256_unpackhi_epi16(s4, s5); \
+ \
+ for (i = 0; i < h; i += 2) { \
+ const int16_t *data = &im_block[i * im_stride]; \
+ \
+ const __m256i s6 = \
+ _mm256_loadu_si256((__m256i *)(data + 6 * im_stride)); \
+ const __m256i s7 = \
+ _mm256_loadu_si256((__m256i *)(data + 7 * im_stride)); \
+ \
+ s[3] = _mm256_unpacklo_epi16(s6, s7); \
+ s[7] = _mm256_unpackhi_epi16(s6, s7); \
+ \
+ const __m256i res_a = convolve(s, coeffs_y); \
+ const __m256i res_a_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_a, round_const_v), round_shift_v); \
+ \
+ if (w - j > 4) { \
+ const __m256i res_b = convolve(s + 4, coeffs_y); \
+ const __m256i res_b_round = _mm256_sra_epi32( \
+ _mm256_add_epi32(res_b, round_const_v), round_shift_v); \
+ const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_b_round); \
+ const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \
+ \
+ if (do_average) { \
+ const __m256i data_ref_0 = \
+ load_line2_avx2(&dst[i * dst_stride + j], \
+ &dst[i * dst_stride + j + dst_stride]); \
+ const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, \
+ &wt, use_dist_wtd_comp_avg); \
+ \
+ const __m256i round_result = convolve_rounding( \
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift); \
+ \
+ const __m256i res_8 = \
+ _mm256_packus_epi16(round_result, round_result); \
+ const __m128i res_0 = _mm256_castsi256_si128(res_8); \
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \
+ \
+ _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0); \
+ _mm_storel_epi64( \
+ (__m128i *)((&dst0[i * dst_stride0 + j + dst_stride0])), res_1); \
+ } else { \
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \
+ \
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \
+ res_1); \
+ } \
+ } else { \
+ const __m256i res_16b = _mm256_packs_epi32(res_a_round, res_a_round); \
+ const __m256i res_unsigned = _mm256_add_epi16(res_16b, offset_const); \
+ \
+ if (do_average) { \
+ const __m256i data_ref_0 = \
+ load_line2_avx2(&dst[i * dst_stride + j], \
+ &dst[i * dst_stride + j + dst_stride]); \
+ \
+ const __m256i comp_avg_res = comp_avg(&data_ref_0, &res_unsigned, \
+ &wt, use_dist_wtd_comp_avg); \
+ \
+ const __m256i round_result = convolve_rounding( \
+ &comp_avg_res, &offset_const, &rounding_const, rounding_shift); \
+ \
+ const __m256i res_8 = \
+ _mm256_packus_epi16(round_result, round_result); \
+ const __m128i res_0 = _mm256_castsi256_si128(res_8); \
+ const __m128i res_1 = _mm256_extracti128_si256(res_8, 1); \
+ \
+ *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_0); \
+ *(int *)(&dst0[i * dst_stride0 + j + dst_stride0]) = \
+ _mm_cvtsi128_si32(res_1); \
+ \
+ } else { \
+ const __m128i res_0 = _mm256_castsi256_si128(res_unsigned); \
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0); \
+ \
+ const __m128i res_1 = _mm256_extracti128_si256(res_unsigned, 1); \
+ _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]), \
+ res_1); \
+ } \
+ } \
+ \
+ s[0] = s[1]; \
+ s[1] = s[2]; \
+ s[2] = s[3]; \
+ \
+ s[4] = s[5]; \
+ s[5] = s[6]; \
+ s[6] = s[7]; \
+ } \
+ } while (0)
+
+static INLINE void prepare_coeffs_lowbd(
+ const InterpFilterParams *const filter_params, const int subpel_q4,
+ __m256i *const coeffs /* [4] */) {
+ const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+ const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
+ const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
+
+  // Right shift all filter coefficients by 1 to reduce the bits required.
+  // This extra right shift will be taken care of at the end while rounding
+  // the result.
+  // Since all filter coefficients are even, this change will not affect the
+  // end result.
+ assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
+ _mm_set1_epi16((short)0xffff)));
+
+ const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0200u));
+ // coeffs 2 3 2 3 2 3 2 3
+ coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0604u));
+ // coeffs 4 5 4 5 4 5 4 5
+ coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0a08u));
+ // coeffs 6 7 6 7 6 7 6 7
+ coeffs[3] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0e0cu));
+}
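The assertion above guarantees every tap is even, so halving the coefficients scales the whole convolution by exactly one half, and the callers compensate by shifting one bit less when they round. A tiny self-contained demonstration using an illustrative (made-up) even kernel:

#include <assert.h>
#include <stdint.h>

static void demo_halved_taps(void) {
  const int16_t taps[8] = { -2, 6, -12, 124, 18, -8, 4, -2 };  // even taps
  const uint8_t px[8] = { 10, 20, 30, 40, 50, 60, 70, 80 };
  int32_t full = 0, halved = 0;
  for (int i = 0; i < 8; ++i) {
    full += taps[i] * px[i];
    halved += (taps[i] >> 1) * px[i];
  }
  // The factor of 2 removed from the coefficients is restored by shifting one
  // bit less during the rounding stage.
  assert(full == 2 * halved);
}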
+
+static INLINE void prepare_coeffs_6t_lowbd(
+ const InterpFilterParams *const filter_params, const int subpel_q4,
+ __m256i *const coeffs /* [4] */) {
+ const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+ const __m128i coeffs_8 = _mm_loadu_si128((__m128i *)filter);
+ const __m256i filter_coeffs = _mm256_broadcastsi128_si256(coeffs_8);
+
+  // Right shift all filter coefficients by 1 to reduce the bits required.
+  // This extra right shift will be taken care of at the end while rounding
+  // the result.
+  // Since all filter coefficients are even, this change will not affect the
+  // end result.
+ assert(_mm_test_all_zeros(_mm_and_si128(coeffs_8, _mm_set1_epi16(1)),
+ _mm_set1_epi16((int16_t)0xffff)));
+
+ const __m256i coeffs_1 = _mm256_srai_epi16(filter_coeffs, 1);
+
+ // coeffs 1 2 1 2 1 2 1 2
+ coeffs[0] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0402u));
+ // coeffs 3 4 3 4 3 4 3 4
+ coeffs[1] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0806u));
+ // coeffs 5 6 5 6 5 6 5 6
+ coeffs[2] = _mm256_shuffle_epi8(coeffs_1, _mm256_set1_epi16(0x0c0au));
+}
+
+static INLINE void prepare_coeffs_6t(
+ const InterpFilterParams *const filter_params, const int subpel_q4,
+ __m256i *const coeffs /* [4] */) {
+ const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+
+ const __m128i coeff_8 = _mm_loadu_si128((__m128i *)(filter + 1));
+ const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
+
+ // coeffs 1 2 1 2 1 2 1 2
+ coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
+ // coeffs 3 4 3 4 3 4 3 4
+ coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
+ // coeffs 5 6 5 6 5 6 5 6
+ coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
+}
+
+static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
+ const int subpel_q4,
+ __m256i *const coeffs /* [4] */) {
+ const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+
+ const __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
+ const __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
+ // coeffs 2 3 2 3 2 3 2 3
+ coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
+ // coeffs 4 5 4 5 4 5 4 5
+ coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
+ // coeffs 6 7 6 7 6 7 6 7
+ coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
+}
+
+static INLINE void prepare_coeffs_12taps(
+ const InterpFilterParams *const filter_params, const int subpel_q4,
+ __m256i *const coeffs /* [4] */) {
+ const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+
+ __m128i coeff_8 = _mm_loadu_si128((__m128i *)filter);
+ __m256i coeff = _mm256_broadcastsi128_si256(coeff_8);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ coeffs[0] = _mm256_shuffle_epi32(coeff, 0x00);
+ // coeffs 2 3 2 3 2 3 2 3
+ coeffs[1] = _mm256_shuffle_epi32(coeff, 0x55);
+ // coeffs 4 5 4 5 4 5 4 5
+ coeffs[2] = _mm256_shuffle_epi32(coeff, 0xaa);
+ // coeffs 6 7 6 7 6 7 6 7
+ coeffs[3] = _mm256_shuffle_epi32(coeff, 0xff);
+ // coeffs 8 9 10 11 0 0 0 0
+ coeff_8 = _mm_loadl_epi64((__m128i *)(filter + 8));
+ coeff = _mm256_broadcastq_epi64(coeff_8);
+ coeffs[4] = _mm256_shuffle_epi32(coeff, 0x00); // coeffs 8 9 8 9 8 9 8 9
+ coeffs[5] = _mm256_shuffle_epi32(coeff, 0x55); // coeffs 10 11 10 11.. 10 11
+}
+
+static INLINE __m256i convolve_lowbd(const __m256i *const s,
+ const __m256i *const coeffs) {
+ const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
+ const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
+ const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
+ const __m256i res_67 = _mm256_maddubs_epi16(s[3], coeffs[3]);
+
+ // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ const __m256i res = _mm256_add_epi16(_mm256_add_epi16(res_01, res_45),
+ _mm256_add_epi16(res_23, res_67));
+
+ return res;
+}
+
+static INLINE __m256i convolve_lowbd_6tap(const __m256i *const s,
+ const __m256i *const coeffs) {
+ const __m256i res_01 = _mm256_maddubs_epi16(s[0], coeffs[0]);
+ const __m256i res_23 = _mm256_maddubs_epi16(s[1], coeffs[1]);
+ const __m256i res_45 = _mm256_maddubs_epi16(s[2], coeffs[2]);
+
+ // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ const __m256i res =
+ _mm256_add_epi16(_mm256_add_epi16(res_01, res_45), res_23);
+
+ return res;
+}
+
+static INLINE __m256i convolve_lowbd_4tap(const __m256i *const s,
+ const __m256i *const coeffs) {
+ const __m256i res_23 = _mm256_maddubs_epi16(s[0], coeffs[0]);
+ const __m256i res_45 = _mm256_maddubs_epi16(s[1], coeffs[1]);
+
+ // order: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+ const __m256i res = _mm256_add_epi16(res_45, res_23);
+
+ return res;
+}
+
+static INLINE __m256i convolve_6tap(const __m256i *const s,
+ const __m256i *const coeffs) {
+ const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
+ const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
+ const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
+
+ const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1), res_2);
+
+ return res;
+}
+
+static INLINE __m256i convolve_12taps(const __m256i *const s,
+ const __m256i *const coeffs) {
+ const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
+ const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
+ const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
+ const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
+ const __m256i res_4 = _mm256_madd_epi16(s[4], coeffs[4]);
+ const __m256i res_5 = _mm256_madd_epi16(s[5], coeffs[5]);
+
+ const __m256i res1 = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
+ _mm256_add_epi32(res_2, res_3));
+ const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_4, res_5), res1);
+
+ return res;
+}
+
+static INLINE __m256i convolve(const __m256i *const s,
+ const __m256i *const coeffs) {
+ const __m256i res_0 = _mm256_madd_epi16(s[0], coeffs[0]);
+ const __m256i res_1 = _mm256_madd_epi16(s[1], coeffs[1]);
+ const __m256i res_2 = _mm256_madd_epi16(s[2], coeffs[2]);
+ const __m256i res_3 = _mm256_madd_epi16(s[3], coeffs[3]);
+
+ const __m256i res = _mm256_add_epi32(_mm256_add_epi32(res_0, res_1),
+ _mm256_add_epi32(res_2, res_3));
+
+ return res;
+}
+
+static INLINE __m256i convolve_4tap(const __m256i *const s,
+ const __m256i *const coeffs) {
+ const __m256i res_1 = _mm256_madd_epi16(s[0], coeffs[0]);
+ const __m256i res_2 = _mm256_madd_epi16(s[1], coeffs[1]);
+
+ const __m256i res = _mm256_add_epi32(res_1, res_2);
+ return res;
+}
+
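+// Horizontal path: filt[] holds byte-shuffle masks that gather, for each
+// output pixel, the source bytes belonging to each pair of filter taps
+// before the multiply-accumulate in convolve_lowbd().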
+static INLINE __m256i convolve_lowbd_x(const __m256i data,
+ const __m256i *const coeffs,
+ const __m256i *const filt) {
+ __m256i s[4];
+
+ s[0] = _mm256_shuffle_epi8(data, filt[0]);
+ s[1] = _mm256_shuffle_epi8(data, filt[1]);
+ s[2] = _mm256_shuffle_epi8(data, filt[2]);
+ s[3] = _mm256_shuffle_epi8(data, filt[3]);
+
+ return convolve_lowbd(s, coeffs);
+}
+
+static INLINE __m256i convolve_lowbd_x_6tap(const __m256i data,
+ const __m256i *const coeffs,
+ const __m256i *const filt) {
+ __m256i s[4];
+
+ s[0] = _mm256_shuffle_epi8(data, filt[0]);
+ s[1] = _mm256_shuffle_epi8(data, filt[1]);
+ s[2] = _mm256_shuffle_epi8(data, filt[2]);
+
+ return convolve_lowbd_6tap(s, coeffs);
+}
+
+static INLINE __m256i convolve_lowbd_x_4tap(const __m256i data,
+ const __m256i *const coeffs,
+ const __m256i *const filt) {
+ __m256i s[2];
+
+ s[0] = _mm256_shuffle_epi8(data, filt[0]);
+ s[1] = _mm256_shuffle_epi8(data, filt[1]);
+
+ return convolve_lowbd_4tap(s, coeffs);
+}
+
+static INLINE void add_store_aligned_256(CONV_BUF_TYPE *const dst,
+ const __m256i *const res,
+ const int do_average) {
+ __m256i d;
+ if (do_average) {
+ d = _mm256_load_si256((__m256i *)dst);
+ d = _mm256_add_epi32(d, *res);
+ d = _mm256_srai_epi32(d, 1);
+ } else {
+ d = *res;
+ }
+ _mm256_store_si256((__m256i *)dst, d);
+}
+
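+// Compound average of two 16-bit predictions: either a distance-weighted
+// blend (the two weights are interleaved in *wt for _mm256_madd_epi16 and the
+// result is rescaled by DIST_PRECISION_BITS) or a simple (ref + res) >> 1.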
+static INLINE __m256i comp_avg(const __m256i *const data_ref_0,
+ const __m256i *const res_unsigned,
+ const __m256i *const wt,
+ const int use_dist_wtd_comp_avg) {
+ __m256i res;
+ if (use_dist_wtd_comp_avg) {
+ const __m256i data_lo = _mm256_unpacklo_epi16(*data_ref_0, *res_unsigned);
+ const __m256i data_hi = _mm256_unpackhi_epi16(*data_ref_0, *res_unsigned);
+
+ const __m256i wt_res_lo = _mm256_madd_epi16(data_lo, *wt);
+ const __m256i wt_res_hi = _mm256_madd_epi16(data_hi, *wt);
+
+ const __m256i res_lo = _mm256_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
+ const __m256i res_hi = _mm256_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
+
+ res = _mm256_packs_epi32(res_lo, res_hi);
+ } else {
+ const __m256i wt_res = _mm256_add_epi16(*data_ref_0, *res_unsigned);
+ res = _mm256_srai_epi16(wt_res, 1);
+ }
+ return res;
+}
+
+static INLINE __m256i convolve_rounding(const __m256i *const res_unsigned,
+ const __m256i *const offset_const,
+ const __m256i *const round_const,
+ const int round_shift) {
+ const __m256i res_signed = _mm256_sub_epi16(*res_unsigned, *offset_const);
+ const __m256i res_round = _mm256_srai_epi16(
+ _mm256_add_epi16(res_signed, *round_const), round_shift);
+ return res_round;
+}
+
+static INLINE __m256i highbd_comp_avg(const __m256i *const data_ref_0,
+ const __m256i *const res_unsigned,
+ const __m256i *const wt0,
+ const __m256i *const wt1,
+ const int use_dist_wtd_comp_avg) {
+ __m256i res;
+ if (use_dist_wtd_comp_avg) {
+ const __m256i wt0_res = _mm256_mullo_epi32(*data_ref_0, *wt0);
+ const __m256i wt1_res = _mm256_mullo_epi32(*res_unsigned, *wt1);
+ const __m256i wt_res = _mm256_add_epi32(wt0_res, wt1_res);
+ res = _mm256_srai_epi32(wt_res, DIST_PRECISION_BITS);
+ } else {
+ const __m256i wt_res = _mm256_add_epi32(*data_ref_0, *res_unsigned);
+ res = _mm256_srai_epi32(wt_res, 1);
+ }
+ return res;
+}
+
+static INLINE __m256i highbd_convolve_rounding(
+ const __m256i *const res_unsigned, const __m256i *const offset_const,
+ const __m256i *const round_const, const int round_shift) {
+ const __m256i res_signed = _mm256_sub_epi32(*res_unsigned, *offset_const);
+ const __m256i res_round = _mm256_srai_epi32(
+ _mm256_add_epi32(res_signed, *round_const), round_shift);
+
+ return res_round;
+}
+
+#endif // AOM_AOM_DSP_X86_CONVOLVE_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_common_intrin.h b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h
new file mode 100644
index 0000000000..9e8662af46
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve_common_intrin.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
+
+// Note:
+// This header file must be included after any x86 intrinsics header file.
+
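+// Stores *res to dst; if do_average is set, the value already in dst is
+// added first and the sum is halved with an arithmetic right shift.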
+static INLINE void add_store(CONV_BUF_TYPE *const dst, const __m128i *const res,
+ const int do_average) {
+ __m128i d;
+ if (do_average) {
+ d = _mm_load_si128((__m128i *)dst);
+ d = _mm_add_epi32(d, *res);
+ d = _mm_srai_epi32(d, 1);
+ } else {
+ d = *res;
+ }
+ _mm_store_si128((__m128i *)dst, d);
+}
+
+static INLINE void prepare_coeffs_12tap(const InterpFilterParams *filter_params,
+ int subpel_q4,
+ __m128i *coeffs /* [6] */) {
+ const int16_t *const y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+
+ __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+
+ coeffs[0] = _mm_shuffle_epi32(coeffs_y, 0); // coeffs 0 1 0 1 0 1 0 1
+ coeffs[1] = _mm_shuffle_epi32(coeffs_y, 85); // coeffs 2 3 2 3 2 3 2 3
+ coeffs[2] = _mm_shuffle_epi32(coeffs_y, 170); // coeffs 4 5 4 5 4 5 4 5
+ coeffs[3] = _mm_shuffle_epi32(coeffs_y, 255); // coeffs 6 7 6 7 6 7 6 7
+
+ coeffs_y = _mm_loadl_epi64((__m128i *)(y_filter + 8));
+
+ coeffs[4] = _mm_shuffle_epi32(coeffs_y, 0); // coeffs 8 9 8 9 8 9 8 9
+ coeffs[5] =
+ _mm_shuffle_epi32(coeffs_y, 85); // coeffs 10 11 10 11 10 11 10 11
+}
+
+static INLINE __m128i convolve_12tap(const __m128i *s, const __m128i *coeffs) {
+ const __m128i d0 = _mm_madd_epi16(s[0], coeffs[0]);
+ const __m128i d1 = _mm_madd_epi16(s[1], coeffs[1]);
+ const __m128i d2 = _mm_madd_epi16(s[2], coeffs[2]);
+ const __m128i d3 = _mm_madd_epi16(s[3], coeffs[3]);
+ const __m128i d4 = _mm_madd_epi16(s[4], coeffs[4]);
+ const __m128i d5 = _mm_madd_epi16(s[5], coeffs[5]);
+ const __m128i d_0123 =
+ _mm_add_epi32(_mm_add_epi32(d0, d1), _mm_add_epi32(d2, d3));
+ const __m128i d = _mm_add_epi32(_mm_add_epi32(d4, d5), d_0123);
+ return d;
+}
+
+static INLINE __m128i convolve_lo_x_12tap(const __m128i *s,
+ const __m128i *coeffs,
+ const __m128i zero) {
+ __m128i ss[6];
+ ss[0] = _mm_unpacklo_epi8(s[0], zero); // 0 1 1 2 2 3 3 4
+ ss[1] = _mm_unpacklo_epi8(s[1], zero); // 2 3 3 4 4 5 5 6
+ ss[2] = _mm_unpacklo_epi8(s[2], zero); // 4 5 5 6 6 7 7 8
+ ss[3] = _mm_unpacklo_epi8(s[3], zero); // 6 7 7 8 8 9 9 10
+ ss[4] = _mm_unpackhi_epi8(s[2], zero); // 8 9 9 10 10 11 11 12
+ ss[5] = _mm_unpackhi_epi8(s[3], zero); // 10 11 11 12 12 13 13 14
+ return convolve_12tap(ss, coeffs);
+}
+
+static INLINE __m128i convolve_lo_y_12tap(const __m128i *s,
+ const __m128i *coeffs) {
+ __m128i ss[6];
+ const __m128i zero = _mm_setzero_si128();
+ ss[0] = _mm_unpacklo_epi8(s[0], zero);
+ ss[1] = _mm_unpacklo_epi8(s[2], zero);
+ ss[2] = _mm_unpacklo_epi8(s[4], zero);
+ ss[3] = _mm_unpacklo_epi8(s[6], zero);
+ ss[4] = _mm_unpacklo_epi8(s[8], zero);
+ ss[5] = _mm_unpacklo_epi8(s[10], zero);
+ return convolve_12tap(ss, coeffs);
+}
+
+static INLINE __m128i convolve_hi_y_12tap(const __m128i *s,
+ const __m128i *coeffs) {
+ __m128i ss[6];
+ const __m128i zero = _mm_setzero_si128();
+ ss[0] = _mm_unpackhi_epi8(s[0], zero);
+ ss[1] = _mm_unpackhi_epi8(s[2], zero);
+ ss[2] = _mm_unpackhi_epi8(s[4], zero);
+ ss[3] = _mm_unpackhi_epi8(s[6], zero);
+ ss[4] = _mm_unpackhi_epi8(s[8], zero);
+ ss[5] = _mm_unpackhi_epi8(s[10], zero);
+ return convolve_12tap(ss, coeffs);
+}
+#endif // AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_sse2.h b/third_party/aom/aom_dsp/x86/convolve_sse2.h
new file mode 100644
index 0000000000..36b7d62b98
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve_sse2.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
+
+#include "config/aom_scale_rtcd.h"
+
+// Note:
+// This header file must be included after any x86 intrinsics header file.
+static INLINE void prepare_coeffs(const InterpFilterParams *const filter_params,
+ const int subpel_q4,
+ __m128i *const coeffs /* [4] */) {
+ const int16_t *filter = av1_get_interp_filter_subpel_kernel(
+ filter_params, subpel_q4 & SUBPEL_MASK);
+ const __m128i coeff = _mm_loadu_si128((__m128i *)filter);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ coeffs[0] = _mm_shuffle_epi32(coeff, 0x00);
+ // coeffs 2 3 2 3 2 3 2 3
+ coeffs[1] = _mm_shuffle_epi32(coeff, 0x55);
+ // coeffs 4 5 4 5 4 5 4 5
+ coeffs[2] = _mm_shuffle_epi32(coeff, 0xaa);
+ // coeffs 6 7 6 7 6 7 6 7
+ coeffs[3] = _mm_shuffle_epi32(coeff, 0xff);
+}
+
+static INLINE __m128i convolve(const __m128i *const s,
+ const __m128i *const coeffs) {
+ const __m128i res_0 = _mm_madd_epi16(s[0], coeffs[0]);
+ const __m128i res_1 = _mm_madd_epi16(s[1], coeffs[1]);
+ const __m128i res_2 = _mm_madd_epi16(s[2], coeffs[2]);
+ const __m128i res_3 = _mm_madd_epi16(s[3], coeffs[3]);
+
+ const __m128i res =
+ _mm_add_epi32(_mm_add_epi32(res_0, res_1), _mm_add_epi32(res_2, res_3));
+
+ return res;
+}
+
+static INLINE __m128i convolve_lo_x(const __m128i *const s,
+ const __m128i *const coeffs) {
+ __m128i ss[4];
+ ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
+ ss[1] = _mm_unpacklo_epi8(s[1], _mm_setzero_si128());
+ ss[2] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
+ ss[3] = _mm_unpacklo_epi8(s[3], _mm_setzero_si128());
+ return convolve(ss, coeffs);
+}
+
+static INLINE __m128i convolve_lo_y(const __m128i *const s,
+ const __m128i *const coeffs) {
+ __m128i ss[4];
+ ss[0] = _mm_unpacklo_epi8(s[0], _mm_setzero_si128());
+ ss[1] = _mm_unpacklo_epi8(s[2], _mm_setzero_si128());
+ ss[2] = _mm_unpacklo_epi8(s[4], _mm_setzero_si128());
+ ss[3] = _mm_unpacklo_epi8(s[6], _mm_setzero_si128());
+ return convolve(ss, coeffs);
+}
+
+static INLINE __m128i convolve_hi_y(const __m128i *const s,
+ const __m128i *const coeffs) {
+ __m128i ss[4];
+ ss[0] = _mm_unpackhi_epi8(s[0], _mm_setzero_si128());
+ ss[1] = _mm_unpackhi_epi8(s[2], _mm_setzero_si128());
+ ss[2] = _mm_unpackhi_epi8(s[4], _mm_setzero_si128());
+ ss[3] = _mm_unpackhi_epi8(s[6], _mm_setzero_si128());
+ return convolve(ss, coeffs);
+}
+
+static INLINE __m128i comp_avg(const __m128i *const data_ref_0,
+ const __m128i *const res_unsigned,
+ const __m128i *const wt,
+ const int use_dist_wtd_avg) {
+ __m128i res;
+ if (use_dist_wtd_avg) {
+ const __m128i data_lo = _mm_unpacklo_epi16(*data_ref_0, *res_unsigned);
+ const __m128i data_hi = _mm_unpackhi_epi16(*data_ref_0, *res_unsigned);
+
+ const __m128i wt_res_lo = _mm_madd_epi16(data_lo, *wt);
+ const __m128i wt_res_hi = _mm_madd_epi16(data_hi, *wt);
+
+ const __m128i res_lo = _mm_srai_epi32(wt_res_lo, DIST_PRECISION_BITS);
+ const __m128i res_hi = _mm_srai_epi32(wt_res_hi, DIST_PRECISION_BITS);
+
+ res = _mm_packs_epi32(res_lo, res_hi);
+ } else {
+ const __m128i wt_res = _mm_add_epi16(*data_ref_0, *res_unsigned);
+ res = _mm_srai_epi16(wt_res, 1);
+ }
+ return res;
+}
+
+static INLINE __m128i convolve_rounding(const __m128i *const res_unsigned,
+ const __m128i *const offset_const,
+ const __m128i *const round_const,
+ const int round_shift) {
+ const __m128i res_signed = _mm_sub_epi16(*res_unsigned, *offset_const);
+ const __m128i res_round =
+ _mm_srai_epi16(_mm_add_epi16(res_signed, *round_const), round_shift);
+ return res_round;
+}
+
+static INLINE __m128i highbd_convolve_rounding_sse2(
+ const __m128i *const res_unsigned, const __m128i *const offset_const,
+ const __m128i *const round_const, const int round_shift) {
+ const __m128i res_signed = _mm_sub_epi32(*res_unsigned, *offset_const);
+ const __m128i res_round =
+ _mm_srai_epi32(_mm_add_epi32(res_signed, *round_const), round_shift);
+
+ return res_round;
+}
+
+#endif // AOM_AOM_DSP_X86_CONVOLVE_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_sse4_1.h b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h
new file mode 100644
index 0000000000..b1a3bb4664
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve_sse4_1.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_
+
+// Note:
+// This header file must be included after any x86 intrinsics header file.
+
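+// Stores *res to dst; if do_average is set, the existing dst value and *res
+// are weighted by *wt0 and *wt1 respectively and rescaled by
+// DIST_PRECISION_BITS before the store.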
+static INLINE void mult_add_store(CONV_BUF_TYPE *const dst,
+ const __m128i *const res,
+ const __m128i *const wt0,
+ const __m128i *const wt1,
+ const int do_average) {
+ __m128i d;
+ if (do_average) {
+ d = _mm_load_si128((__m128i *)dst);
+ d = _mm_add_epi32(_mm_mullo_epi32(d, *wt0), _mm_mullo_epi32(*res, *wt1));
+ d = _mm_srai_epi32(d, DIST_PRECISION_BITS);
+ } else {
+ d = *res;
+ }
+ _mm_store_si128((__m128i *)dst, d);
+}
+
+static INLINE __m128i highbd_comp_avg_sse4_1(const __m128i *const data_ref_0,
+ const __m128i *const res_unsigned,
+ const __m128i *const wt0,
+ const __m128i *const wt1,
+ const int use_dist_wtd_avg) {
+ __m128i res;
+ if (use_dist_wtd_avg) {
+ const __m128i wt0_res = _mm_mullo_epi32(*data_ref_0, *wt0);
+ const __m128i wt1_res = _mm_mullo_epi32(*res_unsigned, *wt1);
+
+ const __m128i wt_res = _mm_add_epi32(wt0_res, wt1_res);
+ res = _mm_srai_epi32(wt_res, DIST_PRECISION_BITS);
+ } else {
+ const __m128i wt_res = _mm_add_epi32(*data_ref_0, *res_unsigned);
+ res = _mm_srai_epi32(wt_res, 1);
+ }
+ return res;
+}
+
+#endif // AOM_AOM_DSP_X86_CONVOLVE_SSE4_1_H_
diff --git a/third_party/aom/aom_dsp/x86/convolve_ssse3.h b/third_party/aom/aom_dsp/x86/convolve_ssse3.h
new file mode 100644
index 0000000000..b1abead146
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/convolve_ssse3.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_CONVOLVE_SSSE3_H_
+#define AOM_AOM_DSP_X86_CONVOLVE_SSSE3_H_
+
+#include <tmmintrin.h> // SSSE3
+
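+// Each 16-bit shuffle control below repeats a two-byte pattern, e.g. 0x0200
+// selects bytes 0 and 2, i.e. the low bytes of filter taps 0 and 1, packing
+// each pair of 16-bit taps into adjacent 8-bit lanes.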
+static INLINE void shuffle_filter_ssse3(const int16_t *const filter,
+ __m128i *const f) {
+ const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+ // pack and duplicate the filter values
+ f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
+ f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
+ f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
+ f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
+}
+
+static INLINE __m128i convolve8_8_ssse3(const __m128i *const s,
+ const __m128i *const f) {
+ // multiply 2 adjacent elements with the filter and add the result
+ const __m128i k_64 = _mm_set1_epi16(1 << 6);
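+  // k_64 is the rounding offset for the final >> 7 (FILTER_BITS) shift.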
+ const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
+ const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+ const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+ const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+ __m128i sum1, sum2;
+
+  // sum the results together, saturating only on the final step;
+  // adding x0 with x2 and x1 with x3 is the only order that prevents
+  // overflow for all filters
+ sum1 = _mm_add_epi16(x0, x2);
+ sum2 = _mm_add_epi16(x1, x3);
+ // add the rounding offset early to avoid another saturated add
+ sum1 = _mm_add_epi16(sum1, k_64);
+ sum1 = _mm_adds_epi16(sum1, sum2);
+  // shift each 16-bit value right by 7 bits
+ sum1 = _mm_srai_epi16(sum1, 7);
+ return sum1;
+}
+
+#endif // AOM_AOM_DSP_X86_CONVOLVE_SSSE3_H_
diff --git a/third_party/aom/aom_dsp/x86/fft_avx2.c b/third_party/aom/aom_dsp/x86/fft_avx2.c
new file mode 100644
index 0000000000..3f5a9bbeff
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fft_avx2.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/fft_common.h"
+
+extern void aom_transpose_float_sse2(const float *A, float *B, int n);
+extern void aom_fft_unpack_2d_output_sse2(const float *col_fft, float *output,
+ int n);
+
+// Generate the 1d forward transforms for float using _mm256
+GEN_FFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps)
+GEN_FFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps)
+GEN_FFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps)
+
+void aom_fft8x8_float_avx2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_avx2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+void aom_fft16x16_float_avx2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_avx2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+void aom_fft32x32_float_avx2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_avx2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 8);
+}
+
+// Generate the 1d inverse transforms for float using _mm256
+GEN_IFFT_8(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps)
+GEN_IFFT_16(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps)
+GEN_IFFT_32(static INLINE void, avx2, float, __m256, _mm256_load_ps,
+ _mm256_store_ps, _mm256_set1_ps, _mm256_add_ps, _mm256_sub_ps,
+ _mm256_mul_ps)
+
+void aom_ifft8x8_float_avx2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_avx2,
+ aom_ifft1d_8_avx2, aom_transpose_float_sse2, 8);
+}
+
+void aom_ifft16x16_float_avx2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
+ aom_fft1d_16_avx2, aom_ifft1d_16_avx2,
+ aom_transpose_float_sse2, 8);
+}
+
+void aom_ifft32x32_float_avx2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
+ aom_fft1d_32_avx2, aom_ifft1d_32_avx2,
+ aom_transpose_float_sse2, 8);
+}
diff --git a/third_party/aom/aom_dsp/x86/fft_sse2.c b/third_party/aom/aom_dsp/x86/fft_sse2.c
new file mode 100644
index 0000000000..bdd235bcd3
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fft_sse2.c
@@ -0,0 +1,173 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <xmmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/fft_common.h"
+
+static INLINE void transpose4x4(const float *A, float *B, const int lda,
+ const int ldb) {
+ __m128 row1 = _mm_load_ps(&A[0 * lda]);
+ __m128 row2 = _mm_load_ps(&A[1 * lda]);
+ __m128 row3 = _mm_load_ps(&A[2 * lda]);
+ __m128 row4 = _mm_load_ps(&A[3 * lda]);
+ _MM_TRANSPOSE4_PS(row1, row2, row3, row4);
+ _mm_store_ps(&B[0 * ldb], row1);
+ _mm_store_ps(&B[1 * ldb], row2);
+ _mm_store_ps(&B[2 * ldb], row3);
+ _mm_store_ps(&B[3 * ldb], row4);
+}
+
+// Referenced by fft_avx2.c.
+void aom_transpose_float_sse2(const float *A, float *B, int n);
+
+void aom_transpose_float_sse2(const float *A, float *B, int n) {
+ for (int y = 0; y < n; y += 4) {
+ for (int x = 0; x < n; x += 4) {
+ transpose4x4(A + y * n + x, B + x * n + y, n, n);
+ }
+ }
+}
+
+// Referenced by fft_avx2.c.
+void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n);
+
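+// Expands the packed output of the real-input 2D FFT into interleaved
+// real/imaginary pairs, reconstructing the full n x n complex result from
+// the conjugate symmetry of a real transform.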
+void aom_fft_unpack_2d_output_sse2(const float *packed, float *output, int n) {
+ const int n2 = n / 2;
+ output[0] = packed[0];
+ output[1] = 0;
+ output[2 * (n2 * n)] = packed[n2 * n];
+ output[2 * (n2 * n) + 1] = 0;
+
+ output[2 * n2] = packed[n2];
+ output[2 * n2 + 1] = 0;
+ output[2 * (n2 * n + n2)] = packed[n2 * n + n2];
+ output[2 * (n2 * n + n2) + 1] = 0;
+
+ for (int c = 1; c < n2; ++c) {
+ output[2 * (0 * n + c)] = packed[c];
+ output[2 * (0 * n + c) + 1] = packed[c + n2];
+ output[2 * (n2 * n + c) + 0] = packed[n2 * n + c];
+ output[2 * (n2 * n + c) + 1] = packed[n2 * n + c + n2];
+ }
+ for (int r = 1; r < n2; ++r) {
+ output[2 * (r * n + 0)] = packed[r * n];
+ output[2 * (r * n + 0) + 1] = packed[(r + n2) * n];
+ output[2 * (r * n + n2) + 0] = packed[r * n + n2];
+ output[2 * (r * n + n2) + 1] = packed[(r + n2) * n + n2];
+
+ for (int c = 1; c < AOMMIN(n2, 4); ++c) {
+ output[2 * (r * n + c)] =
+ packed[r * n + c] - packed[(r + n2) * n + c + n2];
+ output[2 * (r * n + c) + 1] =
+ packed[(r + n2) * n + c] + packed[r * n + c + n2];
+ }
+
+ for (int c = 4; c < n2; c += 4) {
+ __m128 real1 = _mm_load_ps(packed + r * n + c);
+ __m128 real2 = _mm_load_ps(packed + (r + n2) * n + c + n2);
+ __m128 imag1 = _mm_load_ps(packed + (r + n2) * n + c);
+ __m128 imag2 = _mm_load_ps(packed + r * n + c + n2);
+ real1 = _mm_sub_ps(real1, real2);
+ imag1 = _mm_add_ps(imag1, imag2);
+ _mm_store_ps(output + 2 * (r * n + c), _mm_unpacklo_ps(real1, imag1));
+ _mm_store_ps(output + 2 * (r * n + c + 2), _mm_unpackhi_ps(real1, imag1));
+ }
+
+ int r2 = r + n2;
+ int r3 = n - r2;
+ output[2 * (r2 * n + 0)] = packed[r3 * n];
+ output[2 * (r2 * n + 0) + 1] = -packed[(r3 + n2) * n];
+ output[2 * (r2 * n + n2)] = packed[r3 * n + n2];
+ output[2 * (r2 * n + n2) + 1] = -packed[(r3 + n2) * n + n2];
+ for (int c = 1; c < AOMMIN(4, n2); ++c) {
+ output[2 * (r2 * n + c)] =
+ packed[r3 * n + c] + packed[(r3 + n2) * n + c + n2];
+ output[2 * (r2 * n + c) + 1] =
+ -packed[(r3 + n2) * n + c] + packed[r3 * n + c + n2];
+ }
+ for (int c = 4; c < n2; c += 4) {
+ __m128 real1 = _mm_load_ps(packed + r3 * n + c);
+ __m128 real2 = _mm_load_ps(packed + (r3 + n2) * n + c + n2);
+ __m128 imag1 = _mm_load_ps(packed + (r3 + n2) * n + c);
+ __m128 imag2 = _mm_load_ps(packed + r3 * n + c + n2);
+ real1 = _mm_add_ps(real1, real2);
+ imag1 = _mm_sub_ps(imag2, imag1);
+ _mm_store_ps(output + 2 * (r2 * n + c), _mm_unpacklo_ps(real1, imag1));
+ _mm_store_ps(output + 2 * (r2 * n + c + 2),
+ _mm_unpackhi_ps(real1, imag1));
+ }
+ }
+}
+
+// Generate definitions for 1d transforms using float and __m128
+GEN_FFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps)
+GEN_FFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
+GEN_FFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
+GEN_FFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
+
+void aom_fft4x4_float_sse2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 4, aom_fft1d_4_sse2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+void aom_fft8x8_float_sse2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 8, aom_fft1d_8_sse2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+void aom_fft16x16_float_sse2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 16, aom_fft1d_16_sse2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+void aom_fft32x32_float_sse2(const float *input, float *temp, float *output) {
+ aom_fft_2d_gen(input, temp, output, 32, aom_fft1d_32_sse2,
+ aom_transpose_float_sse2, aom_fft_unpack_2d_output_sse2, 4);
+}
+
+// Generate definitions for 1d inverse transforms using float and __m128
+GEN_IFFT_4(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps)
+GEN_IFFT_8(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
+GEN_IFFT_16(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
+GEN_IFFT_32(static INLINE void, sse2, float, __m128, _mm_load_ps, _mm_store_ps,
+ _mm_set1_ps, _mm_add_ps, _mm_sub_ps, _mm_mul_ps)
+
+void aom_ifft4x4_float_sse2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 4, aom_fft1d_4_float, aom_fft1d_4_sse2,
+ aom_ifft1d_4_sse2, aom_transpose_float_sse2, 4);
+}
+
+void aom_ifft8x8_float_sse2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 8, aom_fft1d_8_float, aom_fft1d_8_sse2,
+ aom_ifft1d_8_sse2, aom_transpose_float_sse2, 4);
+}
+
+void aom_ifft16x16_float_sse2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 16, aom_fft1d_16_float,
+ aom_fft1d_16_sse2, aom_ifft1d_16_sse2,
+ aom_transpose_float_sse2, 4);
+}
+
+void aom_ifft32x32_float_sse2(const float *input, float *temp, float *output) {
+ aom_ifft_2d_gen(input, temp, output, 32, aom_fft1d_32_float,
+ aom_fft1d_32_sse2, aom_ifft1d_32_sse2,
+ aom_transpose_float_sse2, 4);
+}
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
new file mode 100644
index 0000000000..7ee8ba330e
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_impl_sse2.h
@@ -0,0 +1,529 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/txfm_common.h"
+#include "aom_dsp/x86/fwd_txfm_sse2.h"
+#include "aom_dsp/x86/txfm_common_sse2.h"
+#include "aom_ports/mem.h"
+
+// TODO(jingning) The high bit-depth functions need rework for performance.
+// After we properly fix the high bit-depth function implementations, this
+// file's dependency should be substantially simplified.
+#if DCT_HIGH_BIT_DEPTH
+#define ADD_EPI16 _mm_adds_epi16
+#define SUB_EPI16 _mm_subs_epi16
+
+#else
+#define ADD_EPI16 _mm_add_epi16
+#define SUB_EPI16 _mm_sub_epi16
+#endif
+
+static void FDCT4x4_2D_HELPER(const int16_t *input, int stride, __m128i *in0,
+ __m128i *in1) {
+ // Constants
+ // These are the coefficients used for the multiplies.
+ // In the comments, pN means cos(N pi /64) and mN is -cos(N pi /64),
+ // where cospi_N_64 = cos(N pi /64)
+ const __m128i k__cospi_A =
+ octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
+ cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_B =
+ octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
+ cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_C =
+ octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
+ cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64);
+ const __m128i k__cospi_D =
+ octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
+ cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_E =
+ octa_set_epi16(cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64,
+ cospi_16_64, cospi_16_64, cospi_16_64, cospi_16_64);
+ const __m128i k__cospi_F =
+ octa_set_epi16(cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64,
+ cospi_16_64, -cospi_16_64, cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_G =
+ octa_set_epi16(cospi_8_64, cospi_24_64, cospi_8_64, cospi_24_64,
+ -cospi_8_64, -cospi_24_64, -cospi_8_64, -cospi_24_64);
+ const __m128i k__cospi_H =
+ octa_set_epi16(cospi_24_64, -cospi_8_64, cospi_24_64, -cospi_8_64,
+ -cospi_24_64, cospi_8_64, -cospi_24_64, cospi_8_64);
+
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ // This second rounding constant saves doing some extra adds at the end
+ const __m128i k__DCT_CONST_ROUNDING2 =
+ _mm_set1_epi32(DCT_CONST_ROUNDING + (DCT_CONST_ROUNDING << 1));
+ const int DCT_CONST_BITS2 = DCT_CONST_BITS + 2;
+ const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
+ const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
+
+ // Load inputs.
+ *in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+ *in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+ *in1 = _mm_unpacklo_epi64(
+ *in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
+ *in0 = _mm_unpacklo_epi64(
+ *in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));
+ // in0 = [i0 i1 i2 i3 iC iD iE iF]
+ // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
+ // multiply by 16 to give some extra precision
+ *in0 = _mm_slli_epi16(*in0, 4);
+ *in1 = _mm_slli_epi16(*in1, 4);
+ // if (i == 0 && input[0]) input[0] += 1;
+ // add 1 to the upper left pixel if it is non-zero, which helps reduce
+ // the round-trip error
+ {
+ // The mask will only contain whether the first value is zero, all
+ // other comparison will fail as something shifted by 4 (above << 4)
+ // can never be equal to one. To increment in the non-zero case, we
+ // add the mask and one for the first element:
+ // - if zero, mask = -1, v = v - 1 + 1 = v
+ // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
+ __m128i mask = _mm_cmpeq_epi16(*in0, k__nonzero_bias_a);
+ *in0 = _mm_add_epi16(*in0, mask);
+ *in0 = _mm_add_epi16(*in0, k__nonzero_bias_b);
+ }
+  // There are 4 total stages, alternating between an add/subtract stage
+  // and a multiply-and-add stage.
+ {
+ // Stage 1: Add/subtract
+
+ // in0 = [i0 i1 i2 i3 iC iD iE iF]
+ // in1 = [i4 i5 i6 i7 i8 i9 iA iB]
+ const __m128i r0 = _mm_unpacklo_epi16(*in0, *in1);
+ const __m128i r1 = _mm_unpackhi_epi16(*in0, *in1);
+ // r0 = [i0 i4 i1 i5 i2 i6 i3 i7]
+ // r1 = [iC i8 iD i9 iE iA iF iB]
+ const __m128i r2 = _mm_shuffle_epi32(r0, 0xB4);
+ const __m128i r3 = _mm_shuffle_epi32(r1, 0xB4);
+ // r2 = [i0 i4 i1 i5 i3 i7 i2 i6]
+ // r3 = [iC i8 iD i9 iF iB iE iA]
+
+ const __m128i t0 = _mm_add_epi16(r2, r3);
+ const __m128i t1 = _mm_sub_epi16(r2, r3);
+ // t0 = [a0 a4 a1 a5 a3 a7 a2 a6]
+ // t1 = [aC a8 aD a9 aF aB aE aA]
+
+ // Stage 2: multiply by constants (which gets us into 32 bits).
+ // The constants needed here are:
+ // k__cospi_A = [p16 p16 p16 p16 p16 m16 p16 m16]
+ // k__cospi_B = [p16 m16 p16 m16 p16 p16 p16 p16]
+ // k__cospi_C = [p08 p24 p08 p24 p24 m08 p24 m08]
+ // k__cospi_D = [p24 m08 p24 m08 p08 p24 p08 p24]
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_A);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_B);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_C);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_D);
+ // Then add and right-shift to get back to 16-bit range
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ // w0 = [b0 b1 b7 b6]
+ // w1 = [b8 b9 bF bE]
+ // w2 = [b4 b5 b3 b2]
+ // w3 = [bC bD bB bA]
+ const __m128i x0 = _mm_packs_epi32(w0, w1);
+ const __m128i x1 = _mm_packs_epi32(w2, w3);
+
+ // x0 = [b0 b1 b7 b6 b8 b9 bF bE]
+ // x1 = [b4 b5 b3 b2 bC bD bB bA]
+ *in0 = _mm_shuffle_epi32(x0, 0xD8);
+ *in1 = _mm_shuffle_epi32(x1, 0x8D);
+ // in0 = [b0 b1 b8 b9 b7 b6 bF bE]
+ // in1 = [b3 b2 bB bA b4 b5 bC bD]
+ }
+ {
+ // vertical DCTs finished. Now we do the horizontal DCTs.
+ // Stage 3: Add/subtract
+
+ const __m128i t0 = ADD_EPI16(*in0, *in1);
+ const __m128i t1 = SUB_EPI16(*in0, *in1);
+
+ // Stage 4: multiply by constants (which gets us into 32 bits).
+ {
+ // The constants needed here are:
+ // k__cospi_E = [p16 p16 p16 p16 p16 p16 p16 p16]
+ // k__cospi_F = [p16 m16 p16 m16 p16 m16 p16 m16]
+ // k__cospi_G = [p08 p24 p08 p24 m08 m24 m08 m24]
+ // k__cospi_H = [p24 m08 p24 m08 m24 p08 m24 p08]
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_E);
+ const __m128i u1 = _mm_madd_epi16(t0, k__cospi_F);
+ const __m128i u2 = _mm_madd_epi16(t1, k__cospi_G);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_H);
+ // Then add and right-shift to get back to 16-bit range
+ // but this combines the final right-shift as well to save operations
+      // This unusual rounding operation is needed to maintain bit-accurate
+      // compatibility with the C version of this function, which has two
+      // rounding steps in a row.
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING2);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING2);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING2);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING2);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS2);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS2);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS2);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS2);
+ *in0 = _mm_packs_epi32(w0, w2);
+ *in1 = _mm_packs_epi32(w1, w3);
+ }
+ }
+}
+
+void FDCT4x4_2D(const int16_t *input, tran_low_t *output, int stride) {
+ // This 2D transform implements 4 vertical 1D transforms followed
+ // by 4 horizontal 1D transforms. The multiplies and adds are as given
+ // by Chen, Smith and Fralick ('77). The commands for moving the data
+ // around have been minimized by hand.
+  // For the purposes of the comments, the 16 inputs are referred to as i0
+  // through iF (in raster order); the intermediate variables (a0 ..., b0 ...,
+  // etc.) correspond to the in-place computations mapped to the input
+  // locations. The outputs, o0 through oF, are labeled according to the
+  // output locations.
+ __m128i in0, in1;
+ FDCT4x4_2D_HELPER(input, stride, &in0, &in1);
+
+  // The post-condition (v + 1) >> 2 is now incorporated into the previous
+  // add and right-shift commands. Only 2 store instructions are needed
+  // because rows 1 and 3 are stored just after rows 0 and 2.
+ storeu_output(&in0, output + 0 * 4);
+ storeu_output(&in1, output + 2 * 4);
+}
+
+void FDCT4x4_2D_LP(const int16_t *input, int16_t *output, int stride) {
+ __m128i in0, in1;
+ FDCT4x4_2D_HELPER(input, stride, &in0, &in1);
+ _mm_storeu_si128((__m128i *)(output + 0 * 4), in0);
+ _mm_storeu_si128((__m128i *)(output + 2 * 4), in1);
+}
+
+#if CONFIG_INTERNAL_STATS
+void FDCT8x8_2D(const int16_t *input, tran_low_t *output, int stride) {
+ int pass;
+ // Constants
+  // In one case all eight values are the same; in all others it is a pair of
+  // values that is repeated four times. This is done by constructing the
+  // 32-bit constant corresponding to that pair.
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
+ const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+#if DCT_HIGH_BIT_DEPTH
+ int overflow;
+#endif
+ // Load input
+ __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
+ __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
+ __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
+ __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
+ __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
+ __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
+ __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
+ __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
+ // Pre-condition input (shift by two)
+ in0 = _mm_slli_epi16(in0, 2);
+ in1 = _mm_slli_epi16(in1, 2);
+ in2 = _mm_slli_epi16(in2, 2);
+ in3 = _mm_slli_epi16(in3, 2);
+ in4 = _mm_slli_epi16(in4, 2);
+ in5 = _mm_slli_epi16(in5, 2);
+ in6 = _mm_slli_epi16(in6, 2);
+ in7 = _mm_slli_epi16(in7, 2);
+
+ // We do two passes, first the columns, then the rows. The results of the
+ // first pass are transposed so that the same column code can be reused. The
+ // results of the second pass are also transposed so that the rows (processed
+ // as columns) are put back in row positions.
+ for (pass = 0; pass < 2; pass++) {
+ // To store results of each pass before the transpose.
+ __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+ // Add/subtract
+ const __m128i q0 = ADD_EPI16(in0, in7);
+ const __m128i q1 = ADD_EPI16(in1, in6);
+ const __m128i q2 = ADD_EPI16(in2, in5);
+ const __m128i q3 = ADD_EPI16(in3, in4);
+ const __m128i q4 = SUB_EPI16(in3, in4);
+ const __m128i q5 = SUB_EPI16(in2, in5);
+ const __m128i q6 = SUB_EPI16(in1, in6);
+ const __m128i q7 = SUB_EPI16(in0, in7);
+#if DCT_HIGH_BIT_DEPTH
+ if (pass == 1) {
+ overflow =
+ check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
+ if (overflow) {
+ aom_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ // Work on first four results
+ {
+ // Add/subtract
+ const __m128i r0 = ADD_EPI16(q0, q3);
+ const __m128i r1 = ADD_EPI16(q1, q2);
+ const __m128i r2 = SUB_EPI16(q1, q2);
+ const __m128i r3 = SUB_EPI16(q0, q3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
+ if (overflow) {
+ aom_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+      // Interleave to do the multiply by constants which gets us into 32 bits
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
+ const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
+ const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
+ const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res0 = _mm_packs_epi32(w0, w1);
+ res4 = _mm_packs_epi32(w2, w3);
+ res2 = _mm_packs_epi32(w4, w5);
+ res6 = _mm_packs_epi32(w6, w7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res0, &res4, &res2, &res6);
+ if (overflow) {
+ aom_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ // Work on next four results
+ {
+      // Interleave to do the multiply by constants which gets us into 32 bits
+ const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
+ const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
+ const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
+ const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
+ const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
+ const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
+ // dct_const_round_shift
+ const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
+ const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
+ const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
+ const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
+ const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
+ const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
+ const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
+ const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
+ // Combine
+ const __m128i r0 = _mm_packs_epi32(s0, s1);
+ const __m128i r1 = _mm_packs_epi32(s2, s3);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x2(&r0, &r1);
+ if (overflow) {
+ aom_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ {
+ // Add/subtract
+ const __m128i x0 = ADD_EPI16(q4, r0);
+ const __m128i x1 = SUB_EPI16(q4, r0);
+ const __m128i x2 = SUB_EPI16(q7, r1);
+ const __m128i x3 = ADD_EPI16(q7, r1);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
+ if (overflow) {
+ aom_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+        // Interleave to do the multiply by constants which gets us into 32 bits
+ {
+ const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
+ const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
+ const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
+ const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
+ const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
+ const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
+ const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
+ const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
+ const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
+ const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
+ const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
+ const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
+ // dct_const_round_shift
+ const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+ const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+ const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+ const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+ const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+ const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+ const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+ const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+ const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+ const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+ const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+ const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+ const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+ const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+ const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+ const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+ // Combine
+ res1 = _mm_packs_epi32(w0, w1);
+ res7 = _mm_packs_epi32(w2, w3);
+ res5 = _mm_packs_epi32(w4, w5);
+ res3 = _mm_packs_epi32(w6, w7);
+#if DCT_HIGH_BIT_DEPTH
+ overflow = check_epi16_overflow_x4(&res1, &res7, &res5, &res3);
+ if (overflow) {
+ aom_highbd_fdct8x8_c(input, output, stride);
+ return;
+ }
+#endif // DCT_HIGH_BIT_DEPTH
+ }
+ }
+ }
+ // Transpose the 8x8.
+ {
+ // 00 01 02 03 04 05 06 07
+ // 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27
+ // 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47
+ // 50 51 52 53 54 55 56 57
+ // 60 61 62 63 64 65 66 67
+ // 70 71 72 73 74 75 76 77
+ const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
+ // 00 10 01 11 02 12 03 13
+ // 20 30 21 31 22 32 23 33
+ // 04 14 05 15 06 16 07 17
+ // 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53
+ // 60 70 61 71 62 72 63 73
+    // 44 54 45 55 46 56 47 57
+ // 64 74 65 75 66 76 67 77
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
+ // 00 10 20 30 01 11 21 31
+ // 40 50 60 70 41 51 61 71
+ // 02 12 22 32 03 13 23 33
+ // 42 52 62 72 43 53 63 73
+    // 04 14 24 34 05 15 25 35
+    // 44 54 64 74 45 55 65 75
+ // 06 16 26 36 07 17 27 37
+ // 46 56 66 76 47 57 67 77
+ in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
+ in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
+ in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
+ in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
+ in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
+ in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
+ in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
+ in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ }
+ }
+ // Post-condition output and store it
+ {
+ // Post-condition (division by two)
+    // division of 16-bit signed numbers by two using shifts
+ // n / 2 = (n - (n >> 15)) >> 1
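+    // e.g. n = -5: (-5 >> 15) = -1 and (-5 - (-1)) >> 1 = -2, matching C's
+    // truncation toward zero; for n >= 0 the correction term is 0.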
+ const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
+ const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
+ const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
+ const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
+ const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
+ const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
+ const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
+ const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
+ in0 = _mm_sub_epi16(in0, sign_in0);
+ in1 = _mm_sub_epi16(in1, sign_in1);
+ in2 = _mm_sub_epi16(in2, sign_in2);
+ in3 = _mm_sub_epi16(in3, sign_in3);
+ in4 = _mm_sub_epi16(in4, sign_in4);
+ in5 = _mm_sub_epi16(in5, sign_in5);
+ in6 = _mm_sub_epi16(in6, sign_in6);
+ in7 = _mm_sub_epi16(in7, sign_in7);
+ in0 = _mm_srai_epi16(in0, 1);
+ in1 = _mm_srai_epi16(in1, 1);
+ in2 = _mm_srai_epi16(in2, 1);
+ in3 = _mm_srai_epi16(in3, 1);
+ in4 = _mm_srai_epi16(in4, 1);
+ in5 = _mm_srai_epi16(in5, 1);
+ in6 = _mm_srai_epi16(in6, 1);
+ in7 = _mm_srai_epi16(in7, 1);
+ // store results
+ store_output(&in0, (output + 0 * 8));
+ store_output(&in1, (output + 1 * 8));
+ store_output(&in2, (output + 2 * 8));
+ store_output(&in3, (output + 3 * 8));
+ store_output(&in4, (output + 4 * 8));
+ store_output(&in5, (output + 5 * 8));
+ store_output(&in6, (output + 6 * 8));
+ store_output(&in7, (output + 7 * 8));
+ }
+}
+#endif // CONFIG_INTERNAL_STATS
+
+#undef ADD_EPI16
+#undef SUB_EPI16
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c
new file mode 100644
index 0000000000..0e4fb80468
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/fwd_txfm_sse2.h"
+
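+// fwd_txfm_impl_sse2.h acts as a template: it is included once with
+// DCT_HIGH_BIT_DEPTH 0 (plain adds) and once with DCT_HIGH_BIT_DEPTH 1
+// (saturating adds plus overflow checks), with the FDCT* macros selecting
+// the generated function names.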
+#define DCT_HIGH_BIT_DEPTH 0
+#define FDCT4x4_2D_HELPER fdct4x4_helper
+#define FDCT4x4_2D aom_fdct4x4_sse2
+#define FDCT4x4_2D_LP aom_fdct4x4_lp_sse2
+#define FDCT8x8_2D aom_fdct8x8_sse2
+#include "aom_dsp/x86/fwd_txfm_impl_sse2.h"
+#undef FDCT4x4_2D_HELPER
+#undef FDCT4x4_2D
+#undef FDCT4x4_2D_LP
+#undef FDCT8x8_2D
+
+#if CONFIG_AV1_HIGHBITDEPTH
+
+#undef DCT_HIGH_BIT_DEPTH
+#define DCT_HIGH_BIT_DEPTH 1
+#define FDCT8x8_2D aom_highbd_fdct8x8_sse2
+#include "aom_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT
+#undef FDCT8x8_2D
+
+#endif
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
new file mode 100644
index 0000000000..78ea98522e
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_sse2.h
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
+#define AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
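+// Multiplies the four unsigned 32-bit lanes of a and b and adds adjacent
+// products, producing two 64-bit sums (a wider analogue of _mm_madd_epi16).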
+static INLINE __m128i k_madd_epi32(__m128i a, __m128i b) {
+ __m128i buf0, buf1;
+ buf0 = _mm_mul_epu32(a, b);
+ a = _mm_srli_epi64(a, 32);
+ b = _mm_srli_epi64(b, 32);
+ buf1 = _mm_mul_epu32(a, b);
+ return _mm_add_epi64(buf0, buf1);
+}
+
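+// Packs the low 32 bits of each 64-bit lane of a and b into a single
+// vector: { a0, a2, b0, b2 }.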
+static INLINE __m128i k_packs_epi64(__m128i a, __m128i b) {
+ __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0));
+ __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0));
+ return _mm_unpacklo_epi64(buf0, buf1);
+}
+
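+// The overflow checks below return non-zero if any 16-bit lane equals
+// INT16_MAX or INT16_MIN, i.e. a preceding saturating add/subtract may have
+// clipped, so the caller can fall back to the C implementation.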
+static INLINE int check_epi16_overflow_x2(const __m128i *preg0,
+ const __m128i *preg1) {
+ const __m128i max_overflow = _mm_set1_epi16(0x7fff);
+ const __m128i min_overflow = _mm_set1_epi16((short)0x8000);
+ __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
+ _mm_cmpeq_epi16(*preg0, min_overflow));
+ __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
+ _mm_cmpeq_epi16(*preg1, min_overflow));
+ cmp0 = _mm_or_si128(cmp0, cmp1);
+ return _mm_movemask_epi8(cmp0);
+}
+
+static INLINE int check_epi16_overflow_x4(const __m128i *preg0,
+ const __m128i *preg1,
+ const __m128i *preg2,
+ const __m128i *preg3) {
+ const __m128i max_overflow = _mm_set1_epi16(0x7fff);
+ const __m128i min_overflow = _mm_set1_epi16((short)0x8000);
+ __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow),
+ _mm_cmpeq_epi16(*preg0, min_overflow));
+ __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow),
+ _mm_cmpeq_epi16(*preg1, min_overflow));
+ __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow),
+ _mm_cmpeq_epi16(*preg2, min_overflow));
+ __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow),
+ _mm_cmpeq_epi16(*preg3, min_overflow));
+ cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3));
+ return _mm_movemask_epi8(cmp0);
+}
+
+static INLINE int check_epi16_overflow_x8(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ return res0 + res1;
+}
+
+static INLINE int check_epi16_overflow_x12(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
+ const __m128i *preg9, const __m128i *preg10, const __m128i *preg11) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ if (!res0) res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+ return res0 + res1;
+}
+
+static INLINE int check_epi16_overflow_x16(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
+ const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
+ const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
+ const __m128i *preg15) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+ if (!res1) res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
+ }
+ return res0 + res1;
+}
+
+static INLINE int check_epi16_overflow_x32(
+ const __m128i *preg0, const __m128i *preg1, const __m128i *preg2,
+ const __m128i *preg3, const __m128i *preg4, const __m128i *preg5,
+ const __m128i *preg6, const __m128i *preg7, const __m128i *preg8,
+ const __m128i *preg9, const __m128i *preg10, const __m128i *preg11,
+ const __m128i *preg12, const __m128i *preg13, const __m128i *preg14,
+ const __m128i *preg15, const __m128i *preg16, const __m128i *preg17,
+ const __m128i *preg18, const __m128i *preg19, const __m128i *preg20,
+ const __m128i *preg21, const __m128i *preg22, const __m128i *preg23,
+ const __m128i *preg24, const __m128i *preg25, const __m128i *preg26,
+ const __m128i *preg27, const __m128i *preg28, const __m128i *preg29,
+ const __m128i *preg30, const __m128i *preg31) {
+ int res0, res1;
+ res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3);
+ res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11);
+ if (!res1) {
+ res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19);
+ if (!res1) {
+ res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23);
+ if (!res0) {
+ res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27);
+ if (!res1)
+ res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31);
+ }
+ }
+ }
+ }
+ }
+ return res0 + res1;
+}
+
+static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+ __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+ __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+ _mm_store_si128((__m128i *)(dst_ptr), out0);
+ _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
+}
+
+static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+ __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+ __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+ _mm_storeu_si128((__m128i *)(dst_ptr), out0);
+ _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
new file mode 100644
index 0000000000..06879040b0
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm
@@ -0,0 +1,379 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+
+pw_11585x2: times 8 dw 23170
+pd_8192: times 4 dd 8192
+
+%macro TRANSFORM_COEFFS 2
+pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2
+pw_%2_m%1: dw %2, -%1, %2, -%1, %2, -%1, %2, -%1
+%endmacro
+
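+; Note (added for clarity): TRANSFORM_COEFFS A, B emits the interleaved
+; constant rows pw_A_B = {A, B, ...} and pw_B_mA = {B, -A, ...}, used below as
+; pmaddwd operands for the butterfly rotations.
+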
+TRANSFORM_COEFFS 11585, 11585
+TRANSFORM_COEFFS 15137, 6270
+TRANSFORM_COEFFS 16069, 3196
+TRANSFORM_COEFFS 9102, 13623
+
+%macro STORE_OUTPUT 2 ; index, result
+ ; const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+ ; __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+ ; __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+ ; _mm_store_si128((__m128i *)(dst_ptr), out0);
+ ; _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
+ pxor m11, m11
+ pcmpgtw m11, m%2
+ movdqa m12, m%2
+ punpcklwd m%2, m11
+ punpckhwd m12, m11
+ mova [outputq + 4*%1 + 0], m%2
+ mova [outputq + 4*%1 + 16], m12
+%endmacro
+
+SECTION .text
+
+%if AOM_ARCH_X86_64
+INIT_XMM ssse3
+cglobal fdct8x8, 3, 5, 13, input, output, stride
+
+ mova m8, [GLOBAL(pd_8192)]
+ mova m12, [GLOBAL(pw_11585x2)]
+
+ lea r3, [2 * strideq]
+ lea r4, [4 * strideq]
+ mova m0, [inputq]
+ mova m1, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m2, [inputq]
+ mova m3, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m4, [inputq]
+ mova m5, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m6, [inputq]
+ mova m7, [inputq + r3]
+
+ ; left shift by 2 to increase forward transformation precision
+ psllw m0, 2
+ psllw m1, 2
+ psllw m2, 2
+ psllw m3, 2
+ psllw m4, 2
+ psllw m5, 2
+ psllw m6, 2
+ psllw m7, 2
+
+ ; column transform
+ ; stage 1
+ paddw m10, m0, m7
+ psubw m0, m7
+
+ paddw m9, m1, m6
+ psubw m1, m6
+
+ paddw m7, m2, m5
+ psubw m2, m5
+
+ paddw m6, m3, m4
+ psubw m3, m4
+
+ ; stage 2
+ paddw m5, m9, m7
+ psubw m9, m7
+
+ paddw m4, m10, m6
+ psubw m10, m6
+
+ paddw m7, m1, m2
+ psubw m1, m2
+
+ ; stage 3
+ paddw m6, m4, m5
+ psubw m4, m5
+
+ pmulhrsw m1, m12
+ pmulhrsw m7, m12
+
+ ; sin(pi / 8), cos(pi / 8)
+ punpcklwd m2, m10, m9
+ punpckhwd m10, m9
+ pmaddwd m5, m2, [GLOBAL(pw_15137_6270)]
+ pmaddwd m2, [GLOBAL(pw_6270_m15137)]
+ pmaddwd m9, m10, [GLOBAL(pw_15137_6270)]
+ pmaddwd m10, [GLOBAL(pw_6270_m15137)]
+ paddd m5, m8
+ paddd m2, m8
+ paddd m9, m8
+ paddd m10, m8
+ psrad m5, 14
+ psrad m2, 14
+ psrad m9, 14
+ psrad m10, 14
+ packssdw m5, m9
+ packssdw m2, m10
+
+ pmulhrsw m6, m12
+ pmulhrsw m4, m12
+
+ paddw m9, m3, m1
+ psubw m3, m1
+
+ paddw m10, m0, m7
+ psubw m0, m7
+
+ ; stage 4
+ ; sin(pi / 16), cos(pi / 16)
+ punpcklwd m1, m10, m9
+ punpckhwd m10, m9
+ pmaddwd m7, m1, [GLOBAL(pw_16069_3196)]
+ pmaddwd m1, [GLOBAL(pw_3196_m16069)]
+ pmaddwd m9, m10, [GLOBAL(pw_16069_3196)]
+ pmaddwd m10, [GLOBAL(pw_3196_m16069)]
+ paddd m7, m8
+ paddd m1, m8
+ paddd m9, m8
+ paddd m10, m8
+ psrad m7, 14
+ psrad m1, 14
+ psrad m9, 14
+ psrad m10, 14
+ packssdw m7, m9
+ packssdw m1, m10
+
+ ; sin(3 * pi / 16), cos(3 * pi / 16)
+ punpcklwd m11, m0, m3
+ punpckhwd m0, m3
+ pmaddwd m9, m11, [GLOBAL(pw_9102_13623)]
+ pmaddwd m11, [GLOBAL(pw_13623_m9102)]
+ pmaddwd m3, m0, [GLOBAL(pw_9102_13623)]
+ pmaddwd m0, [GLOBAL(pw_13623_m9102)]
+ paddd m9, m8
+ paddd m11, m8
+ paddd m3, m8
+ paddd m0, m8
+ psrad m9, 14
+ psrad m11, 14
+ psrad m3, 14
+ psrad m0, 14
+ packssdw m9, m3
+ packssdw m11, m0
+
+ ; transpose
+ ; stage 1
+ punpcklwd m0, m6, m7
+ punpcklwd m3, m5, m11
+ punpckhwd m6, m7
+ punpckhwd m5, m11
+ punpcklwd m7, m4, m9
+ punpcklwd m10, m2, m1
+ punpckhwd m4, m9
+ punpckhwd m2, m1
+
+ ; stage 2
+ punpckldq m9, m0, m3
+ punpckldq m1, m6, m5
+ punpckhdq m0, m3
+ punpckhdq m6, m5
+ punpckldq m3, m7, m10
+ punpckldq m5, m4, m2
+ punpckhdq m7, m10
+ punpckhdq m4, m2
+
+ ; stage 3
+ punpcklqdq m10, m9, m3
+ punpckhqdq m9, m3
+ punpcklqdq m2, m0, m7
+ punpckhqdq m0, m7
+ punpcklqdq m3, m1, m5
+ punpckhqdq m1, m5
+ punpcklqdq m7, m6, m4
+ punpckhqdq m6, m4
+
+ ; row transform
+ ; stage 1
+ paddw m5, m10, m6
+ psubw m10, m6
+
+ paddw m4, m9, m7
+ psubw m9, m7
+
+ paddw m6, m2, m1
+ psubw m2, m1
+
+ paddw m7, m0, m3
+ psubw m0, m3
+
+  ; stage 2
+ paddw m1, m5, m7
+ psubw m5, m7
+
+ paddw m3, m4, m6
+ psubw m4, m6
+
+ paddw m7, m9, m2
+ psubw m9, m2
+
+ ; stage 3
+ punpcklwd m6, m1, m3
+ punpckhwd m1, m3
+ pmaddwd m2, m6, [GLOBAL(pw_11585_11585)]
+ pmaddwd m6, [GLOBAL(pw_11585_m11585)]
+ pmaddwd m3, m1, [GLOBAL(pw_11585_11585)]
+ pmaddwd m1, [GLOBAL(pw_11585_m11585)]
+ paddd m2, m8
+ paddd m6, m8
+ paddd m3, m8
+ paddd m1, m8
+ psrad m2, 14
+ psrad m6, 14
+ psrad m3, 14
+ psrad m1, 14
+ packssdw m2, m3
+ packssdw m6, m1
+
+ pmulhrsw m7, m12
+ pmulhrsw m9, m12
+
+ punpcklwd m3, m5, m4
+ punpckhwd m5, m4
+ pmaddwd m1, m3, [GLOBAL(pw_15137_6270)]
+ pmaddwd m3, [GLOBAL(pw_6270_m15137)]
+ pmaddwd m4, m5, [GLOBAL(pw_15137_6270)]
+ pmaddwd m5, [GLOBAL(pw_6270_m15137)]
+ paddd m1, m8
+ paddd m3, m8
+ paddd m4, m8
+ paddd m5, m8
+ psrad m1, 14
+ psrad m3, 14
+ psrad m4, 14
+ psrad m5, 14
+ packssdw m1, m4
+ packssdw m3, m5
+
+ paddw m4, m0, m9
+ psubw m0, m9
+
+ paddw m5, m10, m7
+ psubw m10, m7
+
+ ; stage 4
+ punpcklwd m9, m5, m4
+ punpckhwd m5, m4
+ pmaddwd m7, m9, [GLOBAL(pw_16069_3196)]
+ pmaddwd m9, [GLOBAL(pw_3196_m16069)]
+ pmaddwd m4, m5, [GLOBAL(pw_16069_3196)]
+ pmaddwd m5, [GLOBAL(pw_3196_m16069)]
+ paddd m7, m8
+ paddd m9, m8
+ paddd m4, m8
+ paddd m5, m8
+ psrad m7, 14
+ psrad m9, 14
+ psrad m4, 14
+ psrad m5, 14
+ packssdw m7, m4
+ packssdw m9, m5
+
+ punpcklwd m4, m10, m0
+ punpckhwd m10, m0
+ pmaddwd m5, m4, [GLOBAL(pw_9102_13623)]
+ pmaddwd m4, [GLOBAL(pw_13623_m9102)]
+ pmaddwd m0, m10, [GLOBAL(pw_9102_13623)]
+ pmaddwd m10, [GLOBAL(pw_13623_m9102)]
+ paddd m5, m8
+ paddd m4, m8
+ paddd m0, m8
+ paddd m10, m8
+ psrad m5, 14
+ psrad m4, 14
+ psrad m0, 14
+ psrad m10, 14
+ packssdw m5, m0
+ packssdw m4, m10
+
+ ; transpose
+ ; stage 1
+ punpcklwd m0, m2, m7
+ punpcklwd m10, m1, m4
+ punpckhwd m2, m7
+ punpckhwd m1, m4
+ punpcklwd m7, m6, m5
+ punpcklwd m4, m3, m9
+ punpckhwd m6, m5
+ punpckhwd m3, m9
+
+ ; stage 2
+ punpckldq m5, m0, m10
+ punpckldq m9, m2, m1
+ punpckhdq m0, m10
+ punpckhdq m2, m1
+ punpckldq m10, m7, m4
+ punpckldq m1, m6, m3
+ punpckhdq m7, m4
+ punpckhdq m6, m3
+
+ ; stage 3
+ punpcklqdq m4, m5, m10
+ punpckhqdq m5, m10
+ punpcklqdq m3, m0, m7
+ punpckhqdq m0, m7
+ punpcklqdq m10, m9, m1
+ punpckhqdq m9, m1
+ punpcklqdq m7, m2, m6
+ punpckhqdq m2, m6
+
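+  ; divide the transposed outputs by 2, rounding toward zero: add 1 to
+  ; negative values (via the sign mask) before the arithmetic right shift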
+ psraw m1, m4, 15
+ psraw m6, m5, 15
+ psraw m8, m3, 15
+ psraw m11, m0, 15
+
+ psubw m4, m1
+ psubw m5, m6
+ psubw m3, m8
+ psubw m0, m11
+
+ psraw m4, 1
+ psraw m5, 1
+ psraw m3, 1
+ psraw m0, 1
+
+ psraw m1, m10, 15
+ psraw m6, m9, 15
+ psraw m8, m7, 15
+ psraw m11, m2, 15
+
+ psubw m10, m1
+ psubw m9, m6
+ psubw m7, m8
+ psubw m2, m11
+
+ psraw m10, 1
+ psraw m9, 1
+ psraw m7, 1
+ psraw m2, 1
+
+ STORE_OUTPUT 0, 4
+ STORE_OUTPUT 8, 5
+ STORE_OUTPUT 16, 3
+ STORE_OUTPUT 24, 0
+ STORE_OUTPUT 32, 10
+ STORE_OUTPUT 40, 9
+ STORE_OUTPUT 48, 7
+ STORE_OUTPUT 56, 2
+
+ RET
+%endif
diff --git a/third_party/aom/aom_dsp/x86/highbd_adaptive_quantize_avx2.c b/third_party/aom/aom_dsp/x86/highbd_adaptive_quantize_avx2.c
new file mode 100644
index 0000000000..05c87bcff9
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_adaptive_quantize_avx2.c
@@ -0,0 +1,456 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/quantize.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+static INLINE void highbd_load_b_values_avx2(
+ const int16_t *zbin_ptr, __m256i *zbin, const int16_t *round_ptr,
+ __m256i *round, const int16_t *quant_ptr, __m256i *quant,
+ const int16_t *dequant_ptr, __m256i *dequant, const int16_t *shift_ptr,
+ __m256i *shift) {
+ *zbin = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)zbin_ptr));
+ *zbin = _mm256_sub_epi32(*zbin, _mm256_set1_epi32(1));
+ *round = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)round_ptr));
+ *quant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_ptr));
+ *dequant =
+ _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)dequant_ptr));
+ *shift = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)shift_ptr));
+}
+
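+// Descriptive note (added for clarity): records, per lane, the largest iscan
+// index among coefficients flagged in *cmp_mask; the running maximum is kept
+// in *mask and reduced to a count later.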
+static INLINE void highbd_update_mask1_avx2(__m256i *cmp_mask,
+ const int16_t *iscan_ptr,
+ int *is_found, __m256i *mask) {
+ __m256i temp_mask = _mm256_setzero_si256();
+ if (_mm256_movemask_epi8(*cmp_mask)) {
+ __m256i iscan = _mm256_loadu_si256((const __m256i *)(iscan_ptr));
+ temp_mask = _mm256_and_si256(*cmp_mask, iscan);
+ *is_found = 1;
+ }
+ *mask = _mm256_max_epi16(temp_mask, *mask);
+}
+
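+// Descriptive note (added for clarity): pre-scan check — shifts |qcoeff| left
+// by AOM_QM_BITS, compares against the prescan thresholds, and records the
+// iscan positions of coefficients that exceed them.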
+static INLINE void highbd_update_mask0_avx2(__m256i *qcoeff0, __m256i *qcoeff1,
+ __m256i *threshold,
+ const int16_t *iscan_ptr,
+ int *is_found, __m256i *mask) {
+ __m256i coeff[2], cmp_mask0, cmp_mask1;
+ coeff[0] = _mm256_slli_epi32(*qcoeff0, AOM_QM_BITS);
+ cmp_mask0 = _mm256_cmpgt_epi32(coeff[0], threshold[0]);
+ coeff[1] = _mm256_slli_epi32(*qcoeff1, AOM_QM_BITS);
+ cmp_mask1 = _mm256_cmpgt_epi32(coeff[1], threshold[1]);
+ cmp_mask0 =
+ _mm256_permute4x64_epi64(_mm256_packs_epi32(cmp_mask0, cmp_mask1), 0xd8);
+ highbd_update_mask1_avx2(&cmp_mask0, iscan_ptr, is_found, mask);
+}
+
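+// Descriptive note (added for clarity): per-lane (x * y) >> shift with 64-bit
+// intermediates — the even and odd 32-bit lanes are multiplied separately with
+// 32x32->64 multiplies and the shifted results are blended back together.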
+static INLINE void highbd_mul_shift_avx2(const __m256i *x, const __m256i *y,
+ __m256i *p, const int shift) {
+ __m256i prod_lo = _mm256_mul_epi32(*x, *y);
+ __m256i prod_hi = _mm256_srli_epi64(*x, 32);
+ const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
+ prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
+
+ prod_lo = _mm256_srli_epi64(prod_lo, shift);
+ prod_hi = _mm256_srli_epi64(prod_hi, shift);
+
+ prod_hi = _mm256_slli_epi64(prod_hi, 32);
+ *p = _mm256_blend_epi32(prod_lo, prod_hi, 0xaa);
+}
+
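+// Descriptive note (added for clarity): quantizer core —
+//   tmp      = ((|coeff| + round) * quant) >> 16
+//   |qcoeff| = ((tmp + |coeff| + round) * shift) >> (16 - log_scale)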
+static INLINE void highbd_calculate_qcoeff_avx2(__m256i *coeff,
+ const __m256i *round,
+ const __m256i *quant,
+ const __m256i *shift,
+ const int *log_scale) {
+ __m256i tmp, qcoeff;
+ qcoeff = _mm256_add_epi32(*coeff, *round);
+ highbd_mul_shift_avx2(&qcoeff, quant, &tmp, 16);
+ qcoeff = _mm256_add_epi32(tmp, qcoeff);
+ highbd_mul_shift_avx2(&qcoeff, shift, coeff, 16 - *log_scale);
+}
+
+static INLINE __m256i highbd_calculate_dqcoeff_avx2(__m256i qcoeff,
+ __m256i dequant) {
+ return _mm256_mullo_epi32(qcoeff, dequant);
+}
+
+static INLINE __m256i highbd_calculate_dqcoeff_log_scale_avx2(
+ __m256i qcoeff, __m256i dequant, const int log_scale) {
+ __m256i abs_coeff = _mm256_abs_epi32(qcoeff);
+ highbd_mul_shift_avx2(&abs_coeff, &dequant, &abs_coeff, log_scale);
+ return _mm256_sign_epi32(abs_coeff, qcoeff);
+}
+
+static INLINE void highbd_store_coefficients_avx2(__m256i coeff0,
+ __m256i coeff1,
+ tran_low_t *coeff_ptr) {
+ _mm256_store_si256((__m256i *)(coeff_ptr), coeff0);
+ _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff1);
+}
+
+void aom_highbd_quantize_b_adaptive_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int index = 16;
+ int non_zero_count = 0;
+ int non_zero_count_prescan_add_zero = 0;
+ int is_found0 = 0, is_found1 = 0;
+ int eob = -1;
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i zbin, round, quant, dequant, shift;
+ __m256i coeff0, qcoeff0, coeff1, qcoeff1;
+ __m256i cmp_mask, mask0 = zero, mask1 = zero;
+ __m128i temp_mask0, temp_mask1;
+ int prescan_add[2];
+ int thresh[2];
+ const int log_scale = 0;
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ for (int i = 0; i < 2; ++i) {
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+ thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1;
+ }
+ __m256i threshold[2];
+ threshold[0] = _mm256_set1_epi32(thresh[0]);
+ threshold[1] = _mm256_set1_epi32(thresh[1]);
+ threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe);
+
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;
+#endif
+
+ // Setup global values.
+ highbd_load_b_values_avx2(zbin_ptr, &zbin, round_ptr, &round, quant_ptr,
+ &quant, dequant_ptr, &dequant, quant_shift_ptr,
+ &shift);
+
+ // Do DC and first 15 AC.
+ coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr));
+ qcoeff0 = _mm256_abs_epi32(coeff0);
+ coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + 8));
+ qcoeff1 = _mm256_abs_epi32(coeff1);
+ highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0,
+ &mask0);
+ __m256i temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin);
+ zbin = _mm256_unpackhi_epi64(zbin, zbin);
+ __m256i temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin);
+ cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8);
+ highbd_update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1);
+ threshold[0] = threshold[1];
+ if (_mm256_movemask_epi8(cmp_mask) == 0) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr), zero);
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero);
+ round = _mm256_unpackhi_epi64(round, round);
+ quant = _mm256_unpackhi_epi64(quant, quant);
+ shift = _mm256_unpackhi_epi64(shift, shift);
+ dequant = _mm256_unpackhi_epi64(dequant, dequant);
+ } else {
+ highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale);
+ round = _mm256_unpackhi_epi64(round, round);
+ quant = _mm256_unpackhi_epi64(quant, quant);
+ shift = _mm256_unpackhi_epi64(shift, shift);
+ highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale);
+ // Reinsert signs
+ qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0);
+ qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1);
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm256_and_si256(qcoeff0, temp0);
+ qcoeff1 = _mm256_and_si256(qcoeff1, temp1);
+ highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr);
+ coeff0 = highbd_calculate_dqcoeff_avx2(qcoeff0, dequant);
+ dequant = _mm256_unpackhi_epi64(dequant, dequant);
+ coeff1 = highbd_calculate_dqcoeff_avx2(qcoeff1, dequant);
+ highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr);
+ }
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr + index));
+ qcoeff0 = _mm256_abs_epi32(coeff0);
+ coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + index + 8));
+ qcoeff1 = _mm256_abs_epi32(coeff1);
+ highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan + index,
+ &is_found0, &mask0);
+ temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin);
+ temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin);
+ cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8);
+ highbd_update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1);
+ if (_mm256_movemask_epi8(cmp_mask) == 0) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero);
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero);
+ index += 16;
+ continue;
+ }
+ highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale);
+ highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale);
+ qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0);
+ qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1);
+ qcoeff0 = _mm256_and_si256(qcoeff0, temp0);
+ qcoeff1 = _mm256_and_si256(qcoeff1, temp1);
+ highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr + index);
+ coeff0 = highbd_calculate_dqcoeff_avx2(qcoeff0, dequant);
+ coeff1 = highbd_calculate_dqcoeff_avx2(qcoeff1, dequant);
+ highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr + index);
+ index += 16;
+ }
+ if (is_found0) {
+ temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0),
+ _mm256_extracti128_si256(mask0, 1));
+ non_zero_count = calculate_non_zero_count(temp_mask0);
+ }
+ if (is_found1) {
+ temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1),
+ _mm256_extracti128_si256(mask1, 1));
+ non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1);
+ }
+
+ for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+ const int rc = scan[i];
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+
+ for (int i = non_zero_count - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ if (qcoeff_ptr[rc]) {
+ eob = i;
+ break;
+ }
+ }
+
+ *eob_ptr = eob + 1;
+#if SKIP_EOB_FACTOR_ADJUST
+  // TODO(Aniket): Experiment with rewriting the following loop using
+  // intrinsics, combining it with the quantization loop above.
+ for (int i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int qcoeff = qcoeff_ptr[rc];
+ if (qcoeff) {
+ first = i;
+ break;
+ }
+ }
+ if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
+ const int rc = scan[(*eob_ptr - 1)];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const int coeff = coeff_ptr[rc] * wt;
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (abs_coeff <
+ (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ *eob_ptr = 0;
+ }
+ }
+ }
+#endif
+}
+
+void aom_highbd_quantize_b_32x32_adaptive_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int index = 16;
+ int non_zero_count = 0;
+ int non_zero_count_prescan_add_zero = 0;
+ int is_found0 = 0, is_found1 = 0;
+ int eob = -1;
+ const int log_scale = 1;
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i zbin, round, quant, dequant, shift;
+ __m256i coeff0, qcoeff0, coeff1, qcoeff1;
+ __m256i cmp_mask, mask0 = zero, mask1 = zero;
+ __m128i temp_mask0, temp_mask1;
+ const __m256i one = _mm256_set1_epi32(1);
+ const __m256i log_scale_vec = _mm256_set1_epi32(log_scale);
+ int prescan_add[2];
+ int thresh[2];
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ for (int i = 0; i < 2; ++i) {
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+ thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
+ }
+ __m256i threshold[2];
+ threshold[0] = _mm256_set1_epi32(thresh[0]);
+ threshold[1] = _mm256_set1_epi32(thresh[1]);
+ threshold[0] = _mm256_blend_epi32(threshold[0], threshold[1], 0xfe);
+
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;
+#endif
+
+ // Setup global values.
+ zbin = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)zbin_ptr));
+ round = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)round_ptr));
+ quant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_ptr));
+ dequant = _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)dequant_ptr));
+ shift =
+ _mm256_cvtepi16_epi32(_mm_load_si128((const __m128i *)quant_shift_ptr));
+
+ // Shift with rounding.
+ zbin = _mm256_add_epi32(zbin, log_scale_vec);
+ round = _mm256_add_epi32(round, log_scale_vec);
+ zbin = _mm256_srli_epi32(zbin, log_scale);
+ round = _mm256_srli_epi32(round, log_scale);
+ zbin = _mm256_sub_epi32(zbin, one);
+
+ // Do DC and first 15 AC.
+ coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr));
+ qcoeff0 = _mm256_abs_epi32(coeff0);
+ coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + 8));
+ qcoeff1 = _mm256_abs_epi32(coeff1);
+ highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0,
+ &mask0);
+ __m256i temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin);
+ zbin = _mm256_permute2x128_si256(zbin, zbin, 0x11);
+ __m256i temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin);
+ cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8);
+ highbd_update_mask1_avx2(&cmp_mask, iscan, &is_found1, &mask1);
+ threshold[0] = threshold[1];
+ if (_mm256_movemask_epi8(cmp_mask) == 0) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr), zero);
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), zero);
+ round = _mm256_permute2x128_si256(round, round, 0x11);
+ quant = _mm256_permute2x128_si256(quant, quant, 0x11);
+ shift = _mm256_permute2x128_si256(shift, shift, 0x11);
+ dequant = _mm256_permute2x128_si256(dequant, dequant, 0x11);
+ } else {
+ highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale);
+ round = _mm256_permute2x128_si256(round, round, 0x11);
+ quant = _mm256_permute2x128_si256(quant, quant, 0x11);
+ shift = _mm256_permute2x128_si256(shift, shift, 0x11);
+ highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale);
+ // Reinsert signs
+ qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0);
+ qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1);
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm256_and_si256(qcoeff0, temp0);
+ qcoeff1 = _mm256_and_si256(qcoeff1, temp1);
+ highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr);
+ coeff0 =
+ highbd_calculate_dqcoeff_log_scale_avx2(qcoeff0, dequant, log_scale);
+ dequant = _mm256_permute2x128_si256(dequant, dequant, 0x11);
+ coeff1 =
+ highbd_calculate_dqcoeff_log_scale_avx2(qcoeff1, dequant, log_scale);
+ highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr);
+ }
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = _mm256_load_si256((__m256i *)(coeff_ptr + index));
+ qcoeff0 = _mm256_abs_epi32(coeff0);
+ coeff1 = _mm256_load_si256((__m256i *)(coeff_ptr + index + 8));
+ qcoeff1 = _mm256_abs_epi32(coeff1);
+ highbd_update_mask0_avx2(&qcoeff0, &qcoeff1, threshold, iscan + index,
+ &is_found0, &mask0);
+ temp0 = _mm256_cmpgt_epi32(qcoeff0, zbin);
+ temp1 = _mm256_cmpgt_epi32(qcoeff1, zbin);
+ cmp_mask = _mm256_permute4x64_epi64(_mm256_packs_epi32(temp0, temp1), 0xd8);
+ highbd_update_mask1_avx2(&cmp_mask, iscan + index, &is_found1, &mask1);
+ if (_mm256_movemask_epi8(cmp_mask) == 0) {
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index), zero);
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + index + 8), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index), zero);
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + index + 8), zero);
+ index += 16;
+ continue;
+ }
+ highbd_calculate_qcoeff_avx2(&qcoeff0, &round, &quant, &shift, &log_scale);
+ highbd_calculate_qcoeff_avx2(&qcoeff1, &round, &quant, &shift, &log_scale);
+ qcoeff0 = _mm256_sign_epi32(qcoeff0, coeff0);
+ qcoeff1 = _mm256_sign_epi32(qcoeff1, coeff1);
+ qcoeff0 = _mm256_and_si256(qcoeff0, temp0);
+ qcoeff1 = _mm256_and_si256(qcoeff1, temp1);
+ highbd_store_coefficients_avx2(qcoeff0, qcoeff1, qcoeff_ptr + index);
+ coeff0 =
+ highbd_calculate_dqcoeff_log_scale_avx2(qcoeff0, dequant, log_scale);
+ coeff1 =
+ highbd_calculate_dqcoeff_log_scale_avx2(qcoeff1, dequant, log_scale);
+ highbd_store_coefficients_avx2(coeff0, coeff1, dqcoeff_ptr + index);
+ index += 16;
+ }
+ if (is_found0) {
+ temp_mask0 = _mm_max_epi16(_mm256_castsi256_si128(mask0),
+ _mm256_extracti128_si256(mask0, 1));
+ non_zero_count = calculate_non_zero_count(temp_mask0);
+ }
+ if (is_found1) {
+ temp_mask1 = _mm_max_epi16(_mm256_castsi256_si128(mask1),
+ _mm256_extracti128_si256(mask1, 1));
+ non_zero_count_prescan_add_zero = calculate_non_zero_count(temp_mask1);
+ }
+
+ for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+ const int rc = scan[i];
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+
+ for (int i = non_zero_count - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ if (qcoeff_ptr[rc]) {
+ eob = i;
+ break;
+ }
+ }
+
+ *eob_ptr = eob + 1;
+#if SKIP_EOB_FACTOR_ADJUST
+  // TODO(Aniket): Experiment with rewriting the following loop using
+  // intrinsics, combining it with the quantization loop above.
+ for (int i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int qcoeff = qcoeff_ptr[rc];
+ if (qcoeff) {
+ first = i;
+ break;
+ }
+ }
+ if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
+ const int rc = scan[(*eob_ptr - 1)];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const int coeff = coeff_ptr[rc] * wt;
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ *eob_ptr = 0;
+ }
+ }
+ }
+#endif
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_adaptive_quantize_sse2.c b/third_party/aom/aom_dsp/x86/highbd_adaptive_quantize_sse2.c
new file mode 100644
index 0000000000..ae31116e9d
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_adaptive_quantize_sse2.c
@@ -0,0 +1,732 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/quantize.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
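+// Descriptive note (added for clarity): conditionally negates each 64-bit
+// lane; sign must be 0 or all ones per lane (two's-complement negation via
+// xor + subtract).
+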
+static INLINE __m128i highbd_invert_sign_64bit_sse2(__m128i a, __m128i sign) {
+ a = _mm_xor_si128(a, sign);
+ return _mm_sub_epi64(a, sign);
+}
+
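+// Descriptive note (added for clarity): SSE2 version of the per-lane
+// (x * y) >> shift. SSE2 only has an unsigned 32x32->64 multiply, so the sign
+// of y is stripped first and re-applied to the 64-bit products before
+// shifting.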
+static INLINE void highbd_mul_shift_sse2(const __m128i *x, const __m128i *y,
+ __m128i *p, const int shift) {
+ __m128i sign = _mm_srai_epi32(*y, 31);
+ __m128i sign_lo = _mm_unpacklo_epi32(sign, sign);
+ __m128i sign_hi = _mm_unpackhi_epi32(sign, sign);
+ __m128i abs_y = invert_sign_32_sse2(*y, sign);
+ __m128i prod_lo = _mm_mul_epu32(*x, abs_y);
+ __m128i prod_hi = _mm_srli_epi64(*x, 32);
+ const __m128i mult_hi = _mm_srli_epi64(abs_y, 32);
+ prod_hi = _mm_mul_epu32(prod_hi, mult_hi);
+ prod_lo = highbd_invert_sign_64bit_sse2(prod_lo, sign_lo);
+ prod_hi = highbd_invert_sign_64bit_sse2(prod_hi, sign_hi);
+
+ prod_lo = _mm_srli_epi64(prod_lo, shift);
+ const __m128i mask = _mm_set_epi32(0, -1, 0, -1);
+ prod_lo = _mm_and_si128(prod_lo, mask);
+ prod_hi = _mm_srli_epi64(prod_hi, shift);
+
+ prod_hi = _mm_slli_epi64(prod_hi, 32);
+ *p = _mm_or_si128(prod_lo, prod_hi);
+}
+
+static INLINE void highbd_calculate_qcoeff(__m128i *coeff, const __m128i *round,
+ const __m128i *quant,
+ const __m128i *shift,
+ const int *log_scale) {
+ __m128i tmp, qcoeff;
+ qcoeff = _mm_add_epi32(*coeff, *round);
+ highbd_mul_shift_sse2(&qcoeff, quant, &tmp, 16);
+ qcoeff = _mm_add_epi32(tmp, qcoeff);
+ highbd_mul_shift_sse2(&qcoeff, shift, coeff, 16 - *log_scale);
+}
+
+static INLINE void highbd_update_mask1(__m128i *cmp_mask0,
+ const int16_t *iscan_ptr, int *is_found,
+ __m128i *mask) {
+ __m128i temp_mask = _mm_setzero_si128();
+ if (_mm_movemask_epi8(*cmp_mask0)) {
+ __m128i iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr));
+ __m128i mask0 = _mm_and_si128(*cmp_mask0, iscan0);
+ temp_mask = mask0;
+ *is_found = 1;
+ }
+ *mask = _mm_max_epi16(temp_mask, *mask);
+}
+
+static INLINE void highbd_update_mask0(__m128i *qcoeff0, __m128i *qcoeff1,
+ __m128i *threshold,
+ const int16_t *iscan_ptr, int *is_found,
+ __m128i *mask) {
+ __m128i coeff[2], cmp_mask0, cmp_mask1;
+
+ coeff[0] = _mm_slli_epi32(*qcoeff0, AOM_QM_BITS);
+ cmp_mask0 = _mm_cmpgt_epi32(coeff[0], threshold[0]);
+ coeff[1] = _mm_slli_epi32(*qcoeff1, AOM_QM_BITS);
+ cmp_mask1 = _mm_cmpgt_epi32(coeff[1], threshold[1]);
+
+ cmp_mask0 = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+
+ highbd_update_mask1(&cmp_mask0, iscan_ptr, is_found, mask);
+}
+
+static INLINE __m128i highbd_calculate_dqcoeff(__m128i qcoeff, __m128i dequant,
+ const int log_scale) {
+ __m128i coeff_sign = _mm_srai_epi32(qcoeff, 31);
+ __m128i abs_coeff = invert_sign_32_sse2(qcoeff, coeff_sign);
+ highbd_mul_shift_sse2(&abs_coeff, &dequant, &abs_coeff, log_scale);
+ return invert_sign_32_sse2(abs_coeff, coeff_sign);
+}
+
+void aom_highbd_quantize_b_adaptive_sse2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int index = 8;
+ const int log_scale = 0;
+ int non_zero_count = 0;
+ int non_zero_count_prescan_add_zero = 0;
+ int is_found0 = 0, is_found1 = 0;
+ int eob = -1;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi32(1);
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1, cmp_mask;
+ __m128i all_zero;
+ __m128i mask0 = zero, mask1 = zero;
+
+ int prescan_add[2];
+ int thresh[4];
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ for (int i = 0; i < 2; ++i) {
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+ thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1;
+ }
+ thresh[2] = thresh[3] = thresh[1];
+ __m128i threshold[2];
+ threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
+ threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
+
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;
+#endif
+ // Setup global values.
+ zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+
+ __m128i zbin_sign = _mm_srai_epi16(zbin, 15);
+ __m128i round_sign = _mm_srai_epi16(round, 15);
+ __m128i quant_sign = _mm_srai_epi16(quant, 15);
+ __m128i dequant_sign = _mm_srai_epi16(dequant, 15);
+ __m128i shift_sign = _mm_srai_epi16(shift, 15);
+
+ zbin = _mm_unpacklo_epi16(zbin, zbin_sign);
+ round = _mm_unpacklo_epi16(round, round_sign);
+ quant = _mm_unpacklo_epi16(quant, quant_sign);
+ dequant = _mm_unpacklo_epi16(dequant, dequant_sign);
+ shift = _mm_unpacklo_epi16(shift, shift_sign);
+ zbin = _mm_sub_epi32(zbin, one);
+
+  // Do DC and first 7 AC.
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr));
+ coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4));
+
+ coeff0_sign = _mm_srai_epi32(coeff0, 31);
+ coeff1_sign = _mm_srai_epi32(coeff1, 31);
+ qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
+
+ highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
+ cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+ highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1);
+
+ threshold[0] = threshold[1];
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
+
+ // Reinsert signs
+ qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1);
+
+ coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1);
+ }
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index));
+ coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4));
+
+ coeff0_sign = _mm_srai_epi32(coeff0, 31);
+ coeff1_sign = _mm_srai_epi32(coeff1, 31);
+ qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
+
+ highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index,
+ &is_found0, &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
+ cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+ highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+ index += 8;
+ continue;
+ }
+ highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
+ highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
+
+ qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1);
+
+ coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
+ coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
+
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1);
+
+ index += 8;
+ }
+ if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
+ if (is_found1)
+ non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
+
+ for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+ const int rc = scan[i];
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+
+ for (int i = non_zero_count - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ if (qcoeff_ptr[rc]) {
+ eob = i;
+ break;
+ }
+ }
+
+ *eob_ptr = eob + 1;
+#if SKIP_EOB_FACTOR_ADJUST
+  // TODO(Aniket): Experiment with rewriting the following loop using
+  // intrinsics, combining it with the quantization loop above.
+ for (int i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int qcoeff = qcoeff_ptr[rc];
+ if (qcoeff) {
+ first = i;
+ break;
+ }
+ }
+ if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
+ const int rc = scan[(*eob_ptr - 1)];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const int coeff = coeff_ptr[rc] * wt;
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (abs_coeff <
+ (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ *eob_ptr = 0;
+ }
+ }
+ }
+#endif
+}
+
+void aom_highbd_quantize_b_32x32_adaptive_sse2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int index = 8;
+ const int log_scale = 1;
+ int non_zero_count = 0;
+ int non_zero_count_prescan_add_zero = 0;
+ int is_found0 = 0, is_found1 = 0;
+ int eob = -1;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi32(1);
+ const __m128i log_scale_vec = _mm_set1_epi32(log_scale);
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1, cmp_mask;
+ __m128i all_zero;
+ __m128i mask0 = zero, mask1 = zero;
+
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+ int prescan_add[2];
+ int thresh[4];
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ for (int i = 0; i < 2; ++i) {
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+ thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
+ }
+ thresh[2] = thresh[3] = thresh[1];
+ __m128i threshold[2];
+ threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
+ threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
+
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;
+#endif
+ // Setup global values.
+ zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+
+ __m128i zbin_sign = _mm_srai_epi16(zbin, 15);
+ __m128i round_sign = _mm_srai_epi16(round, 15);
+ __m128i quant_sign = _mm_srai_epi16(quant, 15);
+ __m128i dequant_sign = _mm_srai_epi16(dequant, 15);
+ __m128i shift_sign = _mm_srai_epi16(shift, 15);
+
+ zbin = _mm_unpacklo_epi16(zbin, zbin_sign);
+ round = _mm_unpacklo_epi16(round, round_sign);
+ quant = _mm_unpacklo_epi16(quant, quant_sign);
+ dequant = _mm_unpacklo_epi16(dequant, dequant_sign);
+ shift = _mm_unpacklo_epi16(shift, shift_sign);
+
+ // Shift with rounding.
+ zbin = _mm_add_epi32(zbin, log_scale_vec);
+ round = _mm_add_epi32(round, log_scale_vec);
+ zbin = _mm_srli_epi32(zbin, log_scale);
+ round = _mm_srli_epi32(round, log_scale);
+ zbin = _mm_sub_epi32(zbin, one);
+
+  // Do DC and first 7 AC.
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr));
+ coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4));
+
+ coeff0_sign = _mm_srai_epi32(coeff0, 31);
+ coeff1_sign = _mm_srai_epi32(coeff1, 31);
+ qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
+
+ highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
+ cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+ highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1);
+
+ threshold[0] = threshold[1];
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
+
+ // Reinsert signs
+ qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1);
+
+ coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1);
+ }
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index));
+ coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4));
+
+ coeff0_sign = _mm_srai_epi32(coeff0, 31);
+ coeff1_sign = _mm_srai_epi32(coeff1, 31);
+ qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
+
+ highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index,
+ &is_found0, &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
+ cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+ highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+ index += 8;
+ continue;
+ }
+ highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
+ highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
+
+ qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1);
+
+ coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
+ coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
+
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1);
+
+ index += 8;
+ }
+ if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
+ if (is_found1)
+ non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
+
+ for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+ const int rc = scan[i];
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+
+ for (int i = non_zero_count - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ if (qcoeff_ptr[rc]) {
+ eob = i;
+ break;
+ }
+ }
+
+ *eob_ptr = eob + 1;
+#if SKIP_EOB_FACTOR_ADJUST
+  // TODO(Aniket): Experiment with rewriting the following loop using
+  // intrinsics, combining it with the quantization loop above.
+ for (int i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int qcoeff = qcoeff_ptr[rc];
+ if (qcoeff) {
+ first = i;
+ break;
+ }
+ }
+ if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
+ const int rc = scan[(*eob_ptr - 1)];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const int coeff = coeff_ptr[rc] * wt;
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ *eob_ptr = 0;
+ }
+ }
+ }
+#endif
+}
+
+void aom_highbd_quantize_b_64x64_adaptive_sse2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int index = 8;
+ const int log_scale = 2;
+ int non_zero_count = 0;
+ int non_zero_count_prescan_add_zero = 0;
+ int is_found0 = 0, is_found1 = 0;
+ int eob = -1;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi32(1);
+ const __m128i log_scale_vec = _mm_set1_epi32(log_scale);
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1, cmp_mask;
+ __m128i all_zero;
+ __m128i mask0 = zero, mask1 = zero;
+
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+ int prescan_add[2];
+ int thresh[4];
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ for (int i = 0; i < 2; ++i) {
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+ thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
+ }
+ thresh[2] = thresh[3] = thresh[1];
+ __m128i threshold[2];
+ threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
+ threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
+
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;
+#endif
+ // Setup global values.
+ zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+
+ __m128i zbin_sign = _mm_srai_epi16(zbin, 15);
+ __m128i round_sign = _mm_srai_epi16(round, 15);
+ __m128i quant_sign = _mm_srai_epi16(quant, 15);
+ __m128i dequant_sign = _mm_srai_epi16(dequant, 15);
+ __m128i shift_sign = _mm_srai_epi16(shift, 15);
+
+ zbin = _mm_unpacklo_epi16(zbin, zbin_sign);
+ round = _mm_unpacklo_epi16(round, round_sign);
+ quant = _mm_unpacklo_epi16(quant, quant_sign);
+ dequant = _mm_unpacklo_epi16(dequant, dequant_sign);
+ shift = _mm_unpacklo_epi16(shift, shift_sign);
+
+ // Shift with rounding.
+ zbin = _mm_add_epi32(zbin, log_scale_vec);
+ round = _mm_add_epi32(round, log_scale_vec);
+ zbin = _mm_srli_epi32(zbin, log_scale);
+ round = _mm_srli_epi32(round, log_scale);
+ zbin = _mm_sub_epi32(zbin, one);
+
+  // Do DC and first 7 AC.
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr));
+ coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + 4));
+
+ coeff0_sign = _mm_srai_epi32(coeff0, 31);
+ coeff1_sign = _mm_srai_epi32(coeff1, 31);
+ qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
+
+ highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
+ cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+ highbd_update_mask1(&cmp_mask, iscan, &is_found1, &mask1);
+
+ threshold[0] = threshold[1];
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
+
+ // Reinsert signs
+ qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ _mm_store_si128((__m128i *)(qcoeff_ptr), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), qcoeff1);
+
+ coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), coeff1);
+ }
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = _mm_load_si128((__m128i *)(coeff_ptr + index));
+ coeff1 = _mm_load_si128((__m128i *)(coeff_ptr + index + 4));
+
+ coeff0_sign = _mm_srai_epi32(coeff0, 31);
+ coeff1_sign = _mm_srai_epi32(coeff1, 31);
+ qcoeff0 = invert_sign_32_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(coeff1, coeff1_sign);
+
+ highbd_update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index,
+ &is_found0, &mask0);
+
+ cmp_mask0 = _mm_cmpgt_epi32(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi32(qcoeff1, zbin);
+ cmp_mask = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+ highbd_update_mask1(&cmp_mask, iscan + index, &is_found1, &mask1);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+ index += 8;
+ continue;
+ }
+ highbd_calculate_qcoeff(&qcoeff0, &round, &quant, &shift, &log_scale);
+ highbd_calculate_qcoeff(&qcoeff1, &round, &quant, &shift, &log_scale);
+
+ qcoeff0 = invert_sign_32_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_32_sse2(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), qcoeff0);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), qcoeff1);
+
+ coeff0 = highbd_calculate_dqcoeff(qcoeff0, dequant, log_scale);
+ coeff1 = highbd_calculate_dqcoeff(qcoeff1, dequant, log_scale);
+
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), coeff0);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), coeff1);
+
+ index += 8;
+ }
+ if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
+ if (is_found1)
+ non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
+
+ for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+ const int rc = scan[i];
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+
+ for (int i = non_zero_count - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ if (qcoeff_ptr[rc]) {
+ eob = i;
+ break;
+ }
+ }
+
+ *eob_ptr = eob + 1;
+#if SKIP_EOB_FACTOR_ADJUST
+  // TODO(Aniket): Experiment with rewriting the following loop using
+  // intrinsics, combining it with the quantization loop above.
+ for (int i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int qcoeff = qcoeff_ptr[rc];
+ if (qcoeff) {
+ first = i;
+ break;
+ }
+ }
+ if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
+ const int rc = scan[(*eob_ptr - 1)];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
+ const int coeff = coeff_ptr[rc] * wt;
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ *eob_ptr = 0;
+ }
+ }
+ }
+#endif
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
new file mode 100644
index 0000000000..11e45778c0
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_convolve_avx2.c
@@ -0,0 +1,1248 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+#include <string.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/convolve.h"
+#include "aom_dsp/x86/convolve_avx2.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// -----------------------------------------------------------------------------
+// Copy and average
+
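+// Descriptive note (added for clarity): byte-shuffle masks that gather
+// overlapping 16-bit pixel pairs so pmaddwd can apply two filter taps at a
+// time (the (f2, f3) and (f4, f5) tap pairs of the horizontal filter).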
+static const uint8_t ip_shuffle_f2f3[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
+ 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
+ 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };
+static const uint8_t ip_shuffle_f4f5[32] = { 4, 5, 6, 7, 6, 7, 8, 9,
+ 8, 9, 10, 11, 10, 11, 12, 13,
+ 4, 5, 6, 7, 6, 7, 8, 9,
+ 8, 9, 10, 11, 10, 11, 12, 13 };
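+// The two byte-shuffle patterns above build, for each output pixel, the
+// adjacent-sample pairs that are multiplied with filter taps (2,3) and (4,5);
+// they are used by the odd-height tail of the 4-tap horizontal kernels below.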
+
+void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const int subpel_x_qn,
+ ConvolveParams *conv_params, int bd);
+void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_y_qn, int bd);
+
+void av1_highbd_convolve_y_sr_avx2(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_y_qn, int bd) {
+ if (filter_params_y->taps == 12) {
+ av1_highbd_convolve_y_sr_ssse3(src, src_stride, dst, dst_stride, w, h,
+ filter_params_y, subpel_y_qn, bd);
+ return;
+ }
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride;
+
+ __m256i s[8], coeffs_y[4];
+
+ const int bits = FILTER_BITS;
+
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
+ const __m256i clip_pixel =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m256i zero = _mm256_setzero_si256();
+
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
+
+ for (j = 0; j < w; j += 8) {
+ const uint16_t *data = &src_ptr[j];
+ /* Vertical filter */
+ {
+ __m256i src6;
+ __m256i s01 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+ 0x20);
+ __m256i s12 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+ 0x20);
+ __m256i s23 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+ 0x20);
+ __m256i s34 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+ 0x20);
+ __m256i s45 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ 0x20);
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+ __m256i s56 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+ src6, 0x20);
+
+ s[0] = _mm256_unpacklo_epi16(s01, s12);
+ s[1] = _mm256_unpacklo_epi16(s23, s34);
+ s[2] = _mm256_unpacklo_epi16(s45, s56);
+
+ s[4] = _mm256_unpackhi_epi16(s01, s12);
+ s[5] = _mm256_unpackhi_epi16(s23, s34);
+ s[6] = _mm256_unpackhi_epi16(s45, s56);
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+
+ const __m256i s67 = _mm256_permute2x128_si256(
+ src6,
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ 0x20);
+
+ src6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
+
+ const __m256i s78 = _mm256_permute2x128_si256(
+ _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+ src6, 0x20);
+
+ s[3] = _mm256_unpacklo_epi16(s67, s78);
+ s[7] = _mm256_unpackhi_epi16(s67, s78);
+
+ const __m256i res_a = convolve(s, coeffs_y);
+
+ __m256i res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);
+
+ if (w - j > 4) {
+ const __m256i res_b = convolve(s + 4, coeffs_y);
+ __m256i res_b_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_b, round_const_bits), round_shift_bits);
+
+ __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
+ res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
+ res_16bit = _mm256_max_epi16(res_16bit, zero);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res_16bit));
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res_16bit, 1));
+ } else if (w == 4) {
+ res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
+ res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
+ res_a_round = _mm256_max_epi16(res_a_round, zero);
+
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res_a_round));
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res_a_round, 1));
+ } else {
+ res_a_round = _mm256_packs_epi32(res_a_round, res_a_round);
+ res_a_round = _mm256_min_epi16(res_a_round, clip_pixel);
+ res_a_round = _mm256_max_epi16(res_a_round, zero);
+
+ xx_storel_32(&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res_a_round));
+ xx_storel_32(&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res_a_round, 1));
+ }
+
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+ }
+ }
+ }
+}
+
+void av1_highbd_convolve_x_sr_avx2(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const int subpel_x_qn,
+ ConvolveParams *conv_params, int bd) {
+ if (filter_params_x->taps == 12) {
+ av1_highbd_convolve_x_sr_ssse3(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, subpel_x_qn, conv_params,
+ bd);
+ return;
+ }
+ int i, j;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_horiz;
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+ __m256i s[4], coeffs_x[4];
+
+ const __m256i round_const_x =
+ _mm256_set1_epi32(((1 << conv_params->round_0) >> 1));
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+ const int bits = FILTER_BITS - conv_params->round_0;
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
+ const __m256i clip_pixel =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m256i zero = _mm256_setzero_si256();
+
+ assert(bits >= 0);
+ assert((FILTER_BITS - conv_params->round_1) >= 0 ||
+ ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS));
+
+ prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
+
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ for (i = 0; i < h; i += 2) {
+ const __m256i row0 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
+ __m256i row1 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
+
+ const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
+ const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
+
+ // even pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 0);
+ s[1] = _mm256_alignr_epi8(r1, r0, 4);
+ s[2] = _mm256_alignr_epi8(r1, r0, 8);
+ s[3] = _mm256_alignr_epi8(r1, r0, 12);
+
+ __m256i res_even = convolve(s, coeffs_x);
+ res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
+ round_shift_x);
+
+ // odd pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 2);
+ s[1] = _mm256_alignr_epi8(r1, r0, 6);
+ s[2] = _mm256_alignr_epi8(r1, r0, 10);
+ s[3] = _mm256_alignr_epi8(r1, r0, 14);
+
+ __m256i res_odd = convolve(s, coeffs_x);
+ res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
+ round_shift_x);
+
+ res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_bits),
+ round_shift_bits);
+ res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_bits),
+ round_shift_bits);
+
+ __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
+ __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
+
+ __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);
+ res = _mm256_min_epi16(res, clip_pixel);
+ res = _mm256_max_epi16(res, zero);
+
+ if (w - j > 4) {
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res));
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res, 1));
+ } else if (w == 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j],
+ _mm256_castsi256_si128(res));
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res, 1));
+ } else {
+ xx_storel_32(&dst[i * dst_stride + j], _mm256_castsi256_si128(res));
+ xx_storel_32(&dst[i * dst_stride + j + dst_stride],
+ _mm256_extracti128_si256(res, 1));
+ }
+ }
+ }
+}
+
+#define CONV8_ROUNDING_BITS (7)
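+// The kernels below add 1 << (CONV8_ROUNDING_BITS - 1) = 64 and arithmetic-
+// shift right by 7, i.e. round-to-nearest division by 128 (the interpolation
+// filter taps are expected to sum to 1 << FILTER_BITS).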
+
+// -----------------------------------------------------------------------------
+// Horizontal and vertical filtering
+
+static const uint8_t signal_pattern_0[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
+ 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
+ 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };
+
+static const uint8_t signal_pattern_1[32] = { 4, 5, 6, 7, 6, 7, 8, 9,
+ 8, 9, 10, 11, 10, 11, 12, 13,
+ 4, 5, 6, 7, 6, 7, 8, 9,
+ 8, 9, 10, 11, 10, 11, 12, 13 };
+
+static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11,
+ 10, 11, 12, 13, 12, 13, 14, 15,
+ 6, 7, 8, 9, 8, 9, 10, 11,
+ 10, 11, 12, 13, 12, 13, 14, 15 };
+
+static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 };
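+// signal_index selects 32-bit words 2..5 of a 256-bit row into both 128-bit
+// lanes, i.e. the same samples advanced by four positions, so the shuffle
+// patterns above can cover all eight filter taps.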
+
+// -----------------------------------------------------------------------------
+// Horizontal Filtering
+
+static INLINE void pack_pixels(const __m256i *s, __m256i *p /*p[4]*/) {
+ const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
+ const __m256i sf0 = _mm256_loadu_si256((const __m256i *)signal_pattern_0);
+ const __m256i sf1 = _mm256_loadu_si256((const __m256i *)signal_pattern_1);
+ const __m256i c = _mm256_permutevar8x32_epi32(*s, idx);
+
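+  // Each p[k] holds interleaved pairs of adjacent samples, laid out so that a
+  // single _mm256_madd_epi16 against a broadcast coefficient pair accumulates
+  // two taps of the 8-tap sum at a time.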
+ p[0] = _mm256_shuffle_epi8(*s, sf0); // x0x6
+ p[1] = _mm256_shuffle_epi8(*s, sf1); // x1x7
+ p[2] = _mm256_shuffle_epi8(c, sf0); // x2x4
+ p[3] = _mm256_shuffle_epi8(c, sf1); // x3x5
+}
+
+// Note:
+// Shared by the 8x2 and 16x1 block paths
+static INLINE void pack_16_pixels(const __m256i *s0, const __m256i *s1,
+ __m256i *x /*x[8]*/) {
+ __m256i pp[8];
+ pack_pixels(s0, pp);
+ pack_pixels(s1, &pp[4]);
+ x[0] = _mm256_permute2x128_si256(pp[0], pp[4], 0x20);
+ x[1] = _mm256_permute2x128_si256(pp[1], pp[5], 0x20);
+ x[2] = _mm256_permute2x128_si256(pp[2], pp[6], 0x20);
+ x[3] = _mm256_permute2x128_si256(pp[3], pp[7], 0x20);
+ x[4] = x[2];
+ x[5] = x[3];
+ x[6] = _mm256_permute2x128_si256(pp[0], pp[4], 0x31);
+ x[7] = _mm256_permute2x128_si256(pp[1], pp[5], 0x31);
+}
+
+static INLINE void pack_8x1_pixels(const uint16_t *src, __m256i *x) {
+ __m256i pp[8];
+ __m256i s0;
+ s0 = _mm256_loadu_si256((const __m256i *)src);
+ pack_pixels(&s0, pp);
+ x[0] = _mm256_permute2x128_si256(pp[0], pp[2], 0x30);
+ x[1] = _mm256_permute2x128_si256(pp[1], pp[3], 0x30);
+ x[2] = _mm256_permute2x128_si256(pp[2], pp[0], 0x30);
+ x[3] = _mm256_permute2x128_si256(pp[3], pp[1], 0x30);
+}
+
+static INLINE void pack_8x2_pixels(const uint16_t *src, ptrdiff_t stride,
+ __m256i *x) {
+ __m256i s0, s1;
+ s0 = _mm256_loadu_si256((const __m256i *)src);
+ s1 = _mm256_loadu_si256((const __m256i *)(src + stride));
+ pack_16_pixels(&s0, &s1, x);
+}
+
+static INLINE void pack_16x1_pixels(const uint16_t *src, __m256i *x) {
+ __m256i s0, s1;
+ s0 = _mm256_loadu_si256((const __m256i *)src);
+ s1 = _mm256_loadu_si256((const __m256i *)(src + 8));
+ pack_16_pixels(&s0, &s1, x);
+}
+
+// Note:
+// Shared by horizontal and vertical filtering
+static INLINE void pack_filters(const int16_t *filter, __m256i *f /*f[4]*/) {
+ const __m128i h = _mm_loadu_si128((const __m128i *)filter);
+ const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
+ const __m256i p0 = _mm256_set1_epi32(0x03020100);
+ const __m256i p1 = _mm256_set1_epi32(0x07060504);
+ const __m256i p2 = _mm256_set1_epi32(0x0b0a0908);
+ const __m256i p3 = _mm256_set1_epi32(0x0f0e0d0c);
+ f[0] = _mm256_shuffle_epi8(hh, p0);
+ f[1] = _mm256_shuffle_epi8(hh, p1);
+ f[2] = _mm256_shuffle_epi8(hh, p2);
+ f[3] = _mm256_shuffle_epi8(hh, p3);
+}
+
+static INLINE void pack_filters_4tap(const int16_t *filter,
+ __m256i *f /*f[4]*/) {
+ const __m128i h = _mm_loadu_si128((const __m128i *)filter);
+ const __m256i coeff = _mm256_broadcastsi128_si256(h);
+
+ // coeffs 2 3 2 3 2 3 2 3
+ f[0] = _mm256_shuffle_epi32(coeff, 0x55);
+ // coeffs 4 5 4 5 4 5 4 5
+ f[1] = _mm256_shuffle_epi32(coeff, 0xaa);
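+  // Only the middle taps (2..5) of the 8-tap array are extracted; the outer
+  // taps are expected to be zero for the 4-tap kernels that call this.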
+}
+
+static INLINE void filter_8x1_pixels(const __m256i *sig /*sig[4]*/,
+ const __m256i *fil /*fil[4]*/,
+ __m256i *y) {
+ __m256i a, a0, a1;
+
+ a0 = _mm256_madd_epi16(fil[0], sig[0]);
+ a1 = _mm256_madd_epi16(fil[3], sig[3]);
+ a = _mm256_add_epi32(a0, a1);
+
+ a0 = _mm256_madd_epi16(fil[1], sig[1]);
+ a1 = _mm256_madd_epi16(fil[2], sig[2]);
+
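+  // Note: min + max below adds a0 + a1; accumulating the smaller product
+  // first presumably helps keep the intermediate 32-bit sums in range for
+  // 12-bit input.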
+ {
+ const __m256i min = _mm256_min_epi32(a0, a1);
+ a = _mm256_add_epi32(a, min);
+ }
+ {
+ const __m256i max = _mm256_max_epi32(a0, a1);
+ a = _mm256_add_epi32(a, max);
+ }
+ {
+ const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ a = _mm256_add_epi32(a, rounding);
+ *y = _mm256_srai_epi32(a, CONV8_ROUNDING_BITS);
+ }
+}
+
+static INLINE void store_8x1_pixels(const __m256i *y, const __m256i *mask,
+ uint16_t *dst) {
+ const __m128i a0 = _mm256_castsi256_si128(*y);
+ const __m128i a1 = _mm256_extractf128_si256(*y, 1);
+ __m128i res = _mm_packus_epi32(a0, a1);
+ res = _mm_min_epi16(res, _mm256_castsi256_si128(*mask));
+ _mm_storeu_si128((__m128i *)dst, res);
+}
+
+static INLINE void store_8x2_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst,
+ ptrdiff_t pitch) {
+ __m256i a = _mm256_packus_epi32(*y0, *y1);
+ a = _mm256_min_epi16(a, *mask);
+ _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(a));
+ _mm_storeu_si128((__m128i *)(dst + pitch), _mm256_extractf128_si256(a, 1));
+}
+
+static INLINE void store_16x1_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst) {
+ __m256i a = _mm256_packus_epi32(*y0, *y1);
+ a = _mm256_min_epi16(a, *mask);
+ _mm256_storeu_si256((__m256i *)dst, a);
+}
+
+static void aom_highbd_filter_block1d8_h8_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[8], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ src_ptr -= 3;
+ do {
+ pack_8x2_pixels(src_ptr, src_pitch, signal);
+ filter_8x1_pixels(signal, ff, &res0);
+ filter_8x1_pixels(&signal[4], ff, &res1);
+ store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ height -= 2;
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ } while (height > 1);
+
+ if (height > 0) {
+ pack_8x1_pixels(src_ptr, signal);
+ filter_8x1_pixels(signal, ff, &res0);
+ store_8x1_pixels(&res0, &max, dst_ptr);
+ }
+}
+
+static void aom_highbd_filter_block1d16_h8_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[8], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ src_ptr -= 3;
+ do {
+ pack_16x1_pixels(src_ptr, signal);
+ filter_8x1_pixels(signal, ff, &res0);
+ filter_8x1_pixels(&signal[4], ff, &res1);
+ store_16x1_pixels(&res0, &res1, &max, dst_ptr);
+ height -= 1;
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ } while (height > 0);
+}
+
+static void aom_highbd_filter_block1d4_h4_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ __m256i ff[2], s[2];
+ uint32_t i;
+ const __m256i clip_pixel =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m256i zero = _mm256_setzero_si256();
+
+ static const uint8_t shuffle_mask[32] = { 0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6,
+ 7, 6, 7, 8, 9, 0, 1, 2, 3, 2, 3,
+ 4, 5, 4, 5, 6, 7, 6, 7, 8, 9 };
+
+ __m256i mask = _mm256_loadu_si256((__m256i *)shuffle_mask);
+ __m256i ip_mask_f2f3 = _mm256_loadu_si256((__m256i *)ip_shuffle_f2f3);
+ __m256i ip_mask_f4f5 = _mm256_loadu_si256((__m256i *)ip_shuffle_f4f5);
+
+ pack_filters_4tap(filter, ff);
+ src_ptr -= 3;
+ for (i = 0; i <= (height - 2); i += 2) {
+ __m256i row0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 2]));
+ __m256i row1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)&src_ptr[(i + 1) * src_pitch + 2]));
+
+ s[0] = _mm256_inserti128_si256(row0, _mm256_castsi256_si128(row1), 1);
+ s[1] = _mm256_alignr_epi8(s[0], s[0], 4);
+
+ s[0] = _mm256_shuffle_epi8(s[0], mask);
+ s[1] = _mm256_shuffle_epi8(s[1], mask);
+
+ __m256i res = convolve_4tap(s, ff);
+ res =
+ _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS);
+
+ res = _mm256_packs_epi32(res, res);
+ res = _mm256_min_epi16(res, clip_pixel);
+ res = _mm256_max_epi16(res, zero);
+
+ _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
+ _mm256_castsi256_si128(res));
+ _mm_storel_epi64((__m128i *)&dst_ptr[(i + 1) * dst_pitch],
+ _mm256_extracti128_si256(res, 1));
+ }
+ if (height % 2 != 0) {
+ i = height - 1;
+ const __m256i row0_0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 2]));
+ const __m256i row0_1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_pitch + 6]));
+
+ const __m256i r0 =
+ _mm256_inserti128_si256(row0_0, _mm256_castsi256_si128(row0_1), 1);
+
+ s[0] = _mm256_shuffle_epi8(r0, ip_mask_f2f3);
+ s[1] = _mm256_shuffle_epi8(r0, ip_mask_f4f5);
+
+ __m256i res = convolve_4tap(s, ff);
+ res =
+ _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS);
+
+ res = _mm256_packs_epi32(res, res);
+ res = _mm256_min_epi16(res, clip_pixel);
+ res = _mm256_max_epi16(res, zero);
+
+ _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
+ _mm256_castsi256_si128(res));
+ }
+}
+
+static void aom_highbd_filter_block1d8_h4_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ __m256i ff[2], s[2];
+ uint32_t i = 0;
+ const __m256i clip_pixel =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m256i zero = _mm256_setzero_si256();
+
+ static const uint8_t shuffle_mask[32] = { 0, 1, 8, 9, 2, 3, 10, 11,
+ 4, 5, 12, 13, 6, 7, 14, 15,
+ 0, 1, 8, 9, 2, 3, 10, 11,
+ 4, 5, 12, 13, 6, 7, 14, 15 };
+
+ __m256i mask = _mm256_loadu_si256((__m256i *)shuffle_mask);
+ __m256i ip_mask_f2f3 = _mm256_loadu_si256((__m256i *)ip_shuffle_f2f3);
+ __m256i ip_mask_f4f5 = _mm256_loadu_si256((__m256i *)ip_shuffle_f4f5);
+
+ pack_filters_4tap(filter, ff);
+ src_ptr -= 3;
+
+ /* Horizontal filter */
+
+ for (i = 0; i <= (height - 2); i += 2) {
+ const __m256i row0 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 2]);
+ __m256i row1 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_pitch + 2]);
+
+ const __m256i r0 =
+ _mm256_inserti128_si256(row0, _mm256_castsi256_si128(row1), 1);
+ const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
+
+ // even pixels
+ s[0] = r0;
+ s[1] = _mm256_alignr_epi8(r1, r0, 4);
+
+ __m256i res_even = convolve_4tap(s, ff);
+ res_even = _mm256_srai_epi32(_mm256_add_epi32(res_even, rounding),
+ CONV8_ROUNDING_BITS);
+
+ // odd pixels
+ s[0] = _mm256_alignr_epi8(r1, r0, 2);
+ s[1] = _mm256_alignr_epi8(r1, r0, 6);
+
+ __m256i res_odd = convolve_4tap(s, ff);
+ res_odd = _mm256_srai_epi32(_mm256_add_epi32(res_odd, rounding),
+ CONV8_ROUNDING_BITS);
+
+ __m256i res = _mm256_packs_epi32(res_even, res_odd);
+ res = _mm256_shuffle_epi8(res, mask);
+
+ res = _mm256_min_epi16(res, clip_pixel);
+ res = _mm256_max_epi16(res, zero);
+
+ _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch],
+ _mm256_castsi256_si128(res));
+ _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch],
+ _mm256_extracti128_si256(res, 1));
+ }
+
+ if (height % 2 != 0) {
+ i = height - 1;
+ const __m256i row0_0 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 2]);
+ const __m256i row0_1 =
+ _mm256_loadu_si256((__m256i *)&src_ptr[i * src_pitch + 6]);
+
+ const __m256i r0 =
+ _mm256_inserti128_si256(row0_0, _mm256_castsi256_si128(row0_1), 1);
+
+ s[0] = _mm256_shuffle_epi8(r0, ip_mask_f2f3);
+ s[1] = _mm256_shuffle_epi8(r0, ip_mask_f4f5);
+
+ __m256i res = convolve_4tap(s, ff);
+ res =
+ _mm256_srai_epi32(_mm256_add_epi32(res, rounding), CONV8_ROUNDING_BITS);
+
+ res = _mm256_packs_epi32(res, res);
+ res = _mm256_min_epi16(res, clip_pixel);
+ res = _mm256_max_epi16(res, zero);
+
+ _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
+ _mm256_castsi256_si128(res));
+ _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch + 4],
+ _mm256_extracti128_si256(res, 1));
+ }
+}
+
+static void aom_highbd_filter_block1d16_h4_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ aom_highbd_filter_block1d8_h4_avx2(src_ptr, src_pitch, dst_ptr, dst_pitch,
+ height, filter, bd);
+ aom_highbd_filter_block1d8_h4_avx2(src_ptr + 8, src_pitch, dst_ptr + 8,
+ dst_pitch, height, filter, bd);
+}
+
+// -----------------------------------------------------------------------------
+// 2-tap horizontal filtering
+
+static INLINE void pack_2t_filter(const int16_t *filter, __m256i *f) {
+ const __m128i h = _mm_loadu_si128((const __m128i *)filter);
+ const __m256i hh = _mm256_insertf128_si256(_mm256_castsi128_si256(h), h, 1);
+ const __m256i p = _mm256_set1_epi32(0x09080706);
+ f[0] = _mm256_shuffle_epi8(hh, p);
+}
+
+// Can be used by pack_8x2_2t_pixels() and pack_16x1_2t_pixels(); the
+// difference is whether s0/s1 hold the first and second rows, or the first
+// 16 samples and the 16 samples starting 8 samples later.
+static INLINE void pack_16_2t_pixels(const __m256i *s0, const __m256i *s1,
+ __m256i *sig) {
+ const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
+ const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
+ __m256i x0 = _mm256_shuffle_epi8(*s0, sf2);
+ __m256i x1 = _mm256_shuffle_epi8(*s1, sf2);
+ __m256i r0 = _mm256_permutevar8x32_epi32(*s0, idx);
+ __m256i r1 = _mm256_permutevar8x32_epi32(*s1, idx);
+ r0 = _mm256_shuffle_epi8(r0, sf2);
+ r1 = _mm256_shuffle_epi8(r1, sf2);
+ sig[0] = _mm256_permute2x128_si256(x0, x1, 0x20);
+ sig[1] = _mm256_permute2x128_si256(r0, r1, 0x20);
+}
+
+static INLINE void pack_8x2_2t_pixels(const uint16_t *src,
+ const ptrdiff_t pitch, __m256i *sig) {
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
+ pack_16_2t_pixels(&r0, &r1, sig);
+}
+
+static INLINE void pack_16x1_2t_pixels(const uint16_t *src,
+ __m256i *sig /*sig[2]*/) {
+ const __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i r1 = _mm256_loadu_si256((const __m256i *)(src + 8));
+ pack_16_2t_pixels(&r0, &r1, sig);
+}
+
+static INLINE void pack_8x1_2t_pixels(const uint16_t *src,
+ __m256i *sig /*sig[2]*/) {
+ const __m256i idx = _mm256_loadu_si256((const __m256i *)signal_index);
+ const __m256i sf2 = _mm256_loadu_si256((const __m256i *)signal_pattern_2);
+ __m256i r0 = _mm256_loadu_si256((const __m256i *)src);
+ __m256i x0 = _mm256_shuffle_epi8(r0, sf2);
+ r0 = _mm256_permutevar8x32_epi32(r0, idx);
+ r0 = _mm256_shuffle_epi8(r0, sf2);
+ sig[0] = _mm256_permute2x128_si256(x0, r0, 0x20);
+}
+
+// can be used by filter_8x2_2t_pixels() and filter_16x1_2t_pixels()
+static INLINE void filter_16_2t_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0, __m256i *y1) {
+ const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ __m256i x0 = _mm256_madd_epi16(sig[0], *f);
+ __m256i x1 = _mm256_madd_epi16(sig[1], *f);
+ x0 = _mm256_add_epi32(x0, rounding);
+ x1 = _mm256_add_epi32(x1, rounding);
+ *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
+ *y1 = _mm256_srai_epi32(x1, CONV8_ROUNDING_BITS);
+}
+
+static INLINE void filter_8x1_2t_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0) {
+ const __m256i rounding = _mm256_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ __m256i x0 = _mm256_madd_epi16(sig[0], *f);
+ x0 = _mm256_add_epi32(x0, rounding);
+ *y0 = _mm256_srai_epi32(x0, CONV8_ROUNDING_BITS);
+}
+
+static void aom_highbd_filter_block1d8_h2_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[2], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff;
+ pack_2t_filter(filter, &ff);
+
+ src_ptr -= 3;
+ do {
+ pack_8x2_2t_pixels(src_ptr, src_pitch, signal);
+ filter_16_2t_pixels(signal, &ff, &res0, &res1);
+ store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ height -= 2;
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ } while (height > 1);
+
+ if (height > 0) {
+ pack_8x1_2t_pixels(src_ptr, signal);
+ filter_8x1_2t_pixels(signal, &ff, &res0);
+ store_8x1_pixels(&res0, &max, dst_ptr);
+ }
+}
+
+static void aom_highbd_filter_block1d16_h2_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[2], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff;
+ pack_2t_filter(filter, &ff);
+
+ src_ptr -= 3;
+ do {
+ pack_16x1_2t_pixels(src_ptr, signal);
+ filter_16_2t_pixels(signal, &ff, &res0, &res1);
+ store_16x1_pixels(&res0, &res1, &max, dst_ptr);
+ height -= 1;
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ } while (height > 0);
+}
+
+// -----------------------------------------------------------------------------
+// Vertical Filtering
+
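+// The v8 kernels interleave nine input rows so that the low 128-bit lanes
+// hold the sample pairs for one output row and the high lanes hold the pairs
+// for the following row, producing two output rows per loop iteration.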
+static void pack_8x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
+ __m256i s0 = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)src));
+ __m256i s1 =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src + pitch)));
+ __m256i s2 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 2 * pitch)));
+ __m256i s3 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 3 * pitch)));
+ __m256i s4 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 4 * pitch)));
+ __m256i s5 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 5 * pitch)));
+ __m256i s6 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 6 * pitch)));
+
+ s0 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
+ s1 = _mm256_inserti128_si256(s1, _mm256_castsi256_si128(s2), 1);
+ s2 = _mm256_inserti128_si256(s2, _mm256_castsi256_si128(s3), 1);
+ s3 = _mm256_inserti128_si256(s3, _mm256_castsi256_si128(s4), 1);
+ s4 = _mm256_inserti128_si256(s4, _mm256_castsi256_si128(s5), 1);
+ s5 = _mm256_inserti128_si256(s5, _mm256_castsi256_si128(s6), 1);
+
+ sig[0] = _mm256_unpacklo_epi16(s0, s1);
+ sig[4] = _mm256_unpackhi_epi16(s0, s1);
+ sig[1] = _mm256_unpacklo_epi16(s2, s3);
+ sig[5] = _mm256_unpackhi_epi16(s2, s3);
+ sig[2] = _mm256_unpacklo_epi16(s4, s5);
+ sig[6] = _mm256_unpackhi_epi16(s4, s5);
+ sig[8] = s6;
+}
+
+static INLINE void pack_8x9_pixels(const uint16_t *src, ptrdiff_t pitch,
+ __m256i *sig) {
+ // base + 7th row
+ __m256i s0 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 7 * pitch)));
+ // base + 8th row
+ __m256i s1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(src + 8 * pitch)));
+ __m256i s2 = _mm256_inserti128_si256(sig[8], _mm256_castsi256_si128(s0), 1);
+ __m256i s3 = _mm256_inserti128_si256(s0, _mm256_castsi256_si128(s1), 1);
+ sig[3] = _mm256_unpacklo_epi16(s2, s3);
+ sig[7] = _mm256_unpackhi_epi16(s2, s3);
+ sig[8] = s1;
+}
+
+static INLINE void filter_8x9_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0, __m256i *y1) {
+ filter_8x1_pixels(sig, f, y0);
+ filter_8x1_pixels(&sig[4], f, y1);
+}
+
+static INLINE void update_pixels(__m256i *sig) {
+ int i;
+ for (i = 0; i < 3; ++i) {
+ sig[i] = sig[i + 1];
+ sig[i + 4] = sig[i + 5];
+ }
+}
+
+static void aom_highbd_filter_block1d8_v8_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[9], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ pack_8x9_init(src_ptr, src_pitch, signal);
+
+ do {
+ pack_8x9_pixels(src_ptr, src_pitch, signal);
+
+ filter_8x9_pixels(signal, ff, &res0, &res1);
+ store_8x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ update_pixels(signal);
+
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ height -= 2;
+ } while (height > 0);
+}
+
+static void pack_16x9_init(const uint16_t *src, ptrdiff_t pitch, __m256i *sig) {
+ __m256i u0, u1, u2, u3;
+  // load rows 0-6
+ const __m256i s0 = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src + pitch));
+ const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src + 2 * pitch));
+ const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src + 3 * pitch));
+ const __m256i s4 = _mm256_loadu_si256((const __m256i *)(src + 4 * pitch));
+ const __m256i s5 = _mm256_loadu_si256((const __m256i *)(src + 5 * pitch));
+ const __m256i s6 = _mm256_loadu_si256((const __m256i *)(src + 6 * pitch));
+
+ u0 = _mm256_permute2x128_si256(s0, s1, 0x20); // 0, 1 low
+ u1 = _mm256_permute2x128_si256(s0, s1, 0x31); // 0, 1 high
+
+ u2 = _mm256_permute2x128_si256(s1, s2, 0x20); // 1, 2 low
+ u3 = _mm256_permute2x128_si256(s1, s2, 0x31); // 1, 2 high
+
+ sig[0] = _mm256_unpacklo_epi16(u0, u2);
+ sig[4] = _mm256_unpackhi_epi16(u0, u2);
+
+ sig[8] = _mm256_unpacklo_epi16(u1, u3);
+ sig[12] = _mm256_unpackhi_epi16(u1, u3);
+
+ u0 = _mm256_permute2x128_si256(s2, s3, 0x20);
+ u1 = _mm256_permute2x128_si256(s2, s3, 0x31);
+
+ u2 = _mm256_permute2x128_si256(s3, s4, 0x20);
+ u3 = _mm256_permute2x128_si256(s3, s4, 0x31);
+
+ sig[1] = _mm256_unpacklo_epi16(u0, u2);
+ sig[5] = _mm256_unpackhi_epi16(u0, u2);
+
+ sig[9] = _mm256_unpacklo_epi16(u1, u3);
+ sig[13] = _mm256_unpackhi_epi16(u1, u3);
+
+ u0 = _mm256_permute2x128_si256(s4, s5, 0x20);
+ u1 = _mm256_permute2x128_si256(s4, s5, 0x31);
+
+ u2 = _mm256_permute2x128_si256(s5, s6, 0x20);
+ u3 = _mm256_permute2x128_si256(s5, s6, 0x31);
+
+ sig[2] = _mm256_unpacklo_epi16(u0, u2);
+ sig[6] = _mm256_unpackhi_epi16(u0, u2);
+
+ sig[10] = _mm256_unpacklo_epi16(u1, u3);
+ sig[14] = _mm256_unpackhi_epi16(u1, u3);
+
+ sig[16] = s6;
+}
+
+static void pack_16x9_pixels(const uint16_t *src, ptrdiff_t pitch,
+ __m256i *sig) {
+ // base + 7th row
+ const __m256i s7 = _mm256_loadu_si256((const __m256i *)(src + 7 * pitch));
+ // base + 8th row
+ const __m256i s8 = _mm256_loadu_si256((const __m256i *)(src + 8 * pitch));
+
+ __m256i u0, u1, u2, u3;
+ u0 = _mm256_permute2x128_si256(sig[16], s7, 0x20);
+ u1 = _mm256_permute2x128_si256(sig[16], s7, 0x31);
+
+ u2 = _mm256_permute2x128_si256(s7, s8, 0x20);
+ u3 = _mm256_permute2x128_si256(s7, s8, 0x31);
+
+ sig[3] = _mm256_unpacklo_epi16(u0, u2);
+ sig[7] = _mm256_unpackhi_epi16(u0, u2);
+
+ sig[11] = _mm256_unpacklo_epi16(u1, u3);
+ sig[15] = _mm256_unpackhi_epi16(u1, u3);
+
+ sig[16] = s8;
+}
+
+static INLINE void filter_16x9_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0, __m256i *y1) {
+ __m256i res[4];
+ int i;
+ for (i = 0; i < 4; ++i) {
+ filter_8x1_pixels(&sig[i << 2], f, &res[i]);
+ }
+
+ {
+ const __m256i l0l1 = _mm256_packus_epi32(res[0], res[1]);
+ const __m256i h0h1 = _mm256_packus_epi32(res[2], res[3]);
+ *y0 = _mm256_permute2x128_si256(l0l1, h0h1, 0x20);
+ *y1 = _mm256_permute2x128_si256(l0l1, h0h1, 0x31);
+ }
+}
+
+static INLINE void store_16x2_pixels(const __m256i *y0, const __m256i *y1,
+ const __m256i *mask, uint16_t *dst,
+ ptrdiff_t pitch) {
+ __m256i p = _mm256_min_epi16(*y0, *mask);
+ _mm256_storeu_si256((__m256i *)dst, p);
+ p = _mm256_min_epi16(*y1, *mask);
+ _mm256_storeu_si256((__m256i *)(dst + pitch), p);
+}
+
+static void update_16x9_pixels(__m256i *sig) {
+ update_pixels(&sig[0]);
+ update_pixels(&sig[8]);
+}
+
+static void aom_highbd_filter_block1d16_v8_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[17], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+
+ __m256i ff[4];
+ pack_filters(filter, ff);
+
+ pack_16x9_init(src_ptr, src_pitch, signal);
+
+ do {
+ pack_16x9_pixels(src_ptr, src_pitch, signal);
+ filter_16x9_pixels(signal, ff, &res0, &res1);
+ store_16x2_pixels(&res0, &res1, &max, dst_ptr, dst_pitch);
+ update_16x9_pixels(signal);
+
+ src_ptr += src_pitch << 1;
+ dst_ptr += dst_pitch << 1;
+ height -= 2;
+ } while (height > 0);
+}
+
+static void aom_highbd_filter_block1d4_v4_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ const int bits = FILTER_BITS;
+
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
+ const __m256i clip_pixel =
+ _mm256_set1_epi32(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m256i zero = _mm256_setzero_si256();
+ uint32_t i;
+ __m256i s[2], ff[2];
+
+ pack_filters_4tap(filter, ff);
+
+ const uint16_t *data = src_ptr;
+ /* Vertical filter */
+ {
+ __m128i s2 = _mm_loadl_epi64((__m128i *)(data + 2 * src_pitch));
+ __m128i s3 = _mm_loadl_epi64((__m128i *)(data + 3 * src_pitch));
+
+ __m256i s23 = _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1);
+
+ __m128i s4 = _mm_loadl_epi64((__m128i *)(data + 4 * src_pitch));
+
+ __m256i s34 = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1);
+
+ s[0] = _mm256_unpacklo_epi16(s23, s34);
+
+ for (i = 0; i < height; i += 2) {
+ data = &src_ptr[i * src_pitch];
+
+ __m128i s5 = _mm_loadl_epi64((__m128i *)(data + 5 * src_pitch));
+ __m128i s6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_pitch));
+
+ __m256i s45 = _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1);
+ __m256i s56 = _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1);
+
+ s[1] = _mm256_unpacklo_epi16(s45, s56);
+
+ const __m256i res_a = convolve_4tap(s, ff);
+
+ __m256i res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);
+
+ __m256i res_16bit = _mm256_min_epi32(res_a_round, clip_pixel);
+ res_16bit = _mm256_max_epi32(res_16bit, zero);
+ res_16bit = _mm256_packs_epi32(res_16bit, res_16bit);
+
+ _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch],
+ _mm256_castsi256_si128(res_16bit));
+ _mm_storel_epi64((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch],
+ _mm256_extracti128_si256(res_16bit, 1));
+
+ s[0] = s[1];
+ s4 = s6;
+ }
+ }
+}
+
+static void aom_highbd_filter_block1d8_v4_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ const int bits = FILTER_BITS;
+
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
+ const __m256i clip_pixel =
+ _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i s[4], ff[2];
+ uint32_t i;
+ pack_filters_4tap(filter, ff);
+
+ const uint16_t *data = src_ptr;
+ /* Vertical filter */
+ {
+ __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_pitch));
+ __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_pitch));
+
+ __m256i s23 = _mm256_inserti128_si256(_mm256_castsi128_si256(s2), s3, 1);
+
+ __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_pitch));
+
+ __m256i s34 = _mm256_inserti128_si256(_mm256_castsi128_si256(s3), s4, 1);
+
+ s[0] = _mm256_unpacklo_epi16(s23, s34);
+ s[2] = _mm256_unpackhi_epi16(s23, s34);
+
+ for (i = 0; i < height; i += 2) {
+ data = &src_ptr[i * src_pitch];
+
+ __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_pitch));
+ __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_pitch));
+
+ __m256i s45 = _mm256_inserti128_si256(_mm256_castsi128_si256(s4), s5, 1);
+ __m256i s56 = _mm256_inserti128_si256(_mm256_castsi128_si256(s5), s6, 1);
+
+ s[1] = _mm256_unpacklo_epi16(s45, s56);
+ s[3] = _mm256_unpackhi_epi16(s45, s56);
+
+ const __m256i res_a = convolve_4tap(s, ff);
+
+ __m256i res_a_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_a, round_const_bits), round_shift_bits);
+
+ const __m256i res_b = convolve_4tap(s + 2, ff);
+ __m256i res_b_round = _mm256_sra_epi32(
+ _mm256_add_epi32(res_b, round_const_bits), round_shift_bits);
+
+ __m256i res_16bit = _mm256_packs_epi32(res_a_round, res_b_round);
+ res_16bit = _mm256_min_epi16(res_16bit, clip_pixel);
+ res_16bit = _mm256_max_epi16(res_16bit, zero);
+
+ _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch],
+ _mm256_castsi256_si128(res_16bit));
+ _mm_storeu_si128((__m128i *)&dst_ptr[i * dst_pitch + dst_pitch],
+ _mm256_extracti128_si256(res_16bit, 1));
+
+ s[0] = s[1];
+ s[2] = s[3];
+ s4 = s6;
+ }
+ }
+}
+
+static void aom_highbd_filter_block1d16_v4_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ aom_highbd_filter_block1d8_v4_avx2(src_ptr, src_pitch, dst_ptr, dst_pitch,
+ height, filter, bd);
+
+ aom_highbd_filter_block1d8_v4_avx2(src_ptr + 8, src_pitch, dst_ptr + 8,
+ dst_pitch, height, filter, bd);
+}
+
+// -----------------------------------------------------------------------------
+// 2-tap vertical filtering
+
+static void pack_16x2_init(const uint16_t *src, __m256i *sig) {
+ sig[2] = _mm256_loadu_si256((const __m256i *)src);
+}
+
+static INLINE void pack_16x2_2t_pixels(const uint16_t *src, ptrdiff_t pitch,
+ __m256i *sig) {
+ // load the next row
+ const __m256i u = _mm256_loadu_si256((const __m256i *)(src + pitch));
+ sig[0] = _mm256_unpacklo_epi16(sig[2], u);
+ sig[1] = _mm256_unpackhi_epi16(sig[2], u);
+ sig[2] = u;
+}
+
+static INLINE void filter_16x2_2t_pixels(const __m256i *sig, const __m256i *f,
+ __m256i *y0, __m256i *y1) {
+ filter_16_2t_pixels(sig, f, y0, y1);
+}
+
+static void aom_highbd_filter_block1d16_v2_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m256i signal[3], res0, res1;
+ const __m256i max = _mm256_set1_epi16((1 << bd) - 1);
+ __m256i ff;
+
+ pack_2t_filter(filter, &ff);
+ pack_16x2_init(src_ptr, signal);
+
+ do {
+ pack_16x2_2t_pixels(src_ptr, src_pitch, signal);
+ filter_16x2_2t_pixels(signal, &ff, &res0, &res1);
+ store_16x1_pixels(&res0, &res1, &max, dst_ptr);
+
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ height -= 1;
+ } while (height > 0);
+}
+
+static INLINE void pack_8x1_2t_filter(const int16_t *filter, __m128i *f) {
+ const __m128i h = _mm_loadu_si128((const __m128i *)filter);
+ const __m128i p = _mm_set1_epi32(0x09080706);
+ f[0] = _mm_shuffle_epi8(h, p);
+}
+
+static void pack_8x2_init(const uint16_t *src, __m128i *sig) {
+ sig[2] = _mm_loadu_si128((const __m128i *)src);
+}
+
+static INLINE void pack_8x2_2t_pixels_ver(const uint16_t *src, ptrdiff_t pitch,
+ __m128i *sig) {
+ // load the next row
+ const __m128i u = _mm_loadu_si128((const __m128i *)(src + pitch));
+ sig[0] = _mm_unpacklo_epi16(sig[2], u);
+ sig[1] = _mm_unpackhi_epi16(sig[2], u);
+ sig[2] = u;
+}
+
+static INLINE void filter_8_2t_pixels(const __m128i *sig, const __m128i *f,
+ __m128i *y0, __m128i *y1) {
+ const __m128i rounding = _mm_set1_epi32(1 << (CONV8_ROUNDING_BITS - 1));
+ __m128i x0 = _mm_madd_epi16(sig[0], *f);
+ __m128i x1 = _mm_madd_epi16(sig[1], *f);
+ x0 = _mm_add_epi32(x0, rounding);
+ x1 = _mm_add_epi32(x1, rounding);
+ *y0 = _mm_srai_epi32(x0, CONV8_ROUNDING_BITS);
+ *y1 = _mm_srai_epi32(x1, CONV8_ROUNDING_BITS);
+}
+
+static INLINE void store_8x1_2t_pixels_ver(const __m128i *y0, const __m128i *y1,
+ const __m128i *mask, uint16_t *dst) {
+ __m128i res = _mm_packus_epi32(*y0, *y1);
+ res = _mm_min_epi16(res, *mask);
+ _mm_storeu_si128((__m128i *)dst, res);
+}
+
+static void aom_highbd_filter_block1d8_v2_avx2(
+ const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) {
+ __m128i signal[3], res0, res1;
+ const __m128i max = _mm_set1_epi16((1 << bd) - 1);
+ __m128i ff;
+
+ pack_8x1_2t_filter(filter, &ff);
+ pack_8x2_init(src_ptr, signal);
+
+ do {
+ pack_8x2_2t_pixels_ver(src_ptr, src_pitch, signal);
+ filter_8_2t_pixels(signal, &ff, &res0, &res1);
+ store_8x1_2t_pixels_ver(&res0, &res1, &max, dst_ptr);
+
+ src_ptr += src_pitch;
+ dst_ptr += dst_pitch;
+ height -= 1;
+ } while (height > 0);
+}
+
+void aom_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
+ ptrdiff_t, uint32_t, const int16_t *,
+ int);
+void aom_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
+ ptrdiff_t, uint32_t, const int16_t *,
+ int);
+void aom_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
+ ptrdiff_t, uint32_t, const int16_t *,
+ int);
+void aom_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *,
+ ptrdiff_t, uint32_t, const int16_t *,
+ int);
+#define aom_highbd_filter_block1d4_h8_avx2 aom_highbd_filter_block1d4_h8_sse2
+#define aom_highbd_filter_block1d4_h2_avx2 aom_highbd_filter_block1d4_h2_sse2
+#define aom_highbd_filter_block1d4_v8_avx2 aom_highbd_filter_block1d4_v8_sse2
+#define aom_highbd_filter_block1d4_v2_avx2 aom_highbd_filter_block1d4_v2_sse2
+
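+// HIGH_FUN_CONV_1D (from aom_dsp/x86/convolve.h) generates the public
+// aom_highbd_convolve8_horiz_avx2() / aom_highbd_convolve8_vert_avx2()
+// wrappers, which dispatch to the block1d kernels defined above.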
+HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2)
+HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2)
+
+#undef HIGHBD_FUNC
diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_sse2.c b/third_party/aom/aom_dsp/x86/highbd_convolve_sse2.c
new file mode 100644
index 0000000000..a2bb283222
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_convolve_sse2.c
@@ -0,0 +1,351 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <emmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/x86/convolve.h"
+
+// -----------------------------------------------------------------------------
+
+void aom_highbd_filter_block1d4_v4_sse2(const uint16_t *src_ptr,
+ ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height,
+ const int16_t *filter, int bd) {
+ __m128i filtersReg;
+ __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
+ __m128i srcReg23_lo, srcReg34_lo;
+ __m128i srcReg45_lo, srcReg56_lo;
+ __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
+ __m128i resReg23_45_lo, resReg34_56_lo;
+ __m128i resReg23_45, resReg34_56;
+ __m128i addFilterReg64, secondFilters, thirdFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ const __m128i max = _mm_set1_epi16((1 << bd) - 1);
+ addFilterReg64 = _mm_set1_epi32(64);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+ secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3
+ thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5
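+  // Only taps 2..5 of the 8-tap filter array are used; the outer taps are
+  // expected to be zero for the 4-tap kernel handled by this function.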
+
+  // double the source and destination strides to advance two rows per loop
+ src_stride = src_pitch << 1;
+ dst_stride = dst_pitch << 1;
+
+ srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+ srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg23_lo = _mm_unpacklo_epi16(srcReg2, srcReg3);
+
+ srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+ srcReg34_lo = _mm_unpacklo_epi16(srcReg3, srcReg4);
+
+ for (i = height; i > 1; i -= 2) {
+ srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+ srcReg45_lo = _mm_unpacklo_epi16(srcReg4, srcReg5);
+
+ srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
+ srcReg56_lo = _mm_unpacklo_epi16(srcReg5, srcReg6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+
+ resReg23_lo = _mm_madd_epi16(srcReg23_lo, secondFilters);
+ resReg34_lo = _mm_madd_epi16(srcReg34_lo, secondFilters);
+ resReg45_lo = _mm_madd_epi16(srcReg45_lo, thirdFilters);
+ resReg56_lo = _mm_madd_epi16(srcReg56_lo, thirdFilters);
+
+ resReg23_45_lo = _mm_add_epi32(resReg23_lo, resReg45_lo);
+ resReg34_56_lo = _mm_add_epi32(resReg34_lo, resReg56_lo);
+
+    // add the rounding offset and shift each 32-bit value right by 7 bits
+ resReg23_45_lo = _mm_add_epi32(resReg23_45_lo, addFilterReg64);
+ resReg34_56_lo = _mm_add_epi32(resReg34_56_lo, addFilterReg64);
+ resReg23_45_lo = _mm_srai_epi32(resReg23_45_lo, 7);
+ resReg34_56_lo = _mm_srai_epi32(resReg34_56_lo, 7);
+
+    // narrow each 32-bit value to 16 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve
+    // result
+ resReg23_45 = _mm_packs_epi32(resReg23_45_lo, _mm_setzero_si128());
+ resReg34_56 = _mm_packs_epi32(resReg34_56_lo, _mm_setzero_si128());
+
+ resReg23_45 = _mm_max_epi16(resReg23_45, _mm_setzero_si128());
+ resReg23_45 = _mm_min_epi16(resReg23_45, max);
+ resReg34_56 = _mm_max_epi16(resReg34_56, _mm_setzero_si128());
+ resReg34_56 = _mm_min_epi16(resReg34_56, max);
+
+ src_ptr += src_stride;
+
+ _mm_storel_epi64((__m128i *)dst_ptr, (resReg23_45));
+ _mm_storel_epi64((__m128i *)(dst_ptr + dst_pitch), (resReg34_56));
+
+ dst_ptr += dst_stride;
+
+    // save part of the registers for the next iteration
+ srcReg23_lo = srcReg45_lo;
+ srcReg34_lo = srcReg56_lo;
+ srcReg4 = srcReg6;
+ }
+}
+
+void aom_highbd_filter_block1d4_h4_sse2(const uint16_t *src_ptr,
+ ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height,
+ const int16_t *filter, int bd) {
+ __m128i filtersReg;
+ __m128i addFilterReg64;
+ __m128i secondFilters, thirdFilters;
+ __m128i srcRegFilt32b1_1;
+ __m128i srcReg32b1;
+ unsigned int i;
+ src_ptr -= 3;
+ addFilterReg64 = _mm_set1_epi32(64);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ const __m128i max = _mm_set1_epi16((1 << bd) - 1);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+ secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3
+ thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5
+
+ for (i = height; i > 0; i -= 1) {
+ srcReg32b1 = _mm_loadu_si128((const __m128i *)(src_ptr + 2));
+
+ __m128i ss_3_1 = _mm_srli_si128(srcReg32b1, 2);
+ __m128i ss_4_1 = _mm_srli_si128(srcReg32b1, 4);
+ __m128i ss_5_1 = _mm_srli_si128(srcReg32b1, 6);
+ __m128i ss_23 = _mm_unpacklo_epi32(srcReg32b1, ss_3_1);
+ __m128i ss_45 = _mm_unpacklo_epi32(ss_4_1, ss_5_1);
+
+ ss_23 = _mm_madd_epi16(ss_23, secondFilters);
+ ss_45 = _mm_madd_epi16(ss_45, thirdFilters);
+ srcRegFilt32b1_1 = _mm_add_epi32(ss_23, ss_45);
+
+    // add the rounding offset and shift each 32-bit value right by 7 bits
+ srcRegFilt32b1_1 = _mm_add_epi32(srcRegFilt32b1_1, addFilterReg64);
+ srcRegFilt32b1_1 = _mm_srai_epi32(srcRegFilt32b1_1, 7);
+
+ srcRegFilt32b1_1 = _mm_packs_epi32(srcRegFilt32b1_1, _mm_setzero_si128());
+ srcRegFilt32b1_1 = _mm_max_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
+ srcRegFilt32b1_1 = _mm_min_epi16(srcRegFilt32b1_1, max);
+
+ src_ptr += src_pitch;
+
+ _mm_storel_epi64((__m128i *)dst_ptr, srcRegFilt32b1_1);
+
+ dst_ptr += dst_pitch;
+ }
+}
+
+void aom_highbd_filter_block1d8_v4_sse2(const uint16_t *src_ptr,
+ ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height,
+ const int16_t *filter, int bd) {
+ __m128i filtersReg;
+ __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
+ __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
+ __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi;
+ __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
+ __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi;
+ __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi;
+ __m128i resReg23_45, resReg34_56;
+ __m128i addFilterReg64, secondFilters, thirdFilters;
+ unsigned int i;
+ ptrdiff_t src_stride, dst_stride;
+
+ const __m128i max = _mm_set1_epi16((1 << bd) - 1);
+ addFilterReg64 = _mm_set1_epi32(64);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+ secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3
+ thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5
+
+  // double the source and destination strides to advance two rows per loop
+ src_stride = src_pitch << 1;
+ dst_stride = dst_pitch << 1;
+
+ srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+ srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg23_lo = _mm_unpacklo_epi16(srcReg2, srcReg3);
+ srcReg23_hi = _mm_unpackhi_epi16(srcReg2, srcReg3);
+
+ srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
+ srcReg34_lo = _mm_unpacklo_epi16(srcReg3, srcReg4);
+ srcReg34_hi = _mm_unpackhi_epi16(srcReg3, srcReg4);
+
+ for (i = height; i > 1; i -= 2) {
+ srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
+
+ srcReg45_lo = _mm_unpacklo_epi16(srcReg4, srcReg5);
+ srcReg45_hi = _mm_unpackhi_epi16(srcReg4, srcReg5);
+
+ srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
+
+ srcReg56_lo = _mm_unpacklo_epi16(srcReg5, srcReg6);
+ srcReg56_hi = _mm_unpackhi_epi16(srcReg5, srcReg6);
+
+ // multiply 2 adjacent elements with the filter and add the result
+
+ resReg23_lo = _mm_madd_epi16(srcReg23_lo, secondFilters);
+ resReg34_lo = _mm_madd_epi16(srcReg34_lo, secondFilters);
+ resReg45_lo = _mm_madd_epi16(srcReg45_lo, thirdFilters);
+ resReg56_lo = _mm_madd_epi16(srcReg56_lo, thirdFilters);
+
+ resReg23_45_lo = _mm_add_epi32(resReg23_lo, resReg45_lo);
+ resReg34_56_lo = _mm_add_epi32(resReg34_lo, resReg56_lo);
+
+ // multiply 2 adjacent elements with the filter and add the result
+
+ resReg23_hi = _mm_madd_epi16(srcReg23_hi, secondFilters);
+ resReg34_hi = _mm_madd_epi16(srcReg34_hi, secondFilters);
+ resReg45_hi = _mm_madd_epi16(srcReg45_hi, thirdFilters);
+ resReg56_hi = _mm_madd_epi16(srcReg56_hi, thirdFilters);
+
+ resReg23_45_hi = _mm_add_epi32(resReg23_hi, resReg45_hi);
+ resReg34_56_hi = _mm_add_epi32(resReg34_hi, resReg56_hi);
+
+    // add the rounding offset and shift each 32-bit value right by 7 bits
+ resReg23_45_lo = _mm_add_epi32(resReg23_45_lo, addFilterReg64);
+ resReg34_56_lo = _mm_add_epi32(resReg34_56_lo, addFilterReg64);
+ resReg23_45_hi = _mm_add_epi32(resReg23_45_hi, addFilterReg64);
+ resReg34_56_hi = _mm_add_epi32(resReg34_56_hi, addFilterReg64);
+ resReg23_45_lo = _mm_srai_epi32(resReg23_45_lo, 7);
+ resReg34_56_lo = _mm_srai_epi32(resReg34_56_lo, 7);
+ resReg23_45_hi = _mm_srai_epi32(resReg23_45_hi, 7);
+ resReg34_56_hi = _mm_srai_epi32(resReg34_56_hi, 7);
+
+    // narrow each 32-bit value to 16 bits; the first lane contains the first
+    // convolve result and the second lane contains the second convolve
+    // result
+ resReg23_45 = _mm_packs_epi32(resReg23_45_lo, resReg23_45_hi);
+ resReg34_56 = _mm_packs_epi32(resReg34_56_lo, resReg34_56_hi);
+
+ resReg23_45 = _mm_max_epi16(resReg23_45, _mm_setzero_si128());
+ resReg23_45 = _mm_min_epi16(resReg23_45, max);
+ resReg34_56 = _mm_max_epi16(resReg34_56, _mm_setzero_si128());
+ resReg34_56 = _mm_min_epi16(resReg34_56, max);
+
+ src_ptr += src_stride;
+
+ _mm_store_si128((__m128i *)dst_ptr, (resReg23_45));
+ _mm_store_si128((__m128i *)(dst_ptr + dst_pitch), (resReg34_56));
+
+ dst_ptr += dst_stride;
+
+    // save part of the registers for the next iteration
+ srcReg23_lo = srcReg45_lo;
+ srcReg23_hi = srcReg45_hi;
+ srcReg34_lo = srcReg56_lo;
+ srcReg34_hi = srcReg56_hi;
+ srcReg4 = srcReg6;
+ }
+}
+
+void aom_highbd_filter_block1d8_h4_sse2(const uint16_t *src_ptr,
+ ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height,
+ const int16_t *filter, int bd) {
+ __m128i filtersReg;
+ __m128i addFilterReg64;
+ __m128i secondFilters, thirdFilters;
+ __m128i srcRegFilt32b1_1, srcRegFilt32b1_2;
+ __m128i srcReg32b1, srcReg32b2;
+ unsigned int i;
+ src_ptr -= 3;
+ addFilterReg64 = _mm_set1_epi32(64);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
+ const __m128i max = _mm_set1_epi16((1 << bd) - 1);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg);
+
+ secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3
+ thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5
+
+ for (i = height; i > 0; i -= 1) {
+ srcReg32b1 = _mm_loadu_si128((const __m128i *)(src_ptr + 2));
+ srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 6));
+
+ __m128i ss_4_1 = _mm_srli_si128(srcReg32b1, 4);
+ __m128i ss_4_2 = _mm_srli_si128(srcReg32b2, 4);
+ __m128i ss_4 = _mm_unpacklo_epi64(ss_4_1, ss_4_2);
+
+ __m128i d1 = _mm_madd_epi16(srcReg32b1, secondFilters);
+ __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters);
+ srcRegFilt32b1_1 = _mm_add_epi32(d1, d2);
+
+ __m128i ss_3_1 = _mm_srli_si128(srcReg32b1, 2);
+ __m128i ss_5_1 = _mm_srli_si128(srcReg32b1, 6);
+ __m128i ss_3_2 = _mm_srli_si128(srcReg32b2, 2);
+ __m128i ss_5_2 = _mm_srli_si128(srcReg32b2, 6);
+ __m128i ss_3 = _mm_unpacklo_epi64(ss_3_1, ss_3_2);
+ __m128i ss_5 = _mm_unpacklo_epi64(ss_5_1, ss_5_2);
+
+ d1 = _mm_madd_epi16(ss_3, secondFilters);
+ d2 = _mm_madd_epi16(ss_5, thirdFilters);
+ srcRegFilt32b1_2 = _mm_add_epi32(d1, d2);
+
+ __m128i res_lo_1 = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
+ __m128i res_hi_1 = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2);
+
+    // add the rounding offset and shift each 32-bit value right by 7 bits
+ res_lo_1 = _mm_add_epi32(res_lo_1, addFilterReg64);
+ res_hi_1 = _mm_add_epi32(res_hi_1, addFilterReg64);
+ res_lo_1 = _mm_srai_epi32(res_lo_1, 7);
+ res_hi_1 = _mm_srai_epi32(res_hi_1, 7);
+
+ srcRegFilt32b1_1 = _mm_packs_epi32(res_lo_1, res_hi_1);
+
+ srcRegFilt32b1_1 = _mm_max_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
+ srcRegFilt32b1_1 = _mm_min_epi16(srcRegFilt32b1_1, max);
+
+ src_ptr += src_pitch;
+
+ _mm_store_si128((__m128i *)dst_ptr, srcRegFilt32b1_1);
+
+ dst_ptr += dst_pitch;
+ }
+}
+
+void aom_highbd_filter_block1d16_v4_sse2(const uint16_t *src_ptr,
+ ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height,
+ const int16_t *filter, int bd) {
+ aom_highbd_filter_block1d8_v4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch,
+ height, filter, bd);
+ aom_highbd_filter_block1d8_v4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8),
+ dst_pitch, height, filter, bd);
+}
+
+void aom_highbd_filter_block1d16_h4_sse2(const uint16_t *src_ptr,
+ ptrdiff_t src_pitch, uint16_t *dst_ptr,
+ ptrdiff_t dst_pitch, uint32_t height,
+ const int16_t *filter, int bd) {
+ aom_highbd_filter_block1d8_h4_sse2(src_ptr, src_pitch, dst_ptr, dst_pitch,
+ height, filter, bd);
+ aom_highbd_filter_block1d8_h4_sse2((src_ptr + 8), src_pitch, (dst_ptr + 8),
+ dst_pitch, height, filter, bd);
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c
new file mode 100644
index 0000000000..31c3c31b3c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_convolve_ssse3.c
@@ -0,0 +1,439 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+#include <assert.h>
+
+#include "config/av1_rtcd.h"
+
+#include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
+
+void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_y_qn, int bd) {
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_vert * src_stride;
+ const int bits = FILTER_BITS;
+
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i clip_pixel =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m128i zero = _mm_setzero_si128();
+ if (filter_params_y->taps == 12) {
+ __m128i s[24], coeffs_y[6];
+
+ prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs_y);
+
+ for (j = 0; j < w; j += 8) {
+ const uint16_t *data = &src_ptr[j];
+ /* Vertical filter */
+ __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+ __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+ __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+ __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+ __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+ __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+ __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
+ __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
+ __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));
+ __m128i s9 = _mm_loadu_si128((__m128i *)(data + 9 * src_stride));
+ __m128i s10 = _mm_loadu_si128((__m128i *)(data + 10 * src_stride));
+
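+      // s[0..11] (lo/hi lane halves) feed output row i; s[12..23] hold the
+      // row pairs shifted down by one and feed output row i + 1.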
+ s[0] = _mm_unpacklo_epi16(s0, s1);
+ s[1] = _mm_unpacklo_epi16(s2, s3);
+ s[2] = _mm_unpacklo_epi16(s4, s5);
+ s[3] = _mm_unpacklo_epi16(s6, s7);
+ s[4] = _mm_unpacklo_epi16(s8, s9);
+
+ s[6] = _mm_unpackhi_epi16(s0, s1);
+ s[7] = _mm_unpackhi_epi16(s2, s3);
+ s[8] = _mm_unpackhi_epi16(s4, s5);
+ s[9] = _mm_unpackhi_epi16(s6, s7);
+ s[10] = _mm_unpackhi_epi16(s8, s9);
+
+ s[12] = _mm_unpacklo_epi16(s1, s2);
+ s[13] = _mm_unpacklo_epi16(s3, s4);
+ s[14] = _mm_unpacklo_epi16(s5, s6);
+ s[15] = _mm_unpacklo_epi16(s7, s8);
+ s[16] = _mm_unpacklo_epi16(s9, s10);
+
+ s[18] = _mm_unpackhi_epi16(s1, s2);
+ s[19] = _mm_unpackhi_epi16(s3, s4);
+ s[20] = _mm_unpackhi_epi16(s5, s6);
+ s[21] = _mm_unpackhi_epi16(s7, s8);
+ s[22] = _mm_unpackhi_epi16(s9, s10);
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+
+ __m128i s11 = _mm_loadu_si128((__m128i *)(data + 11 * src_stride));
+ __m128i s12 = _mm_loadu_si128((__m128i *)(data + 12 * src_stride));
+
+ s[5] = _mm_unpacklo_epi16(s10, s11);
+ s[11] = _mm_unpackhi_epi16(s10, s11);
+
+ s[17] = _mm_unpacklo_epi16(s11, s12);
+ s[23] = _mm_unpackhi_epi16(s11, s12);
+
+ const __m128i res_a0 = convolve_12tap(s, coeffs_y);
+ __m128i res_a_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_a0, round_const_bits), round_shift_bits);
+
+ const __m128i res_a1 = convolve_12tap(s + 12, coeffs_y);
+ __m128i res_a_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_a1, round_const_bits), round_shift_bits);
+
+ if (w - j > 4) {
+ const __m128i res_b0 = convolve_12tap(s + 6, coeffs_y);
+ __m128i res_b_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_b0, round_const_bits), round_shift_bits);
+
+ const __m128i res_b1 = convolve_12tap(s + 18, coeffs_y);
+ __m128i res_b_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_b1, round_const_bits), round_shift_bits);
+
+ __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
+ res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
+ res_16bit0 = _mm_max_epi16(res_16bit0, zero);
+
+ __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
+ res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
+ res_16bit1 = _mm_max_epi16(res_16bit1, zero);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_16bit1);
+ } else if (w == 4) {
+ res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+ res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+ res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+ res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+ res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+ res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_a_round1);
+ } else {
+ res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+ res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+ res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+ res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+ res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+ res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+ *((int *)(&dst[i * dst_stride + j])) =
+ _mm_cvtsi128_si32(res_a_round0);
+
+ *((int *)(&dst[i * dst_stride + j + dst_stride])) =
+ _mm_cvtsi128_si32(res_a_round1);
+ }
+
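+        // Slide the unpacked-row window down by two source rows for the next
+        // pair of output rows.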
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+ s[3] = s[4];
+ s[4] = s[5];
+
+ s[6] = s[7];
+ s[7] = s[8];
+ s[8] = s[9];
+ s[9] = s[10];
+ s[10] = s[11];
+
+ s[12] = s[13];
+ s[13] = s[14];
+ s[14] = s[15];
+ s[15] = s[16];
+ s[16] = s[17];
+
+ s[18] = s[19];
+ s[19] = s[20];
+ s[20] = s[21];
+ s[21] = s[22];
+ s[22] = s[23];
+
+ s10 = s12;
+ }
+ }
+ } else {
+ __m128i s[16], coeffs_y[4];
+
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_y);
+
+ for (j = 0; j < w; j += 8) {
+ const uint16_t *data = &src_ptr[j];
+ /* Vertical filter */
+ {
+ __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+ __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+ __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+ __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+ __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+ __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+ __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
+
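+        // s[0..7] (lo/hi lane halves) feed output row i; s[8..15] feed
+        // output row i + 1.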
+ s[0] = _mm_unpacklo_epi16(s0, s1);
+ s[1] = _mm_unpacklo_epi16(s2, s3);
+ s[2] = _mm_unpacklo_epi16(s4, s5);
+
+ s[4] = _mm_unpackhi_epi16(s0, s1);
+ s[5] = _mm_unpackhi_epi16(s2, s3);
+ s[6] = _mm_unpackhi_epi16(s4, s5);
+
+ s[0 + 8] = _mm_unpacklo_epi16(s1, s2);
+ s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
+ s[2 + 8] = _mm_unpacklo_epi16(s5, s6);
+
+ s[4 + 8] = _mm_unpackhi_epi16(s1, s2);
+ s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
+ s[6 + 8] = _mm_unpackhi_epi16(s5, s6);
+
+ for (i = 0; i < h; i += 2) {
+ data = &src_ptr[i * src_stride + j];
+
+ __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
+ __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));
+
+ s[3] = _mm_unpacklo_epi16(s6, s7);
+ s[7] = _mm_unpackhi_epi16(s6, s7);
+
+ s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
+ s[7 + 8] = _mm_unpackhi_epi16(s7, s8);
+
+ const __m128i res_a0 = convolve(s, coeffs_y);
+ __m128i res_a_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_a0, round_const_bits), round_shift_bits);
+
+ const __m128i res_a1 = convolve(s + 8, coeffs_y);
+ __m128i res_a_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_a1, round_const_bits), round_shift_bits);
+
+ if (w - j > 4) {
+ const __m128i res_b0 = convolve(s + 4, coeffs_y);
+ __m128i res_b_round0 = _mm_sra_epi32(
+ _mm_add_epi32(res_b0, round_const_bits), round_shift_bits);
+
+ const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y);
+ __m128i res_b_round1 = _mm_sra_epi32(
+ _mm_add_epi32(res_b1, round_const_bits), round_shift_bits);
+
+ __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
+ res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
+ res_16bit0 = _mm_max_epi16(res_16bit0, zero);
+
+ __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
+ res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
+ res_16bit1 = _mm_max_epi16(res_16bit1, zero);
+
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_16bit1);
+ } else if (w == 4) {
+ res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+ res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+ res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+ res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+ res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+ res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+ res_a_round1);
+ } else {
+ res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+ res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+ res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+ res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+ res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+ res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+ *((int *)(&dst[i * dst_stride + j])) =
+ _mm_cvtsi128_si32(res_a_round0);
+
+ *((int *)(&dst[i * dst_stride + j + dst_stride])) =
+ _mm_cvtsi128_si32(res_a_round1);
+ }
+
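+          // Slide the unpacked-row window down by two source rows for the
+          // next pair of output rows.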
+ s[0] = s[1];
+ s[1] = s[2];
+ s[2] = s[3];
+
+ s[4] = s[5];
+ s[5] = s[6];
+ s[6] = s[7];
+
+ s[0 + 8] = s[1 + 8];
+ s[1 + 8] = s[2 + 8];
+ s[2 + 8] = s[3 + 8];
+
+ s[4 + 8] = s[5 + 8];
+ s[5 + 8] = s[6 + 8];
+ s[6 + 8] = s[7 + 8];
+
+ s6 = s8;
+ }
+ }
+ }
+ }
+}
+
+void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride,
+ uint16_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const int subpel_x_qn,
+ ConvolveParams *conv_params, int bd) {
+ int i, j;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint16_t *const src_ptr = src - fo_horiz;
+
+ // Check that, even with 12-bit input, the intermediate values will fit
+ // into an unsigned 16-bit intermediate array.
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+ const __m128i round_const_x =
+ _mm_set1_epi32(((1 << conv_params->round_0) >> 1));
+ const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+ const int bits = FILTER_BITS - conv_params->round_0;
+
+ const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+ const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i clip_pixel =
+ _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+ const __m128i zero = _mm_setzero_si128();
+
+ if (filter_params_x->taps == 12) {
+ __m128i s[6], coeffs_x[6];
+
+ prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs_x);
+
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ {
+ for (i = 0; i < h; i += 1) {
+ const __m128i row00 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ const __m128i row01 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
+ const __m128i row02 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 16)]);
+
+ // even pixels
+ s[0] = _mm_alignr_epi8(row01, row00, 0);
+ s[1] = _mm_alignr_epi8(row01, row00, 4);
+ s[2] = _mm_alignr_epi8(row01, row00, 8);
+ s[3] = _mm_alignr_epi8(row01, row00, 12);
+ s[4] = _mm_alignr_epi8(row02, row01, 0);
+ s[5] = _mm_alignr_epi8(row02, row01, 4);
+
+ __m128i res_even = convolve_12tap(s, coeffs_x);
+ res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
+ round_shift_x);
+ res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits),
+ round_shift_bits);
+
+ // odd pixels
+ s[0] = _mm_alignr_epi8(row01, row00, 2);
+ s[1] = _mm_alignr_epi8(row01, row00, 6);
+ s[2] = _mm_alignr_epi8(row01, row00, 10);
+ s[3] = _mm_alignr_epi8(row01, row00, 14);
+ s[4] = _mm_alignr_epi8(row02, row01, 2);
+ s[5] = _mm_alignr_epi8(row02, row01, 6);
+
+ __m128i res_odd = convolve_12tap(s, coeffs_x);
+ res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x),
+ round_shift_x);
+ res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits),
+ round_shift_bits);
+
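+          // Re-interleave the even and odd outputs back into pixel order.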
+ __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
+ __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
+ __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);
+
+ res = _mm_min_epi16(res, clip_pixel);
+ res = _mm_max_epi16(res, zero);
+
+ if (w - j > 4) {
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+ } else if (w == 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res);
+ } else {
+ *((int *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res);
+ }
+ }
+ }
+ }
+ } else {
+ __m128i s[4], coeffs_x[4];
+ prepare_coeffs(filter_params_x, subpel_x_qn, coeffs_x);
+
+ for (j = 0; j < w; j += 8) {
+ /* Horizontal filter */
+ {
+ for (i = 0; i < h; i += 1) {
+ const __m128i row00 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ const __m128i row01 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
+
+ // even pixels
+ s[0] = _mm_alignr_epi8(row01, row00, 0);
+ s[1] = _mm_alignr_epi8(row01, row00, 4);
+ s[2] = _mm_alignr_epi8(row01, row00, 8);
+ s[3] = _mm_alignr_epi8(row01, row00, 12);
+
+ __m128i res_even = convolve(s, coeffs_x);
+ res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
+ round_shift_x);
+
+ // odd pixels
+ s[0] = _mm_alignr_epi8(row01, row00, 2);
+ s[1] = _mm_alignr_epi8(row01, row00, 6);
+ s[2] = _mm_alignr_epi8(row01, row00, 10);
+ s[3] = _mm_alignr_epi8(row01, row00, 14);
+
+ __m128i res_odd = convolve(s, coeffs_x);
+ res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x),
+ round_shift_x);
+
+ res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits),
+ round_shift_bits);
+ res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits),
+ round_shift_bits);
+
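+          // Re-interleave the even and odd outputs back into pixel order.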
+ __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
+ __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
+ __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);
+
+ res = _mm_min_epi16(res, clip_pixel);
+ res = _mm_max_epi16(res, zero);
+
+ if (w - j > 4) {
+ _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+ } else if (w == 4) {
+ _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res);
+ } else {
+ *((int *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res);
+ }
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_asm_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_intrapred_asm_sse2.asm
new file mode 100644
index 0000000000..91b3d126ca
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_asm_sse2.asm
@@ -0,0 +1,259 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_4: times 8 dw 4
+pw_8: times 8 dw 8
+pw_16: times 4 dd 16
+pw_32: times 4 dd 32
+
+SECTION .text
+INIT_XMM sse2
+cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ movq m0, [aboveq]
+ movq m2, [leftq]
+ paddw m0, m2
+ pshuflw m1, m0, 0xe
+ paddw m0, m1
+ pshuflw m1, m0, 0x1
+ paddw m0, m1
+ paddw m0, [GLOBAL(pw_4)]
+ psraw m0, 3
+ pshuflw m0, m0, 0x0
+ movq [dstq ], m0
+ movq [dstq+strideq*2], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq*2], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m2, [leftq]
+ DEFINE_ARGS dst, stride, stride3, one
+ mov oned, 0x00010001
+ lea stride3q, [strideq*3]
+ movd m3, oned
+ pshufd m3, m3, 0x0
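+  ; m3 holds 1 in every word; pmaddwd with ones sums adjacent word pairs, so
+  ; three rounds reduce the 8 word sums to a single total.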
+ paddw m0, m2
+ pmaddwd m0, m3
+ packssdw m0, m1
+ pmaddwd m0, m3
+ packssdw m0, m1
+ pmaddwd m0, m3
+ paddw m0, [GLOBAL(pw_8)]
+ psrlw m0, 4
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ mova [dstq ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+stride3q*2], m0
+ lea dstq, [dstq+strideq*8]
+ mova [dstq ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+stride3q*2], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m3, [aboveq+16]
+ mova m2, [leftq]
+ mova m4, [leftq+16]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ paddw m0, m2
+ paddw m0, m3
+ paddw m0, m4
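+  ; Horizontal reduction: fold the high half onto the low half, widen words
+  ; to dwords, then fold twice more; the total lands in the low dword.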
+ movhlps m2, m0
+ paddw m0, m2
+ punpcklwd m0, m1
+ movhlps m2, m0
+ paddd m0, m2
+ punpckldq m0, m1
+ movhlps m2, m0
+ paddd m0, m2
+ paddd m0, [GLOBAL(pw_16)]
+ psrad m0, 5
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2 +16], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+strideq*4 +16], m0
+ mova [dstq+stride3q*2 ], m0
+ mova [dstq+stride3q*2+16], m0
+ lea dstq, [dstq+strideq*8]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal highbd_dc_predictor_32x32, 4, 5, 7, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ mova m0, [aboveq]
+ mova m2, [aboveq+16]
+ mova m3, [aboveq+32]
+ mova m4, [aboveq+48]
+ paddw m0, m2
+ paddw m3, m4
+ mova m2, [leftq]
+ mova m4, [leftq+16]
+ mova m5, [leftq+32]
+ mova m6, [leftq+48]
+ paddw m2, m4
+ paddw m5, m6
+ paddw m0, m3
+ paddw m2, m5
+ pxor m1, m1
+ paddw m0, m2
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
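+  ; Same horizontal folding reduction as in the 16x16 predictor above.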
+ movhlps m2, m0
+ paddw m0, m2
+ punpcklwd m0, m1
+ movhlps m2, m0
+ paddd m0, m2
+ punpckldq m0, m1
+ movhlps m2, m0
+ paddd m0, m2
+ paddd m0, [GLOBAL(pw_32)]
+ psrad m0, 6
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16 ], m0
+ mova [dstq +32 ], m0
+ mova [dstq +48 ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16 ], m0
+ mova [dstq+strideq*2+32 ], m0
+ mova [dstq+strideq*2+48 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+strideq*4+16 ], m0
+ mova [dstq+strideq*4+32 ], m0
+ mova [dstq+strideq*4+48 ], m0
+ mova [dstq+stride3q*2 ], m0
+ mova [dstq+stride3q*2 +16], m0
+ mova [dstq+stride3q*2 +32], m0
+ mova [dstq+stride3q*2 +48], m0
+ lea dstq, [dstq+strideq*8]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above
+ movq m0, [aboveq]
+ movq [dstq ], m0
+ movq [dstq+strideq*2], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq*2], m0
+ RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above
+ mova m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ mova [dstq ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+stride3q*2], m0
+ lea dstq, [dstq+strideq*8]
+ mova [dstq ], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+stride3q*2], m0
+ RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above
+ mova m0, [aboveq]
+ mova m1, [aboveq+16]
+ DEFINE_ARGS dst, stride, stride3, nlines4
+ lea stride3q, [strideq*3]
+ mov nlines4d, 4
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m1
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2 +16], m1
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+strideq*4 +16], m1
+ mova [dstq+stride3q*2 ], m0
+ mova [dstq+stride3q*2+16], m1
+ lea dstq, [dstq+strideq*8]
+ dec nlines4d
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above
+ mova m0, [aboveq]
+ mova m1, [aboveq+16]
+ mova m2, [aboveq+32]
+ mova m3, [aboveq+48]
+ DEFINE_ARGS dst, stride, stride3, nlines4
+ lea stride3q, [strideq*3]
+ mov nlines4d, 8
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m1
+ mova [dstq +32], m2
+ mova [dstq +48], m3
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2 +16], m1
+ mova [dstq+strideq*2 +32], m2
+ mova [dstq+strideq*2 +48], m3
+ mova [dstq+strideq*4 ], m0
+ mova [dstq+strideq*4 +16], m1
+ mova [dstq+strideq*4 +32], m2
+ mova [dstq+strideq*4 +48], m3
+ mova [dstq+stride3q*2 ], m0
+ mova [dstq+stride3q*2 +16], m1
+ mova [dstq+stride3q*2 +32], m2
+ mova [dstq+stride3q*2 +48], m3
+ lea dstq, [dstq+strideq*8]
+ dec nlines4d
+ jnz .loop
+ REP_RET
diff --git a/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
new file mode 100644
index 0000000000..6a2e915ed7
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_intrapred_sse2.c
@@ -0,0 +1,984 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+// -----------------------------------------------------------------------------
+// H_PRED
+
+void aom_highbd_h_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i left_u16 = _mm_loadl_epi64((const __m128i *)left);
+ const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+ const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+ const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+ const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+ (void)above;
+ (void)bd;
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+}
+
+void aom_highbd_h_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd);
+ dst += stride << 2;
+ left += 4;
+ aom_highbd_h_predictor_4x4_sse2(dst, stride, above, left, bd);
+}
+
+void aom_highbd_h_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
+ const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+ const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+ const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+ const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+ (void)above;
+ (void)bd;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
+}
+
+void aom_highbd_h_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
+ const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+ const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+ const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+ const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+ const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
+ const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
+ const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
+ const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
+ (void)above;
+ (void)bd;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row0, row0));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row1, row1));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row2, row2));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpacklo_epi64(row3, row3));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row4, row4));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row5, row5));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row6, row6));
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, _mm_unpackhi_epi64(row7, row7));
+}
+
+void aom_highbd_h_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd);
+ dst += stride << 3;
+ left += 8;
+ aom_highbd_h_predictor_8x8_sse2(dst, stride, above, left, bd);
+}
+
+static INLINE void h_store_16_unpacklo(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *row) {
+ const __m128i val = _mm_unpacklo_epi64(*row, *row);
+ _mm_store_si128((__m128i *)*dst, val);
+ _mm_store_si128((__m128i *)(*dst + 8), val);
+ *dst += stride;
+}
+
+static INLINE void h_store_16_unpackhi(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *row) {
+ const __m128i val = _mm_unpackhi_epi64(*row, *row);
+ _mm_store_si128((__m128i *)(*dst), val);
+ _mm_store_si128((__m128i *)(*dst + 8), val);
+ *dst += stride;
+}
+
+static INLINE void h_predictor_16x8(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left) {
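+  // Each of the 8 left-column samples is broadcast across one 16-pixel
+  // output row.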
+ const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
+ const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+ const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+ const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+ const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+ const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
+ const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
+ const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
+ const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
+ h_store_16_unpacklo(&dst, stride, &row0);
+ h_store_16_unpacklo(&dst, stride, &row1);
+ h_store_16_unpacklo(&dst, stride, &row2);
+ h_store_16_unpacklo(&dst, stride, &row3);
+ h_store_16_unpackhi(&dst, stride, &row4);
+ h_store_16_unpackhi(&dst, stride, &row5);
+ h_store_16_unpackhi(&dst, stride, &row6);
+ h_store_16_unpackhi(&dst, stride, &row7);
+}
+
+void aom_highbd_h_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)above;
+ (void)bd;
+ h_predictor_16x8(dst, stride, left);
+}
+
+void aom_highbd_h_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i;
+ (void)above;
+ (void)bd;
+
+ for (i = 0; i < 2; i++, left += 8) {
+ h_predictor_16x8(dst, stride, left);
+ dst += stride << 3;
+ }
+}
+
+void aom_highbd_h_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i;
+ (void)above;
+ (void)bd;
+
+ for (i = 0; i < 4; i++, left += 8) {
+ h_predictor_16x8(dst, stride, left);
+ dst += stride << 3;
+ }
+}
+
+static INLINE void h_store_32_unpacklo(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *row) {
+ const __m128i val = _mm_unpacklo_epi64(*row, *row);
+ _mm_store_si128((__m128i *)(*dst), val);
+ _mm_store_si128((__m128i *)(*dst + 8), val);
+ _mm_store_si128((__m128i *)(*dst + 16), val);
+ _mm_store_si128((__m128i *)(*dst + 24), val);
+ *dst += stride;
+}
+
+static INLINE void h_store_32_unpackhi(uint16_t **dst, const ptrdiff_t stride,
+ const __m128i *row) {
+ const __m128i val = _mm_unpackhi_epi64(*row, *row);
+ _mm_store_si128((__m128i *)(*dst), val);
+ _mm_store_si128((__m128i *)(*dst + 8), val);
+ _mm_store_si128((__m128i *)(*dst + 16), val);
+ _mm_store_si128((__m128i *)(*dst + 24), val);
+ *dst += stride;
+}
+
+static INLINE void h_predictor_32x8(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left) {
+ const __m128i left_u16 = _mm_load_si128((const __m128i *)left);
+ const __m128i row0 = _mm_shufflelo_epi16(left_u16, 0x0);
+ const __m128i row1 = _mm_shufflelo_epi16(left_u16, 0x55);
+ const __m128i row2 = _mm_shufflelo_epi16(left_u16, 0xaa);
+ const __m128i row3 = _mm_shufflelo_epi16(left_u16, 0xff);
+ const __m128i row4 = _mm_shufflehi_epi16(left_u16, 0x0);
+ const __m128i row5 = _mm_shufflehi_epi16(left_u16, 0x55);
+ const __m128i row6 = _mm_shufflehi_epi16(left_u16, 0xaa);
+ const __m128i row7 = _mm_shufflehi_epi16(left_u16, 0xff);
+ h_store_32_unpacklo(&dst, stride, &row0);
+ h_store_32_unpacklo(&dst, stride, &row1);
+ h_store_32_unpacklo(&dst, stride, &row2);
+ h_store_32_unpacklo(&dst, stride, &row3);
+ h_store_32_unpackhi(&dst, stride, &row4);
+ h_store_32_unpackhi(&dst, stride, &row5);
+ h_store_32_unpackhi(&dst, stride, &row6);
+ h_store_32_unpackhi(&dst, stride, &row7);
+}
+
+void aom_highbd_h_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i;
+ (void)above;
+ (void)bd;
+
+ for (i = 0; i < 2; i++, left += 8) {
+ h_predictor_32x8(dst, stride, left);
+ dst += stride << 3;
+ }
+}
+
+void aom_highbd_h_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ int i;
+ (void)above;
+ (void)bd;
+
+ for (i = 0; i < 4; i++, left += 8) {
+ h_predictor_32x8(dst, stride, left);
+ dst += stride << 3;
+ }
+}
+
+// -----------------------------------------------------------------------------
+// DC_TOP, DC_LEFT, DC_128
+
+// 4x4
+
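+// Horizontal sum of the 4 reference samples; the total ends up in the low
+// word (and also in word 1) of the result.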
+static INLINE __m128i dc_sum_4(const uint16_t *ref) {
+ const __m128i _dcba = _mm_loadl_epi64((const __m128i *)ref);
+ const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
+ const __m128i a = _mm_add_epi16(_dcba, _xxdc);
+ return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
+}
+
+static INLINE void dc_store_4x4(uint16_t *dst, ptrdiff_t stride,
+ const __m128i *dc) {
+ const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
+ int i;
+ for (i = 0; i < 4; ++i, dst += stride) {
+ _mm_storel_epi64((__m128i *)dst, dc_dup);
+ }
+}
+
+void aom_highbd_dc_left_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i two = _mm_cvtsi32_si128(2);
+ const __m128i sum = dc_sum_4(left);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
+ (void)above;
+ (void)bd;
+ dc_store_4x4(dst, stride, &dc);
+}
+
+void aom_highbd_dc_top_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i two = _mm_cvtsi32_si128(2);
+ const __m128i sum = dc_sum_4(above);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
+ (void)left;
+ (void)bd;
+ dc_store_4x4(dst, stride, &dc);
+}
+
+void aom_highbd_dc_128_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_4x4(dst, stride, &dc_dup);
+}
+
+// -----------------------------------------------------------------------------
+// 4x8
+
+static INLINE void dc_store_4x8(uint16_t *dst, ptrdiff_t stride,
+ const __m128i *dc) {
+ const __m128i dc_dup = _mm_shufflelo_epi16(*dc, 0x0);
+ int i;
+ for (i = 0; i < 8; ++i, dst += stride) {
+ _mm_storel_epi64((__m128i *)dst, dc_dup);
+ }
+}
+
+// Shared with DC 8xh
+static INLINE __m128i dc_sum_8(const uint16_t *ref) {
+ const __m128i ref_u16 = _mm_load_si128((const __m128i *)ref);
+ const __m128i _dcba = _mm_add_epi16(ref_u16, _mm_srli_si128(ref_u16, 8));
+ const __m128i _xxdc = _mm_shufflelo_epi16(_dcba, 0xe);
+ const __m128i a = _mm_add_epi16(_dcba, _xxdc);
+
+ return _mm_add_epi16(a, _mm_shufflelo_epi16(a, 0x1));
+}
+
+void aom_highbd_dc_left_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i sum = dc_sum_8(left);
+ const __m128i four = _mm_cvtsi32_si128(4);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
+ (void)above;
+ (void)bd;
+ dc_store_4x8(dst, stride, &dc);
+}
+
+void aom_highbd_dc_top_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i two = _mm_cvtsi32_si128(2);
+ const __m128i sum = dc_sum_4(above);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
+ (void)left;
+ (void)bd;
+ dc_store_4x8(dst, stride, &dc);
+}
+
+void aom_highbd_dc_128_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_4x8(dst, stride, &dc_dup);
+}
+
+// -----------------------------------------------------------------------------
+// 8xh
+
+static INLINE void dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int height,
+ const __m128i *dc) {
+ const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
+ const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
+ int i;
+ for (i = 0; i < height; ++i, dst += stride) {
+ _mm_store_si128((__m128i *)dst, dc_dup);
+ }
+}
+
+// -----------------------------------------------------------------------------
+// DC_TOP
+
+static INLINE void dc_top_predictor_8xh(uint16_t *dst, ptrdiff_t stride,
+ int height, const uint16_t *above) {
+ const __m128i four = _mm_cvtsi32_si128(4);
+ const __m128i sum = dc_sum_8(above);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
+ dc_store_8xh(dst, stride, height, &dc);
+}
+
+void aom_highbd_dc_top_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ dc_top_predictor_8xh(dst, stride, 4, above);
+}
+
+void aom_highbd_dc_top_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ dc_top_predictor_8xh(dst, stride, 8, above);
+}
+
+void aom_highbd_dc_top_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ dc_top_predictor_8xh(dst, stride, 16, above);
+}
+
+// -----------------------------------------------------------------------------
+// DC_LEFT
+
+void aom_highbd_dc_left_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i two = _mm_cvtsi32_si128(2);
+ const __m128i sum = dc_sum_4(left);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, two), 2);
+ (void)above;
+ (void)bd;
+ dc_store_8xh(dst, stride, 4, &dc);
+}
+
+void aom_highbd_dc_left_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i four = _mm_cvtsi32_si128(4);
+ const __m128i sum = dc_sum_8(left);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
+ (void)above;
+ (void)bd;
+ dc_store_8xh(dst, stride, 8, &dc);
+}
+
+// Shared with DC 16xh
+static INLINE __m128i dc_sum_16(const uint16_t *ref) {
+ const __m128i sum_lo = dc_sum_8(ref);
+ const __m128i sum_hi = dc_sum_8(ref + 8);
+ return _mm_add_epi16(sum_lo, sum_hi);
+}
+
+void aom_highbd_dc_left_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i eight = _mm_cvtsi32_si128(8);
+ const __m128i sum = dc_sum_16(left);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
+ (void)above;
+ (void)bd;
+ dc_store_8xh(dst, stride, 16, &dc);
+}
+
+// -----------------------------------------------------------------------------
+// DC_128
+
+static INLINE void dc_128_predictor_8xh(uint16_t *dst, ptrdiff_t stride,
+ int height, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ dc_store_8xh(dst, stride, height, &dc_dup);
+}
+
+void aom_highbd_dc_128_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)above;
+ (void)left;
+ dc_128_predictor_8xh(dst, stride, 4, bd);
+}
+
+void aom_highbd_dc_128_predictor_8x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)above;
+ (void)left;
+ dc_128_predictor_8xh(dst, stride, 8, bd);
+}
+
+void aom_highbd_dc_128_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)above;
+ (void)left;
+ dc_128_predictor_8xh(dst, stride, 16, bd);
+}
+
+// -----------------------------------------------------------------------------
+// 16xh
+
+static INLINE void dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int height,
+ const __m128i *dc) {
+ const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
+ const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
+ int i;
+ for (i = 0; i < height; ++i, dst += stride) {
+ _mm_store_si128((__m128i *)dst, dc_dup);
+ _mm_store_si128((__m128i *)(dst + 8), dc_dup);
+ }
+}
+
+// -----------------------------------------------------------------------------
+// DC_LEFT
+
+void aom_highbd_dc_left_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i four = _mm_cvtsi32_si128(4);
+ const __m128i sum = dc_sum_8(left);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, four), 3);
+ (void)above;
+ (void)bd;
+ dc_store_16xh(dst, stride, 8, &dc);
+}
+
+void aom_highbd_dc_left_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i eight = _mm_cvtsi32_si128(8);
+ const __m128i sum = dc_sum_16(left);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
+ (void)above;
+ (void)bd;
+ dc_store_16xh(dst, stride, 16, &dc);
+}
+
+// Shared with 32xh
+static INLINE __m128i dc_sum_32(const uint16_t *ref) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i sum_a = dc_sum_16(ref);
+ const __m128i sum_b = dc_sum_16(ref + 16);
+  // With 12-bit input the 16-bit sums can overflow, so widen to 32 bits
+  // before adding the final total.
+ return _mm_add_epi32(_mm_unpacklo_epi16(sum_a, zero),
+ _mm_unpacklo_epi16(sum_b, zero));
+}
+
+void aom_highbd_dc_left_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i sixteen = _mm_cvtsi32_si128(16);
+ const __m128i sum = dc_sum_32(left);
+ const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
+ (void)above;
+ (void)bd;
+ dc_store_16xh(dst, stride, 32, &dc);
+}
+
+// -----------------------------------------------------------------------------
+// DC_TOP
+
+void aom_highbd_dc_top_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i eight = _mm_cvtsi32_si128(8);
+ const __m128i sum = dc_sum_16(above);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
+ (void)left;
+ (void)bd;
+ dc_store_16xh(dst, stride, 8, &dc);
+}
+
+void aom_highbd_dc_top_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i eight = _mm_cvtsi32_si128(8);
+ const __m128i sum = dc_sum_16(above);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
+ (void)left;
+ (void)bd;
+ dc_store_16xh(dst, stride, 16, &dc);
+}
+
+void aom_highbd_dc_top_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i eight = _mm_cvtsi32_si128(8);
+ const __m128i sum = dc_sum_16(above);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
+ (void)left;
+ (void)bd;
+ dc_store_16xh(dst, stride, 32, &dc);
+}
+
+// -----------------------------------------------------------------------------
+// DC_128
+
+void aom_highbd_dc_128_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_16xh(dst, stride, 8, &dc_dup);
+}
+
+void aom_highbd_dc_128_predictor_16x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_16xh(dst, stride, 16, &dc_dup);
+}
+
+void aom_highbd_dc_128_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_16xh(dst, stride, 32, &dc_dup);
+}
+
+// -----------------------------------------------------------------------------
+// 32xh
+
+static INLINE void dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int height,
+ const __m128i *dc) {
+ const __m128i dc_dup_lo = _mm_shufflelo_epi16(*dc, 0);
+ const __m128i dc_dup = _mm_unpacklo_epi64(dc_dup_lo, dc_dup_lo);
+ int i;
+ for (i = 0; i < height; ++i, dst += stride) {
+ _mm_store_si128((__m128i *)dst, dc_dup);
+ _mm_store_si128((__m128i *)(dst + 8), dc_dup);
+ _mm_store_si128((__m128i *)(dst + 16), dc_dup);
+ _mm_store_si128((__m128i *)(dst + 24), dc_dup);
+ }
+}
+
+void aom_highbd_dc_left_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i eight = _mm_cvtsi32_si128(8);
+ const __m128i sum = dc_sum_16(left);
+ const __m128i dc = _mm_srli_epi16(_mm_add_epi16(sum, eight), 4);
+ (void)above;
+ (void)bd;
+ dc_store_32xh(dst, stride, 16, &dc);
+}
+
+void aom_highbd_dc_left_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i sixteen = _mm_cvtsi32_si128(16);
+ const __m128i sum = dc_sum_32(left);
+ const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
+ (void)above;
+ (void)bd;
+ dc_store_32xh(dst, stride, 32, &dc);
+}
+
+void aom_highbd_dc_top_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i sixteen = _mm_cvtsi32_si128(16);
+ const __m128i sum = dc_sum_32(above);
+ const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
+ (void)left;
+ (void)bd;
+ dc_store_32xh(dst, stride, 16, &dc);
+}
+
+void aom_highbd_dc_128_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_32xh(dst, stride, 16, &dc_dup);
+}
+
+void aom_highbd_dc_top_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i sixteen = _mm_cvtsi32_si128(16);
+ const __m128i sum = dc_sum_32(above);
+ const __m128i dc = _mm_srli_epi32(_mm_add_epi32(sum, sixteen), 5);
+ (void)left;
+ (void)bd;
+ dc_store_32xh(dst, stride, 32, &dc);
+}
+
+void aom_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ const __m128i dc = _mm_cvtsi32_si128(1 << (bd - 1));
+ const __m128i dc_dup = _mm_shufflelo_epi16(dc, 0x0);
+ (void)above;
+ (void)left;
+ dc_store_32xh(dst, stride, 32, &dc_dup);
+}
+
+// -----------------------------------------------------------------------------
+// V_PRED
+
+void aom_highbd_v_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ const __m128i above_u16 = _mm_loadl_epi64((const __m128i *)above);
+ int i;
+ for (i = 0; i < 2; ++i) {
+ _mm_storel_epi64((__m128i *)dst, above_u16);
+ _mm_storel_epi64((__m128i *)(dst + stride), above_u16);
+ _mm_storel_epi64((__m128i *)(dst + 2 * stride), above_u16);
+ _mm_storel_epi64((__m128i *)(dst + 3 * stride), above_u16);
+ dst += stride << 2;
+ }
+}
+
+void aom_highbd_v_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ const __m128i above_u16 = _mm_load_si128((const __m128i *)above);
+ _mm_store_si128((__m128i *)dst, above_u16);
+ _mm_store_si128((__m128i *)(dst + stride), above_u16);
+ _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16);
+ _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16);
+}
+
+void aom_highbd_v_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ const __m128i above_u16 = _mm_load_si128((const __m128i *)above);
+ int i;
+ for (i = 0; i < 4; ++i) {
+ _mm_store_si128((__m128i *)dst, above_u16);
+ _mm_store_si128((__m128i *)(dst + stride), above_u16);
+ _mm_store_si128((__m128i *)(dst + 2 * stride), above_u16);
+ _mm_store_si128((__m128i *)(dst + 3 * stride), above_u16);
+ dst += stride << 2;
+ }
+}
+
+void aom_highbd_v_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ const __m128i above0_u16 = _mm_load_si128((const __m128i *)above);
+ const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8));
+ int i;
+ for (i = 0; i < 2; ++i) {
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ dst += stride;
+ }
+}
+
+void aom_highbd_v_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ const __m128i above0_u16 = _mm_load_si128((const __m128i *)above);
+ const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8));
+ int i;
+ for (i = 0; i < 8; ++i) {
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ dst += stride;
+ }
+}
+
+void aom_highbd_v_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)left;
+ (void)bd;
+ const __m128i above0_u16 = _mm_load_si128((const __m128i *)above);
+ const __m128i above1_u16 = _mm_load_si128((const __m128i *)(above + 8));
+ const __m128i above2_u16 = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i above3_u16 = _mm_load_si128((const __m128i *)(above + 24));
+ int i;
+ for (i = 0; i < 4; ++i) {
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ _mm_store_si128((__m128i *)(dst + 16), above2_u16);
+ _mm_store_si128((__m128i *)(dst + 24), above3_u16);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ _mm_store_si128((__m128i *)(dst + 16), above2_u16);
+ _mm_store_si128((__m128i *)(dst + 24), above3_u16);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ _mm_store_si128((__m128i *)(dst + 16), above2_u16);
+ _mm_store_si128((__m128i *)(dst + 24), above3_u16);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, above0_u16);
+ _mm_store_si128((__m128i *)(dst + 8), above1_u16);
+ _mm_store_si128((__m128i *)(dst + 16), above2_u16);
+ _mm_store_si128((__m128i *)(dst + 24), above3_u16);
+ dst += stride;
+ }
+}
+
+// -----------------------------------------------------------------------------
+// DC_PRED
+
+void aom_highbd_dc_predictor_4x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)bd;
+ const __m128i sum_above = dc_sum_4(above);
+ const __m128i sum_left = dc_sum_8(left);
+ const __m128i sum = _mm_add_epi16(sum_above, sum_left);
+ uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
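+  // dc_sum_4/8 leave the total in both of the two lowest words; take the
+  // upper one, then compute a rounded average over the 12 (4 above + 8 left)
+  // samples.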
+ sum32 >>= 16;
+ sum32 += 6;
+ sum32 /= 12;
+ const __m128i row = _mm_set1_epi16((int16_t)sum32);
+ int i;
+ for (i = 0; i < 4; ++i) {
+ _mm_storel_epi64((__m128i *)dst, row);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row);
+ dst += stride;
+ }
+}
+
+void aom_highbd_dc_predictor_8x4_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)bd;
+ const __m128i sum_left = dc_sum_4(left);
+ const __m128i sum_above = dc_sum_8(above);
+ const __m128i sum = _mm_add_epi16(sum_above, sum_left);
+ uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
+ sum32 >>= 16;
+ sum32 += 6;
+ sum32 /= 12;
+ const __m128i row = _mm_set1_epi16((int16_t)sum32);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+}
+
+void aom_highbd_dc_predictor_8x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)bd;
+ __m128i sum_left = dc_sum_16(left);
+ __m128i sum_above = dc_sum_8(above);
+ const __m128i zero = _mm_setzero_si128();
+ sum_left = _mm_unpacklo_epi16(sum_left, zero);
+ sum_above = _mm_unpacklo_epi16(sum_above, zero);
+ const __m128i sum = _mm_add_epi32(sum_left, sum_above);
+ uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
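+  // Rounded average over the 24 (8 above + 16 left) samples.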
+ sum32 += 12;
+ sum32 /= 24;
+ const __m128i row = _mm_set1_epi16((int16_t)sum32);
+ int i;
+ for (i = 0; i < 4; ++i) {
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ }
+}
+
+void aom_highbd_dc_predictor_16x8_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)bd;
+ __m128i sum_left = dc_sum_8(left);
+ __m128i sum_above = dc_sum_16(above);
+ const __m128i zero = _mm_setzero_si128();
+ sum_left = _mm_unpacklo_epi16(sum_left, zero);
+ sum_above = _mm_unpacklo_epi16(sum_above, zero);
+ const __m128i sum = _mm_add_epi32(sum_left, sum_above);
+ uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
+ sum32 += 12;
+ sum32 /= 24;
+ const __m128i row = _mm_set1_epi16((int16_t)sum32);
+ int i;
+ for (i = 0; i < 2; ++i) {
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ dst += stride;
+ }
+}
+
+void aom_highbd_dc_predictor_16x32_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)bd;
+ __m128i sum_left = dc_sum_32(left);
+ __m128i sum_above = dc_sum_16(above);
+ const __m128i zero = _mm_setzero_si128();
+ sum_above = _mm_unpacklo_epi16(sum_above, zero);
+ const __m128i sum = _mm_add_epi32(sum_left, sum_above);
+ uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
+ sum32 += 24;
+ sum32 /= 48;
+ const __m128i row = _mm_set1_epi16((int16_t)sum32);
+ int i;
+ for (i = 0; i < 8; ++i) {
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ dst += stride;
+ }
+}
+
+void aom_highbd_dc_predictor_32x16_sse2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *above,
+ const uint16_t *left, int bd) {
+ (void)bd;
+ __m128i sum_left = dc_sum_16(left);
+ __m128i sum_above = dc_sum_32(above);
+ const __m128i zero = _mm_setzero_si128();
+ sum_left = _mm_unpacklo_epi16(sum_left, zero);
+ const __m128i sum = _mm_add_epi32(sum_left, sum_above);
+ uint32_t sum32 = (uint32_t)_mm_cvtsi128_si32(sum);
+ sum32 += 24;
+ sum32 /= 48;
+ const __m128i row = _mm_set1_epi16((int16_t)sum32);
+ int i;
+ for (i = 0; i < 4; ++i) {
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ _mm_store_si128((__m128i *)(dst + 16), row);
+ _mm_store_si128((__m128i *)(dst + 24), row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ _mm_store_si128((__m128i *)(dst + 16), row);
+ _mm_store_si128((__m128i *)(dst + 24), row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ _mm_store_si128((__m128i *)(dst + 16), row);
+ _mm_store_si128((__m128i *)(dst + 24), row);
+ dst += stride;
+ _mm_store_si128((__m128i *)dst, row);
+ _mm_store_si128((__m128i *)(dst + 8), row);
+ _mm_store_si128((__m128i *)(dst + 16), row);
+ _mm_store_si128((__m128i *)(dst + 24), row);
+ dst += stride;
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c
new file mode 100644
index 0000000000..c954da94e5
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_avx2.c
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/common_avx2.h"
+#include "aom_dsp/x86/lpf_common_sse2.h"
+#include "aom/aom_integer.h"
+
+void aom_highbd_lpf_horizontal_14_dual_avx2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_horizontal_14_dual_sse2(s, p, blimit0, limit0, thresh0,
+ blimit1, limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_vertical_14_dual_avx2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_vertical_14_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
+ limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_horizontal_4_dual_avx2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_horizontal_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
+ limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_horizontal_8_dual_avx2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_horizontal_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
+ limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_vertical_4_dual_avx2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_vertical_4_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
+ limit1, thresh1, bd);
+}
+
+void aom_highbd_lpf_vertical_8_dual_avx2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ aom_highbd_lpf_vertical_8_dual_sse2(s, p, blimit0, limit0, thresh0, blimit1,
+ limit1, thresh1, bd);
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
new file mode 100644
index 0000000000..ea7dc6a9e5
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_loopfilter_sse2.c
@@ -0,0 +1,1698 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/lpf_common_sse2.h"
+
+static AOM_FORCE_INLINE void pixel_clamp(const __m128i *min, const __m128i *max,
+ __m128i *pixel) {
+ *pixel = _mm_min_epi16(*pixel, *max);
+ *pixel = _mm_max_epi16(*pixel, *min);
+}
+
+static AOM_FORCE_INLINE __m128i abs_diff16(__m128i a, __m128i b) {
+ return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
+}
+
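+// Widen the 8-bit blimit/limit/thresh values to 16 bits and scale them to the
+// working bit depth; t80 is the mid-range offset 1 << (bd - 1) used to move
+// pixels into a signed range.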
+static INLINE void get_limit(const uint8_t *bl, const uint8_t *l,
+ const uint8_t *t, int bd, __m128i *blt,
+ __m128i *lt, __m128i *thr, __m128i *t80_out) {
+ const int shift = bd - 8;
+ const __m128i zero = _mm_setzero_si128();
+
+ __m128i x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)bl), zero);
+ *blt = _mm_slli_epi16(x, shift);
+
+ x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)l), zero);
+ *lt = _mm_slli_epi16(x, shift);
+
+ x = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)t), zero);
+ *thr = _mm_slli_epi16(x, shift);
+
+ *t80_out = _mm_set1_epi16(1 << (bd - 1));
+}
+
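+// As get_limit(), but with the two parameter sets packed into the low and
+// high 64-bit halves of each output register.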
+static INLINE void get_limit_dual(
+ const uint8_t *_blimit0, const uint8_t *_limit0, const uint8_t *_thresh0,
+ const uint8_t *_blimit1, const uint8_t *_limit1, const uint8_t *_thresh1,
+ int bd, __m128i *blt_out, __m128i *lt_out, __m128i *thr_out,
+ __m128i *t80_out) {
+ const int shift = bd - 8;
+ const __m128i zero = _mm_setzero_si128();
+
+ __m128i x0 =
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit0), zero);
+ __m128i x1 =
+ _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_blimit1), zero);
+ x0 = _mm_unpacklo_epi64(x0, x1);
+ *blt_out = _mm_slli_epi16(x0, shift);
+
+ x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit0), zero);
+ x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_limit1), zero);
+ x0 = _mm_unpacklo_epi64(x0, x1);
+ *lt_out = _mm_slli_epi16(x0, shift);
+
+ x0 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh0), zero);
+ x1 = _mm_unpacklo_epi8(_mm_load_si128((const __m128i *)_thresh1), zero);
+ x0 = _mm_unpacklo_epi64(x0, x1);
+ *thr_out = _mm_slli_epi16(x0, shift);
+
+ *t80_out = _mm_set1_epi16(1 << (bd - 1));
+}
+
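+// Load 'size' rows on each side of the horizontal edge at s:
+// p[i] holds the row (i + 1) * pitch above, q[i] the row i * pitch below.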
+static INLINE void load_highbd_pixel(const uint16_t *s, int size, int pitch,
+ __m128i *p, __m128i *q) {
+ int i;
+ for (i = 0; i < size; i++) {
+ p[i] = _mm_loadu_si128((__m128i *)(s - (i + 1) * pitch));
+ q[i] = _mm_loadu_si128((__m128i *)(s + i * pitch));
+ }
+}
+
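+// Filter mask for the dual case: a lane is all ones when
+// abs(p0 - q0) * 2 + abs(p1 - q1) / 2 stays within blimit and every
+// neighboring difference on both sides stays within limit.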
+static INLINE void highbd_filter_mask_dual(const __m128i *p, const __m128i *q,
+ const __m128i *l, const __m128i *bl,
+ __m128i *mask) {
+ __m128i abs_p0q0 = abs_diff16(p[0], q[0]);
+ __m128i abs_p1q1 = abs_diff16(p[1], q[1]);
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i ffff = _mm_set1_epi16((short)0xFFFF);
+
+ __m128i max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl);
+ max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff);
+ max = _mm_and_si128(max, _mm_adds_epu16(*l, one));
+
+ int i;
+ for (i = 1; i < 4; ++i) {
+ max = _mm_max_epi16(max, abs_diff16(p[i], p[i - 1]));
+ max = _mm_max_epi16(max, abs_diff16(q[i], q[i - 1]));
+ }
+ max = _mm_subs_epu16(max, *l);
+ *mask = _mm_cmpeq_epi16(max, zero); // return ~mask
+}
+
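+// Compute the hev (high edge variance) mask and the filter mask from the
+// merged registers pq[i] (p side in the low, q side in the high 64 bits);
+// x is the number of pixels per side checked against the limit.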
+static INLINE void highbd_hev_filter_mask_x_sse2(__m128i *pq, int x,
+ __m128i *p1p0, __m128i *q1q0,
+ __m128i *abs_p1p0, __m128i *l,
+ __m128i *bl, __m128i *t,
+ __m128i *hev, __m128i *mask) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i ffff = _mm_set1_epi16((short)0xFFFF);
+ __m128i abs_p0q0_p1q1, abs_p0q0, abs_p1q1, abs_q1q0;
+ __m128i max, max01, h;
+
+ *p1p0 = _mm_unpacklo_epi64(pq[0], pq[1]);
+ *q1q0 = _mm_unpackhi_epi64(pq[0], pq[1]);
+
+ abs_p0q0_p1q1 = abs_diff16(*p1p0, *q1q0);
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0_p1q1, abs_p0q0_p1q1);
+ abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
+
+ abs_p1q1 = _mm_srli_si128(abs_p0q0_p1q1, 8);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1); // divide by 2
+
+ max = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), *bl);
+ max = _mm_xor_si128(_mm_cmpeq_epi16(max, zero), ffff);
+ // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1;
+ // So taking maximums continues to work:
+ max = _mm_and_si128(max, _mm_adds_epu16(*l, one));
+
+ *abs_p1p0 = abs_diff16(pq[0], pq[1]);
+ abs_q1q0 = _mm_srli_si128(*abs_p1p0, 8);
+ max01 = _mm_max_epi16(*abs_p1p0, abs_q1q0);
+ // mask |= (abs(*p1 - *p0) > limit) * -1;
+ // mask |= (abs(*q1 - *q0) > limit) * -1;
+ h = _mm_subs_epu16(max01, *t);
+
+ *hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
+  // duplicate into both halves so it applies to the merged p/q registers
+ *hev = _mm_unpacklo_epi64(*hev, *hev);
+
+ max = _mm_max_epi16(max, max01);
+ int i;
+ for (i = 2; i < x; ++i) {
+ max = _mm_max_epi16(max, abs_diff16(pq[i], pq[i - 1]));
+ }
+ max = _mm_max_epi16(max, _mm_srli_si128(max, 8));
+
+ max = _mm_subs_epu16(max, *l);
+ *mask = _mm_cmpeq_epi16(max, zero); // ~mask
+}
+
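+// Flatness check: *flat is all ones (per lane) where every difference
+// |pq[i] - pq[0]|, for i in [start, end), stays within the threshold *th.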
+static INLINE void flat_mask_internal(const __m128i *th, const __m128i *pq,
+ int start, int end, __m128i *flat) {
+ int i;
+ __m128i max = _mm_max_epi16(abs_diff16(pq[start], pq[0]),
+ abs_diff16(pq[start + 1], pq[0]));
+
+ for (i = start + 2; i < end; ++i) {
+ max = _mm_max_epi16(max, abs_diff16(pq[i], pq[0]));
+ }
+ max = _mm_max_epi16(max, _mm_srli_si128(max, 8));
+
+ __m128i ft;
+ ft = _mm_subs_epu16(max, *th);
+
+ const __m128i zero = _mm_setzero_si128();
+ *flat = _mm_cmpeq_epi16(ft, zero);
+}
+
+static INLINE void flat_mask_internal_dual(const __m128i *th, const __m128i *p,
+ const __m128i *q, int start, int end,
+ __m128i *flat) {
+ int i;
+ __m128i max =
+ _mm_max_epi16(abs_diff16(q[start], q[0]), abs_diff16(p[start], p[0]));
+
+ for (i = start + 1; i < end; ++i) {
+ max = _mm_max_epi16(max, abs_diff16(p[i], p[0]));
+ max = _mm_max_epi16(max, abs_diff16(q[i], q[0]));
+ }
+
+ __m128i ft;
+ ft = _mm_subs_epu16(max, *th);
+
+ const __m128i zero = _mm_setzero_si128();
+ *flat = _mm_cmpeq_epi16(ft, zero);
+}
+
+static INLINE void highbd_flat_mask4_sse2(__m128i *pq, __m128i *flat,
+ __m128i *flat2, int bd) {
+  // flat: check pixels 1..3 against pixel 0; flat2: check pixels 4..6
+ __m128i th = _mm_set1_epi16(1);
+ th = _mm_slli_epi16(th, bd - 8);
+ flat_mask_internal(&th, pq, 1, 4, flat);
+ flat_mask_internal(&th, pq, 4, 7, flat2);
+}
+
+static INLINE void highbd_flat_mask4_dual_sse2(const __m128i *p,
+ const __m128i *q, __m128i *flat,
+ __m128i *flat2, int bd) {
+  // flat: check pixels 1..3 against pixel 0; flat2: check pixels 4..6
+ __m128i th = _mm_set1_epi16(1);
+ th = _mm_slli_epi16(th, bd - 8);
+ flat_mask_internal_dual(&th, p, q, 1, 4, flat);
+ flat_mask_internal_dual(&th, p, q, 4, 7, flat2);
+}
+
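+// 4-tap filter on the merged p1p0/q1q0 registers: pixels are biased into a
+// signed range around *t80, the clamped filter adjustments are applied, and
+// the results are shifted back into the unsigned range.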
+static AOM_FORCE_INLINE void highbd_filter4_sse2(__m128i *p1p0, __m128i *q1q0,
+ __m128i *hev, __m128i *mask,
+ __m128i *qs1qs0,
+ __m128i *ps1ps0, __m128i *t80,
+ int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i pmax =
+ _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80);
+ const __m128i pmin = _mm_subs_epi16(zero, *t80);
+
+ const __m128i t3t4 = _mm_set_epi16(3, 3, 3, 3, 4, 4, 4, 4);
+ __m128i ps1ps0_work, qs1qs0_work, work;
+ __m128i filt, filter2filter1, filter2filt, filter1filt;
+
+ ps1ps0_work = _mm_subs_epi16(*p1p0, *t80);
+ qs1qs0_work = _mm_subs_epi16(*q1q0, *t80);
+
+ work = _mm_subs_epi16(ps1ps0_work, qs1qs0_work);
+ pixel_clamp(&pmin, &pmax, &work);
+ filt = _mm_and_si128(_mm_srli_si128(work, 8), *hev);
+
+ filt = _mm_subs_epi16(filt, work);
+ filt = _mm_subs_epi16(filt, work);
+ filt = _mm_subs_epi16(filt, work);
+ // (aom_filter + 3 * (qs0 - ps0)) & mask
+ pixel_clamp(&pmin, &pmax, &filt);
+ filt = _mm_and_si128(filt, *mask);
+ filt = _mm_unpacklo_epi64(filt, filt);
+
+ filter2filter1 = _mm_adds_epi16(filt, t3t4); /* signed_short_clamp */
+ pixel_clamp(&pmin, &pmax, &filter2filter1);
+ filter2filter1 = _mm_srai_epi16(filter2filter1, 3); /* >> 3 */
+
+ filt = _mm_unpacklo_epi64(filter2filter1, filter2filter1);
+
+ // filt >> 1
+ filt = _mm_adds_epi16(filt, one);
+ filt = _mm_srai_epi16(filt, 1);
+ filt = _mm_andnot_si128(*hev, filt);
+
+ filter2filt = _mm_unpackhi_epi64(filter2filter1, filt);
+ filter1filt = _mm_unpacklo_epi64(filter2filter1, filt);
+
+ qs1qs0_work = _mm_subs_epi16(qs1qs0_work, filter1filt);
+ ps1ps0_work = _mm_adds_epi16(ps1ps0_work, filter2filt);
+
+ pixel_clamp(&pmin, &pmax, &qs1qs0_work);
+ pixel_clamp(&pmin, &pmax, &ps1ps0_work);
+
+ *qs1qs0 = _mm_adds_epi16(qs1qs0_work, *t80);
+ *ps1ps0 = _mm_adds_epi16(ps1ps0_work, *t80);
+}
+
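+// Dual (8-pixel-wide) variant of the 4-tap filter; the p and q sides stay in
+// separate registers and the hev mask is computed internally.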
+static INLINE void highbd_filter4_dual_sse2(__m128i *p, __m128i *q, __m128i *ps,
+ __m128i *qs, const __m128i *mask,
+ const __m128i *th, int bd,
+ __m128i *t80) {
+ __m128i ps0 = _mm_subs_epi16(p[0], *t80);
+ __m128i ps1 = _mm_subs_epi16(p[1], *t80);
+ __m128i qs0 = _mm_subs_epi16(q[0], *t80);
+ __m128i qs1 = _mm_subs_epi16(q[1], *t80);
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i pmax =
+ _mm_subs_epi16(_mm_subs_epi16(_mm_slli_epi16(one, bd), one), *t80);
+
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i pmin = _mm_subs_epi16(zero, *t80);
+ __m128i filter = _mm_subs_epi16(ps1, qs1);
+ pixel_clamp(&pmin, &pmax, &filter);
+
+ // hev_filter
+ __m128i hev;
+ const __m128i abs_p1p0 = abs_diff16(p[1], p[0]);
+ const __m128i abs_q1q0 = abs_diff16(q[1], q[0]);
+ __m128i h = _mm_max_epi16(abs_p1p0, abs_q1q0);
+ h = _mm_subs_epu16(h, *th);
+ const __m128i ffff = _mm_cmpeq_epi16(h, h);
+ hev = _mm_xor_si128(_mm_cmpeq_epi16(h, zero), ffff);
+
+ filter = _mm_and_si128(filter, hev);
+
+ const __m128i x = _mm_subs_epi16(qs0, ps0);
+ filter = _mm_adds_epi16(filter, x);
+ filter = _mm_adds_epi16(filter, x);
+ filter = _mm_adds_epi16(filter, x);
+ pixel_clamp(&pmin, &pmax, &filter);
+ filter = _mm_and_si128(filter, *mask);
+ const __m128i t3 = _mm_set1_epi16(3);
+ const __m128i t4 = _mm_set1_epi16(4);
+ __m128i filter1 = _mm_adds_epi16(filter, t4);
+ __m128i filter2 = _mm_adds_epi16(filter, t3);
+ pixel_clamp(&pmin, &pmax, &filter1);
+ pixel_clamp(&pmin, &pmax, &filter2);
+ filter1 = _mm_srai_epi16(filter1, 3);
+ filter2 = _mm_srai_epi16(filter2, 3);
+ qs0 = _mm_subs_epi16(qs0, filter1);
+ pixel_clamp(&pmin, &pmax, &qs0);
+ ps0 = _mm_adds_epi16(ps0, filter2);
+ pixel_clamp(&pmin, &pmax, &ps0);
+ qs[0] = _mm_adds_epi16(qs0, *t80);
+ ps[0] = _mm_adds_epi16(ps0, *t80);
+ filter = _mm_adds_epi16(filter1, one);
+ filter = _mm_srai_epi16(filter, 1);
+ filter = _mm_andnot_si128(hev, filter);
+ qs1 = _mm_subs_epi16(qs1, filter);
+ pixel_clamp(&pmin, &pmax, &qs1);
+ ps1 = _mm_adds_epi16(ps1, filter);
+ pixel_clamp(&pmin, &pmax, &ps1);
+ qs[1] = _mm_adds_epi16(qs1, *t80);
+ ps[1] = _mm_adds_epi16(ps1, *t80);
+}
+
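+// Core of the 14-tap horizontal filter for a 4-pixel edge: applies the 4-tap
+// filter, then conditionally blends in the flat (filter8) and wide-flat
+// (filter14) results using the flat/flat2 masks.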
+static AOM_FORCE_INLINE void highbd_lpf_internal_14_sse2(
+ __m128i *p, __m128i *q, __m128i *pq, const unsigned char *blt,
+ const unsigned char *lt, const unsigned char *thr, int bd) {
+ int i;
+ const __m128i zero = _mm_setzero_si128();
+ __m128i blimit, limit, thresh;
+ __m128i t80;
+ get_limit(blt, lt, thr, bd, &blimit, &limit, &thresh, &t80);
+
+ for (i = 0; i < 7; i++) {
+ pq[i] = _mm_unpacklo_epi64(p[i], q[i]);
+ }
+ __m128i mask, hevhev;
+ __m128i p1p0, q1q0, abs_p1p0;
+
+ highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+ &thresh, &hevhev, &mask);
+
+ __m128i ps0ps1, qs0qs1;
+ // filter4
+ highbd_filter4_sse2(&p1p0, &q1q0, &hevhev, &mask, &qs0qs1, &ps0ps1, &t80, bd);
+
+ __m128i flat, flat2;
+ highbd_flat_mask4_sse2(pq, &flat, &flat2, bd);
+
+ flat = _mm_and_si128(flat, mask);
+ flat2 = _mm_and_si128(flat2, flat);
+
+  // duplicate into both halves so they apply to the merged p/q registers
+ flat = _mm_unpacklo_epi64(flat, flat);
+ flat2 = _mm_unpacklo_epi64(flat2, flat2);
+
+ // flat and wide flat calculations
+
+  // If flat == 0 then flat2 is zero as well and none of the work below is
+  // needed (with SSE4.1: if (0 == _mm_test_all_zeros(flat, ff))).
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i flat_p[3], flat_q[3], flat_pq[3];
+ __m128i flat2_p[6], flat2_q[6];
+ __m128i flat2_pq[6];
+ __m128i sum_p6, sum_p3;
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i four = _mm_set1_epi16(4);
+
+ __m128i work0, work0_0, work0_1, sum_p_0;
+ __m128i sum_p = _mm_add_epi16(pq[5], _mm_add_epi16(pq[4], pq[3]));
+ __m128i sum_lp = _mm_add_epi16(pq[0], _mm_add_epi16(pq[2], pq[1]));
+ sum_p = _mm_add_epi16(sum_p, sum_lp);
+
+ __m128i sum_lq = _mm_srli_si128(sum_lp, 8);
+ __m128i sum_q = _mm_srli_si128(sum_p, 8);
+
+ sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
+ sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
+
+ flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq[3], pq[0]));
+ flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0]));
+
+ sum_p6 = _mm_add_epi16(pq[6], pq[6]);
+ sum_p3 = _mm_add_epi16(pq[3], pq[3]);
+
+ sum_q = _mm_sub_epi16(sum_p_0, pq[5]);
+ sum_p = _mm_sub_epi16(sum_p_0, q[5]);
+
+ work0_0 = _mm_add_epi16(_mm_add_epi16(pq[6], pq[0]), pq[1]);
+ work0_1 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[1], _mm_add_epi16(pq[2], pq[0])));
+
+ sum_lq = _mm_sub_epi16(sum_lp, pq[2]);
+ sum_lp = _mm_sub_epi16(sum_lp, q[2]);
+
+ work0 = _mm_add_epi16(sum_p3, pq[1]);
+ flat_p[1] = _mm_add_epi16(sum_lp, work0);
+ flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
+
+ flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
+ flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
+
+ sum_lp = _mm_sub_epi16(sum_lp, q[1]);
+ sum_lq = _mm_sub_epi16(sum_lq, pq[1]);
+
+ sum_p3 = _mm_add_epi16(sum_p3, pq[3]);
+ work0 = _mm_add_epi16(sum_p3, pq[2]);
+
+ flat_p[2] = _mm_add_epi16(sum_lp, work0);
+ flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
+ flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
+
+ int flat2_mask =
+ (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero)));
+ if (flat2_mask) {
+ flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q[0]));
+ flat2_q[0] = _mm_add_epi16(
+ sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq[0]));
+
+ flat2_p[1] = _mm_add_epi16(sum_p, work0_1);
+ flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8));
+
+ flat2_pq[0] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
+ flat2_pq[1] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
+
+ sum_p = _mm_sub_epi16(sum_p, q[4]);
+ sum_q = _mm_sub_epi16(sum_q, pq[4]);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[2], _mm_add_epi16(pq[3], pq[1])));
+ flat2_p[2] = _mm_add_epi16(sum_p, work0);
+ flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[2] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[3]);
+ sum_q = _mm_sub_epi16(sum_q, pq[3]);
+
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[3], _mm_add_epi16(pq[4], pq[2])));
+ flat2_p[3] = _mm_add_epi16(sum_p, work0);
+ flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[3] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[2]);
+ sum_q = _mm_sub_epi16(sum_q, pq[2]);
+
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[4], _mm_add_epi16(pq[5], pq[3])));
+ flat2_p[4] = _mm_add_epi16(sum_p, work0);
+ flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[4] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[1]);
+ sum_q = _mm_sub_epi16(sum_q, pq[1]);
+
+ work0 = _mm_add_epi16(sum_p6,
+ _mm_add_epi16(pq[5], _mm_add_epi16(pq[6], pq[4])));
+ flat2_p[5] = _mm_add_epi16(sum_p, work0);
+ flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[5] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
+ } // flat2
+ // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // highbd_filter8
+ pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
+ pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
+
+ for (i = 0; i < 3; i++) {
+ pq[i] = _mm_andnot_si128(flat, pq[i]);
+ flat_pq[i] = _mm_and_si128(flat, flat_pq[i]);
+ pq[i] = _mm_or_si128(pq[i], flat_pq[i]);
+ }
+
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ if (flat2_mask) {
+ for (i = 0; i < 6; i++) {
+ pq[i] = _mm_andnot_si128(flat2, pq[i]);
+ flat2_pq[i] = _mm_and_si128(flat2, flat2_pq[i]);
+ pq[i] = _mm_or_si128(pq[i], flat2_pq[i]); // full list of pq values
+ }
+ }
+ } else {
+ pq[0] = _mm_unpacklo_epi64(ps0ps1, qs0qs1);
+ pq[1] = _mm_unpackhi_epi64(ps0ps1, qs0qs1);
+ }
+}
+
+void aom_highbd_lpf_horizontal_14_sse2(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ __m128i p[7], q[7], pq[7];
+ int i;
+
+ for (i = 0; i < 7; i++) {
+ p[i] = _mm_loadl_epi64((__m128i *)(s - (i + 1) * pitch));
+ q[i] = _mm_loadl_epi64((__m128i *)(s + i * pitch));
+ }
+
+ highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd);
+
+ for (i = 0; i < 6; i++) {
+ _mm_storel_epi64((__m128i *)(s - (i + 1) * pitch), pq[i]);
+ _mm_storel_epi64((__m128i *)(s + i * pitch), _mm_srli_si128(pq[i], 8));
+ }
+}
+
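+// Dual (8-pixel-wide) version of the 14-tap horizontal filter core; p[] and
+// q[] hold full 128-bit rows, so the two sides are processed separately.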
+static AOM_FORCE_INLINE void highbd_lpf_internal_14_dual_sse2(
+ __m128i *p, __m128i *q, const uint8_t *blt0, const uint8_t *lt0,
+ const uint8_t *thr0, const uint8_t *blt1, const uint8_t *lt1,
+ const uint8_t *thr1, int bd) {
+ __m128i blimit, limit, thresh, t80;
+ const __m128i zero = _mm_setzero_si128();
+
+ get_limit_dual(blt0, lt0, thr0, blt1, lt1, thr1, bd, &blimit, &limit, &thresh,
+ &t80);
+ __m128i mask;
+ highbd_filter_mask_dual(p, q, &limit, &blimit, &mask);
+ __m128i flat, flat2;
+ highbd_flat_mask4_dual_sse2(p, q, &flat, &flat2, bd);
+
+ flat = _mm_and_si128(flat, mask);
+ flat2 = _mm_and_si128(flat2, flat);
+ __m128i ps[2], qs[2];
+ highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh, bd, &t80);
+ // flat and wide flat calculations
+
+  // If flat == 0 then flat2 is zero as well and none of the work below is
+  // needed (with SSE4.1: if (0 == _mm_test_all_zeros(flat, ff))).
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i flat_p[3], flat_q[3];
+ __m128i flat2_p[6], flat2_q[6];
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i sum_p_0 = _mm_add_epi16(p[5], _mm_add_epi16(p[4], p[3]));
+ __m128i sum_q = _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[3]));
+ __m128i sum_lp = _mm_add_epi16(p[0], _mm_add_epi16(p[2], p[1]));
+ sum_p_0 = _mm_add_epi16(sum_p_0, sum_lp);
+ __m128i sum_lq = _mm_add_epi16(q[0], _mm_add_epi16(q[2], q[1]));
+ sum_q = _mm_add_epi16(sum_q, sum_lq);
+ sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p_0, sum_q));
+ sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
+ flat_p[0] =
+ _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(p[3], p[0])), 3);
+ flat_q[0] =
+ _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(q[3], q[0])), 3);
+ __m128i sum_p6 = _mm_add_epi16(p[6], p[6]);
+ __m128i sum_q6 = _mm_add_epi16(q[6], q[6]);
+ __m128i sum_p3 = _mm_add_epi16(p[3], p[3]);
+ __m128i sum_q3 = _mm_add_epi16(q[3], q[3]);
+
+ sum_q = _mm_sub_epi16(sum_p_0, p[5]);
+ __m128i sum_p = _mm_sub_epi16(sum_p_0, q[5]);
+
+ sum_lq = _mm_sub_epi16(sum_lp, p[2]);
+ sum_lp = _mm_sub_epi16(sum_lp, q[2]);
+ flat_p[1] =
+ _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[1])), 3);
+ flat_q[1] =
+ _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[1])), 3);
+
+ sum_lp = _mm_sub_epi16(sum_lp, q[1]);
+ sum_lq = _mm_sub_epi16(sum_lq, p[1]);
+ sum_p3 = _mm_add_epi16(sum_p3, p[3]);
+ sum_q3 = _mm_add_epi16(sum_q3, q[3]);
+ flat_p[2] =
+ _mm_srli_epi16(_mm_add_epi16(sum_lp, _mm_add_epi16(sum_p3, p[2])), 3);
+ flat_q[2] =
+ _mm_srli_epi16(_mm_add_epi16(sum_lq, _mm_add_epi16(sum_q3, q[2])), 3);
+
+ int flat2_mask =
+ (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat2, zero)));
+ if (flat2_mask) {
+ flat2_p[0] = _mm_srli_epi16(
+ _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(p[6], p[0]),
+ _mm_add_epi16(p[1], q[0]))),
+ 4);
+ flat2_q[0] = _mm_srli_epi16(
+ _mm_add_epi16(sum_p_0, _mm_add_epi16(_mm_add_epi16(q[6], q[0]),
+ _mm_add_epi16(p[0], q[1]))),
+ 4);
+
+ flat2_p[1] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p[1], _mm_add_epi16(p[2], p[0])))),
+ 4);
+ flat2_q[1] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q[1], _mm_add_epi16(q[0], q[2])))),
+ 4);
+ sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+ sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[4]);
+ sum_q = _mm_sub_epi16(sum_q, p[4]);
+ flat2_p[2] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p[2], _mm_add_epi16(p[3], p[1])))),
+ 4);
+ flat2_q[2] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q[2], _mm_add_epi16(q[1], q[3])))),
+ 4);
+ sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+ sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[3]);
+ sum_q = _mm_sub_epi16(sum_q, p[3]);
+ flat2_p[3] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p[3], _mm_add_epi16(p[4], p[2])))),
+ 4);
+ flat2_q[3] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q[3], _mm_add_epi16(q[2], q[4])))),
+ 4);
+ sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+ sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[2]);
+ sum_q = _mm_sub_epi16(sum_q, p[2]);
+ flat2_p[4] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p[4], _mm_add_epi16(p[5], p[3])))),
+ 4);
+ flat2_q[4] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q[4], _mm_add_epi16(q[3], q[5])))),
+ 4);
+ sum_p6 = _mm_add_epi16(sum_p6, p[6]);
+ sum_q6 = _mm_add_epi16(sum_q6, q[6]);
+ sum_p = _mm_sub_epi16(sum_p, q[1]);
+ sum_q = _mm_sub_epi16(sum_q, p[1]);
+ flat2_p[5] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p[5], _mm_add_epi16(p[6], p[4])))),
+ 4);
+ flat2_q[5] = _mm_srli_epi16(
+ _mm_add_epi16(
+ sum_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q[5], _mm_add_epi16(q[4], q[6])))),
+ 4);
+ }
+ // highbd_filter8
+ int i;
+ for (i = 0; i < 2; i++) {
+ ps[i] = _mm_andnot_si128(flat, ps[i]);
+ flat_p[i] = _mm_and_si128(flat, flat_p[i]);
+ p[i] = _mm_or_si128(ps[i], flat_p[i]);
+ qs[i] = _mm_andnot_si128(flat, qs[i]);
+ flat_q[i] = _mm_and_si128(flat, flat_q[i]);
+ q[i] = _mm_or_si128(qs[i], flat_q[i]);
+ }
+ p[2] = _mm_andnot_si128(flat, p[2]);
+ // p2 remains unchanged if !(flat && mask)
+ flat_p[2] = _mm_and_si128(flat, flat_p[2]);
+ // when (flat && mask)
+ p[2] = _mm_or_si128(p[2], flat_p[2]); // full list of p2 values
+ q[2] = _mm_andnot_si128(flat, q[2]);
+ flat_q[2] = _mm_and_si128(flat, flat_q[2]);
+ q[2] = _mm_or_si128(q[2], flat_q[2]); // full list of q2 values
+
+ for (i = 0; i < 2; i++) {
+ ps[i] = _mm_andnot_si128(flat, ps[i]);
+ flat_p[i] = _mm_and_si128(flat, flat_p[i]);
+ p[i] = _mm_or_si128(ps[i], flat_p[i]);
+ qs[i] = _mm_andnot_si128(flat, qs[i]);
+ flat_q[i] = _mm_and_si128(flat, flat_q[i]);
+ q[i] = _mm_or_si128(qs[i], flat_q[i]);
+ }
+ // highbd_filter16
+ if (flat2_mask) {
+ for (i = 0; i < 6; i++) {
+ // p[i] remains unchanged if !(flat2 && flat && mask)
+ p[i] = _mm_andnot_si128(flat2, p[i]);
+ flat2_p[i] = _mm_and_si128(flat2, flat2_p[i]);
+ // get values for when (flat2 && flat && mask)
+ p[i] = _mm_or_si128(p[i], flat2_p[i]); // full list of p values
+ q[i] = _mm_andnot_si128(flat2, q[i]);
+ flat2_q[i] = _mm_and_si128(flat2, flat2_q[i]);
+ q[i] = _mm_or_si128(q[i], flat2_q[i]);
+ }
+ }
+ } else {
+ p[0] = ps[0];
+ q[0] = qs[0];
+ p[1] = ps[1];
+ q[1] = qs[1];
+ }
+}
+
+void aom_highbd_lpf_horizontal_14_dual_sse2(
+ uint16_t *s, int pitch, const uint8_t *_blimit0, const uint8_t *_limit0,
+ const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+ const uint8_t *_thresh1, int bd) {
+ __m128i p[7], q[7];
+ int i;
+ load_highbd_pixel(s, 7, pitch, p, q);
+
+ highbd_lpf_internal_14_dual_sse2(p, q, _blimit0, _limit0, _thresh0, _blimit1,
+ _limit1, _thresh1, bd);
+
+ for (i = 0; i < 6; i++) {
+ _mm_storeu_si128((__m128i *)(s - (i + 1) * pitch), p[i]);
+ _mm_storeu_si128((__m128i *)(s + i * pitch), q[i]);
+ }
+}
+
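+// Core of the 6-tap filter (3 pixels on each side): the 4-tap filter plus a
+// conditional 5-tap flat smoothing that rewrites p1, p0, q0 and q1.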
+static AOM_FORCE_INLINE void highbd_lpf_internal_6_sse2(
+ __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1,
+ __m128i *q2, __m128i *p1p0_out, __m128i *q1q0_out, const uint8_t *_blimit,
+ const uint8_t *_limit, const uint8_t *_thresh, int bd) {
+ __m128i blimit, limit, thresh;
+ __m128i mask, hev, flat;
+ __m128i pq[3];
+ __m128i p1p0, q1q0, abs_p1p0, ps1ps0, qs1qs0;
+ __m128i flat_p1p0, flat_q0q1;
+
+ pq[0] = _mm_unpacklo_epi64(*p0, *q0);
+ pq[1] = _mm_unpacklo_epi64(*p1, *q1);
+ pq[2] = _mm_unpacklo_epi64(*p2, *q2);
+
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i t80;
+ const __m128i one = _mm_set1_epi16(0x1);
+
+ get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
+
+ highbd_hev_filter_mask_x_sse2(pq, 3, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+ &thresh, &hev, &mask);
+
+ // lp filter
+ highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
+
+ // flat_mask
+ flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_p1p0);
+ flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
+
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+
+ flat = _mm_cmpeq_epi16(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+  // duplicate into both halves so it applies to the merged p/q registers
+ flat = _mm_unpacklo_epi64(flat, flat);
+
+ // 5 tap filter
+ // need it only if flat !=0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i workp_a, workp_b, workp_c;
+ __m128i pq0x2_pq1, pq1_pq2;
+
+ // op1
+ pq0x2_pq1 =
+ _mm_add_epi16(_mm_add_epi16(pq[0], pq[0]), pq[1]); // p0 *2 + p1
+ pq1_pq2 = _mm_add_epi16(pq[1], pq[2]); // p1 + p2
+ workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four),
+ pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4
+
+ workp_b = _mm_add_epi16(_mm_add_epi16(pq[2], pq[2]), *q0);
+ workp_b =
+ _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
+
+ // op0
+ workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1
+ workp_a = _mm_add_epi16(workp_a,
+ workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
+ workp_b = _mm_unpacklo_epi64(workp_a, workp_b);
+ flat_p1p0 = _mm_srli_epi16(workp_b, 3);
+
+ // oq0
+ workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[2]),
+ pq[1]); // p0 * 2 + p1 + q0 * 2 + q1 + 4
+ workp_b = _mm_srli_si128(pq1_pq2, 8);
+ workp_a = _mm_add_epi16(
+ workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4
+ // workp_shft0 = _mm_srli_epi16(workp_a, 3);
+
+ // oq1
+ workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq[1]),
+ pq[0]); // p0 + q0 * 2 + q1 * 2 + q2 + 4
+ workp_b = _mm_add_epi16(*q2, *q2);
+ workp_b =
+ _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4
+
+ workp_a = _mm_unpacklo_epi64(workp_a, workp_b);
+ flat_q0q1 = _mm_srli_epi16(workp_a, 3);
+
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
+ q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
+
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
+ p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+ }
+}
+
+static AOM_FORCE_INLINE void highbd_lpf_internal_6_dual_sse2(
+ __m128i *p2, __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1,
+ __m128i *q2, const unsigned char *_blimit0, const unsigned char *_limit0,
+ const unsigned char *_thresh0, const unsigned char *_blimit1,
+ const unsigned char *_limit1, const unsigned char *_thresh1, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i blimit0, limit0, thresh0;
+ __m128i t80;
+ __m128i mask, flat, work;
+ __m128i abs_p1q1, abs_p0q0, abs_p1p0, abs_p2p1, abs_q1q0, abs_q2q1;
+ __m128i op1, op0, oq0, oq1;
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i one = _mm_set1_epi16(0x1);
+ const __m128i ffff = _mm_cmpeq_epi16(one, one);
+
+ get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
+ &blimit0, &limit0, &thresh0, &t80);
+
+ abs_p2p1 = abs_diff16(*p2, *p1);
+ abs_p1p0 = abs_diff16(*p1, *p0);
+ abs_q1q0 = abs_diff16(*q1, *q0);
+ abs_q2q1 = abs_diff16(*q2, *q1);
+
+ abs_p0q0 = abs_diff16(*p0, *q0);
+ abs_p1q1 = abs_diff16(*p1, *q1);
+
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+ mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
+ mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+ // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1;
+ // So taking maximums continues to work:
+ mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
+
+ mask = _mm_max_epi16(abs_q2q1, mask);
+ work = _mm_max_epi16(abs_p1p0, abs_q1q0);
+ mask = _mm_max_epi16(work, mask);
+ mask = _mm_max_epi16(mask, abs_p2p1);
+ mask = _mm_subs_epu16(mask, limit0);
+ mask = _mm_cmpeq_epi16(mask, zero);
+
+ // lp filter
+ __m128i ps[2], qs[2], p[2], q[2];
+ {
+ p[0] = *p0;
+ p[1] = *p1;
+ q[0] = *q0;
+ q[1] = *q1;
+ // filter_mask and hev_mask
+ highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+ }
+
+ // flat_mask
+ flat = _mm_max_epi16(abs_diff16(*q2, *q0), abs_diff16(*p2, *p0));
+ flat = _mm_max_epi16(flat, work);
+
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+
+ flat = _mm_cmpeq_epi16(flat, zero);
+ flat = _mm_and_si128(flat, mask); // flat & mask
+
+ // 5 tap filter
+ // need it only if flat !=0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+
+ // op1
+ workp_a = _mm_add_epi16(_mm_add_epi16(*p0, *p0),
+ _mm_add_epi16(*p1, *p1)); // *p0 *2 + *p1 * 2
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
+ *p2); // *p2 + *p0 * 2 + *p1 * 2 + 4
+
+ workp_b = _mm_add_epi16(_mm_add_epi16(*p2, *p2), *q0);
+ workp_shft0 = _mm_add_epi16(
+ workp_a, workp_b); // *p2 * 3 + *p1 * 2 + *p0 * 2 + *q0 + 4
+ op1 = _mm_srli_epi16(workp_shft0, 3);
+
+ // op0
+ workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q0), *q1); // *q0 * 2 + *q1
+ workp_a =
+ _mm_add_epi16(workp_a,
+ workp_b); // *p2 + *p0 * 2 + *p1 * 2 + *q0 * 2 + *q1 + 4
+ op0 = _mm_srli_epi16(workp_a, 3);
+
+ // oq0
+ workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, *p2),
+ *p1); // *p0 * 2 + *p1 + *q0 * 2 + *q1 + 4
+ workp_b = _mm_add_epi16(*q1, *q2);
+ workp_shft0 = _mm_add_epi16(
+ workp_a, workp_b); // *p0 * 2 + *p1 + *q0 * 2 + *q1 * 2 + *q2 + 4
+ oq0 = _mm_srli_epi16(workp_shft0, 3);
+
+ // oq1
+ workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_shft0, *p1),
+ *p0); // *p0 + *q0 * 2 + *q1 * 2 + *q2 + 4
+ workp_b = _mm_add_epi16(*q2, *q2);
+ workp_shft1 = _mm_add_epi16(
+ workp_a, workp_b); // *p0 + *q0 * 2 + *q1 * 2 + *q2 * 3 + 4
+ oq1 = _mm_srli_epi16(workp_shft1, 3);
+
+ qs[0] = _mm_andnot_si128(flat, qs[0]);
+ oq0 = _mm_and_si128(flat, oq0);
+ *q0 = _mm_or_si128(qs[0], oq0);
+
+ qs[1] = _mm_andnot_si128(flat, qs[1]);
+ oq1 = _mm_and_si128(flat, oq1);
+ *q1 = _mm_or_si128(qs[1], oq1);
+
+ ps[0] = _mm_andnot_si128(flat, ps[0]);
+ op0 = _mm_and_si128(flat, op0);
+ *p0 = _mm_or_si128(ps[0], op0);
+
+ ps[1] = _mm_andnot_si128(flat, ps[1]);
+ op1 = _mm_and_si128(flat, op1);
+ *p1 = _mm_or_si128(ps[1], op1);
+ } else {
+ *q0 = qs[0];
+ *q1 = qs[1];
+ *p0 = ps[0];
+ *p1 = ps[1];
+ }
+}
+
+void aom_highbd_lpf_horizontal_6_sse2(uint16_t *s, int p,
+ const uint8_t *_blimit,
+ const uint8_t *_limit,
+ const uint8_t *_thresh, int bd) {
+ __m128i p2, p1, p0, q0, q1, q2, p1p0_out, q1q0_out;
+
+ p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+ p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+ p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+ q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
+ q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+ q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+
+ highbd_lpf_internal_6_sse2(&p2, &p1, &p0, &q0, &q1, &q2, &p1p0_out, &q1q0_out,
+ _blimit, _limit, _thresh, bd);
+
+ _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0_out, 8));
+ _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0_out);
+ _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0_out);
+ _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0_out, 8));
+}
+
+void aom_highbd_lpf_horizontal_6_dual_sse2(
+ uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+ const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+ const uint8_t *_thresh1, int bd) {
+ __m128i p2, p1, p0, q0, q1, q2;
+
+ p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+ q0 = _mm_loadu_si128((__m128i *)(s + 0 * p));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+ q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+
+ highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0,
+ _limit0, _thresh0, _blimit1, _limit1,
+ _thresh1, bd);
+
+ _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+ _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+}
+
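+// Core of the 8-tap filter (4 pixels on each side): the 4-tap filter plus a
+// conditional flat (filter8) smoothing that also rewrites p2 and q2.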
+static AOM_FORCE_INLINE void highbd_lpf_internal_8_sse2(
+ __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
+ __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
+ const unsigned char *_blimit, const unsigned char *_limit,
+ const unsigned char *_thresh, int bd) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i blimit, limit, thresh;
+ __m128i mask, hev, flat;
+ __m128i pq[4];
+ __m128i p1p0, q1q0, ps1ps0, qs1qs0;
+ __m128i work_a, opq2, flat_p1p0, flat_q0q1;
+
+ pq[0] = _mm_unpacklo_epi64(*p0, *q0);
+ pq[1] = _mm_unpacklo_epi64(*p1, *q1);
+ pq[2] = _mm_unpacklo_epi64(*p2, *q2);
+ pq[3] = _mm_unpacklo_epi64(*p3, *q3);
+
+ __m128i abs_p1p0;
+
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i t80;
+ const __m128i one = _mm_set1_epi16(0x1);
+
+ get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
+
+ highbd_hev_filter_mask_x_sse2(pq, 4, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+ &thresh, &hev, &mask);
+
+ // lp filter
+ highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
+
+ // flat_mask4
+ flat = _mm_max_epi16(abs_diff16(pq[2], pq[0]), abs_diff16(pq[3], pq[0]));
+ flat = _mm_max_epi16(abs_p1p0, flat);
+ flat = _mm_max_epi16(flat, _mm_srli_si128(flat, 8));
+
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+
+ flat = _mm_cmpeq_epi16(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+  // duplicate into both halves so it applies to the merged p/q registers
+ flat = _mm_unpacklo_epi64(flat, flat);
+
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i workp_a, workp_b, workp_c, workp_shft0, workp_shft1;
+ // Added before shift for rounding part of ROUND_POWER_OF_TWO
+
+ // o*p2
+ workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1));
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0);
+ workp_c = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
+ workp_c = _mm_add_epi16(workp_a, workp_c);
+
+ // o*p1
+ workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1);
+ workp_shft0 = _mm_add_epi16(workp_a, workp_b);
+
+ // o*p0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0);
+ workp_shft1 = _mm_add_epi16(workp_a, workp_b);
+
+ flat_p1p0 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft1, workp_shft0), 3);
+
+ // oq0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0);
+ workp_shft0 = _mm_add_epi16(workp_a, workp_b);
+
+ // oq1
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1);
+ workp_shft1 = _mm_add_epi16(workp_a, workp_b);
+
+ flat_q0q1 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_shft0, workp_shft1), 3);
+
+ // oq2
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
+ workp_a = _mm_add_epi16(workp_a, workp_b);
+ opq2 = _mm_srli_epi16(_mm_unpacklo_epi64(workp_c, workp_a), 3);
+
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
+ q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
+
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
+ p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+
+ work_a = _mm_andnot_si128(flat, pq[2]);
+ *p2 = _mm_and_si128(flat, opq2);
+ *p2 = _mm_or_si128(work_a, *p2);
+ *q2 = _mm_srli_si128(*p2, 8);
+ }
+}
+
+static AOM_FORCE_INLINE void highbd_lpf_internal_8_dual_sse2(
+ __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
+ __m128i *q1, __m128i *p0, __m128i *q0, const unsigned char *_blimit0,
+ const unsigned char *_limit0, const unsigned char *_thresh0,
+ const unsigned char *_blimit1, const unsigned char *_limit1,
+ const unsigned char *_thresh1, int bd) {
+ __m128i blimit0, limit0, thresh0;
+ __m128i t80;
+ __m128i mask, flat;
+ __m128i work_a, op2, oq2, op1, op0, oq0, oq1;
+ __m128i abs_p1q1, abs_p0q0, work0, work1, work2;
+
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i one = _mm_set1_epi16(0x1);
+ const __m128i ffff = _mm_cmpeq_epi16(one, one);
+
+ get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
+ &blimit0, &limit0, &thresh0, &t80);
+
+ abs_p0q0 = abs_diff16(*p0, *q0);
+ abs_p1q1 = abs_diff16(*p1, *q1);
+
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+ mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
+ mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+ // mask |= (abs(*p0 - q0) * 2 + abs(*p1 - q1) / 2 > blimit) * -1;
+
+ // So taking maximums continues to work:
+ mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
+
+ work0 = _mm_max_epi16(abs_diff16(*p3, *p2), abs_diff16(*p2, *p1));
+  // work1 is reused below for the flat mask.
+  work1 = _mm_max_epi16(abs_diff16(*p1, *p0), abs_diff16(*q1, *q0));
+ work0 = _mm_max_epi16(work0, work1);
+ work2 = _mm_max_epi16(abs_diff16(*q2, *q1), abs_diff16(*q2, *q3));
+ work2 = _mm_max_epi16(work2, work0);
+ mask = _mm_max_epi16(work2, mask);
+
+ mask = _mm_subs_epu16(mask, limit0);
+ mask = _mm_cmpeq_epi16(mask, zero);
+
+ // lp filter
+ __m128i ps[2], qs[2], p[2], q[2];
+ {
+ p[0] = *p0;
+ p[1] = *p1;
+ q[0] = *q0;
+ q[1] = *q1;
+ // filter_mask and hev_mask
+ highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+ }
+
+ flat = _mm_max_epi16(abs_diff16(*p2, *p0), abs_diff16(*q2, *q0));
+ flat = _mm_max_epi16(work1, flat);
+ work0 = _mm_max_epi16(abs_diff16(*p3, *p0), abs_diff16(*q3, *q0));
+ flat = _mm_max_epi16(work0, flat);
+
+ flat = _mm_subs_epu16(flat, _mm_slli_epi16(one, bd - 8));
+ flat = _mm_cmpeq_epi16(flat, zero);
+ flat = _mm_and_si128(flat, mask); // flat & mask
+
+ // filter8 need it only if flat !=0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi16(flat, zero))) {
+ __m128i workp_a, workp_b;
+ // Added before shift for rounding part of ROUND_POWER_OF_TWO
+
+ // o*p2
+ workp_a = _mm_add_epi16(_mm_add_epi16(*p3, *p3), _mm_add_epi16(*p2, *p1));
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), *p0);
+ workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *p2), *p3);
+ op2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // o*p1
+ workp_b = _mm_add_epi16(_mm_add_epi16(*q0, *q1), *p1);
+ op1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // o*p0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q2);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p1), *p0);
+ op0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // oq0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p3), *q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *p0), *q0);
+ oq0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // oq1
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p2), *q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q0), *q1);
+ oq1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // oq2
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, *p1), *q3);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, *q1), *q2);
+ oq2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ qs[0] = _mm_andnot_si128(flat, qs[0]);
+ oq0 = _mm_and_si128(flat, oq0);
+ *q0 = _mm_or_si128(qs[0], oq0);
+
+ qs[1] = _mm_andnot_si128(flat, qs[1]);
+ oq1 = _mm_and_si128(flat, oq1);
+ *q1 = _mm_or_si128(qs[1], oq1);
+
+ ps[0] = _mm_andnot_si128(flat, ps[0]);
+ op0 = _mm_and_si128(flat, op0);
+ *p0 = _mm_or_si128(ps[0], op0);
+
+ ps[1] = _mm_andnot_si128(flat, ps[1]);
+ op1 = _mm_and_si128(flat, op1);
+ *p1 = _mm_or_si128(ps[1], op1);
+
+ work_a = _mm_andnot_si128(flat, *q2);
+ *q2 = _mm_and_si128(flat, oq2);
+ *q2 = _mm_or_si128(work_a, *q2);
+
+ work_a = _mm_andnot_si128(flat, *p2);
+ *p2 = _mm_and_si128(flat, op2);
+ *p2 = _mm_or_si128(work_a, *p2);
+ } else {
+ *q0 = qs[0];
+ *q1 = qs[1];
+ *p0 = ps[0];
+ *p1 = ps[1];
+ }
+}
+
+void aom_highbd_lpf_horizontal_8_sse2(uint16_t *s, int p,
+ const uint8_t *_blimit,
+ const uint8_t *_limit,
+ const uint8_t *_thresh, int bd) {
+ __m128i p2, p1, p0, q0, q1, q2, p3, q3;
+ __m128i q1q0, p1p0;
+
+ p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
+ q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
+ p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+ q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+ p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+ q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+ p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+ q0 = _mm_loadl_epi64((__m128i *)(s + 0 * p));
+
+ highbd_lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0,
+ &p1p0, _blimit, _limit, _thresh, bd);
+
+ _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
+ _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
+ _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
+ _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
+ _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
+ _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
+}
+
+void aom_highbd_lpf_horizontal_8_dual_sse2(
+ uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+ const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+ const uint8_t *_thresh1, int bd) {
+ __m128i p2, p1, p0, q0, q1, q2, p3, q3;
+
+ p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+ q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+ p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+ q0 = _mm_loadu_si128((__m128i *)(s + 0 * p));
+
+ highbd_lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0,
+ _blimit0, _limit0, _thresh0, _blimit1,
+ _limit1, _thresh1, bd);
+
+ _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+ _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+ _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+}
+
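+// Core of the 4-tap filter: filter/hev mask computation followed by the
+// 4-tap filter only; no flat smoothing is applied.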
+static AOM_FORCE_INLINE void highbd_lpf_internal_4_sse2(
+ __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *q1q0_out,
+ __m128i *p1p0_out, const uint8_t *_blimit, const uint8_t *_limit,
+ const uint8_t *_thresh, int bd) {
+ __m128i blimit, limit, thresh;
+ __m128i mask, hev;
+ __m128i p1p0, q1q0;
+ __m128i pq[2];
+
+ __m128i abs_p1p0;
+
+ __m128i t80;
+ get_limit(_blimit, _limit, _thresh, bd, &blimit, &limit, &thresh, &t80);
+
+ pq[0] = _mm_unpacklo_epi64(*p0, *q0);
+ pq[1] = _mm_unpacklo_epi64(*p1, *q1);
+
+ highbd_hev_filter_mask_x_sse2(pq, 2, &p1p0, &q1q0, &abs_p1p0, &limit, &blimit,
+ &thresh, &hev, &mask);
+
+ highbd_filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out, &t80, bd);
+}
+
+static AOM_FORCE_INLINE void highbd_lpf_internal_4_dual_sse2(
+ __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *ps,
+ __m128i *qs, const uint8_t *_blimit0, const uint8_t *_limit0,
+ const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+ const uint8_t *_thresh1, int bd) {
+ __m128i blimit0, limit0, thresh0;
+ __m128i mask, flat;
+ __m128i p[2], q[2];
+
+ const __m128i zero = _mm_setzero_si128();
+ __m128i abs_p0q0 = abs_diff16(*q0, *p0);
+ __m128i abs_p1q1 = abs_diff16(*q1, *p1);
+
+ __m128i abs_p1p0 = abs_diff16(*p1, *p0);
+ __m128i abs_q1q0 = abs_diff16(*q1, *q0);
+
+ const __m128i ffff = _mm_cmpeq_epi16(abs_p1p0, abs_p1p0);
+ const __m128i one = _mm_set1_epi16(1);
+
+ __m128i t80;
+
+ get_limit_dual(_blimit0, _limit0, _thresh0, _blimit1, _limit1, _thresh1, bd,
+ &blimit0, &limit0, &thresh0, &t80);
+
+ // filter_mask and hev_mask
+ flat = _mm_max_epi16(abs_p1p0, abs_q1q0);
+
+ abs_p0q0 = _mm_adds_epu16(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 1);
+
+ mask = _mm_subs_epu16(_mm_adds_epu16(abs_p0q0, abs_p1q1), blimit0);
+ mask = _mm_xor_si128(_mm_cmpeq_epi16(mask, zero), ffff);
+ // mask |= (abs(*p0 - *q0) * 2 + abs(*p1 - *q1) / 2 > blimit) * -1;
+ // So taking maximums continues to work:
+ mask = _mm_and_si128(mask, _mm_adds_epu16(limit0, one));
+ mask = _mm_max_epi16(flat, mask);
+
+ mask = _mm_subs_epu16(mask, limit0);
+ mask = _mm_cmpeq_epi16(mask, zero);
+
+ p[0] = *p0;
+ p[1] = *p1;
+ q[0] = *q0;
+ q[1] = *q1;
+
+ highbd_filter4_dual_sse2(p, q, ps, qs, &mask, &thresh0, bd, &t80);
+}
+
+void aom_highbd_lpf_horizontal_4_sse2(uint16_t *s, int p,
+ const uint8_t *_blimit,
+ const uint8_t *_limit,
+ const uint8_t *_thresh, int bd) {
+ __m128i p1p0, q1q0;
+ __m128i p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+ __m128i p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+ __m128i q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
+ __m128i q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+
+ highbd_lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &q1q0, &p1p0, _blimit, _limit,
+ _thresh, bd);
+
+ _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
+ _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
+ _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
+ _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
+}
+
+void aom_highbd_lpf_horizontal_4_dual_sse2(
+ uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+ const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+ const uint8_t *_thresh1, int bd) {
+ __m128i p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+ __m128i p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+ __m128i q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+ __m128i q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+ __m128i ps[2], qs[2];
+
+ highbd_lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, ps, qs, _blimit0, _limit0,
+ _thresh0, _blimit1, _limit1, _thresh1, bd);
+
+ _mm_storeu_si128((__m128i *)(s - 2 * p), ps[1]);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), ps[0]);
+ _mm_storeu_si128((__m128i *)(s + 0 * p), qs[0]);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), qs[1]);
+}
+
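+// The vertical filters below transpose the edge pixels into rows, reuse the
+// horizontal filter cores, and transpose the filtered rows back before
+// storing them.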
+void aom_highbd_lpf_vertical_4_sse2(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ __m128i x0, x1, x2, x3, d0, d1, d2, d3;
+ __m128i p1p0, q1q0;
+ __m128i p1, q1;
+
+ x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
+ x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
+ x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
+ x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
+
+ highbd_transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &d0, &d1, &d2, &d3);
+
+ highbd_lpf_internal_4_sse2(&d0, &d1, &d2, &d3, &q1q0, &p1p0, blimit, limit,
+ thresh, bd);
+
+ p1 = _mm_srli_si128(p1p0, 8);
+ q1 = _mm_srli_si128(q1q0, 8);
+
+ // transpose from 8x4 to 4x8
+ highbd_transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3);
+
+ _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
+ _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
+ _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
+ _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
+}
+
+void aom_highbd_lpf_vertical_4_dual_sse2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+ __m128i ps[2], qs[2];
+
+ x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
+ x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
+ x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
+ x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
+ x4 = _mm_loadl_epi64((__m128i *)(s - 2 + 4 * p));
+ x5 = _mm_loadl_epi64((__m128i *)(s - 2 + 5 * p));
+ x6 = _mm_loadl_epi64((__m128i *)(s - 2 + 6 * p));
+ x7 = _mm_loadl_epi64((__m128i *)(s - 2 + 7 * p));
+
+ highbd_transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1,
+ &d2, &d3);
+
+ highbd_lpf_internal_4_dual_sse2(&d0, &d1, &d2, &d3, ps, qs, blimit0, limit0,
+ thresh0, blimit1, limit1, thresh1, bd);
+
+ highbd_transpose4x8_8x4_sse2(&ps[1], &ps[0], &qs[0], &qs[1], &d0, &d1, &d2,
+ &d3, &d4, &d5, &d6, &d7);
+
+ _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
+ _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
+ _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
+ _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
+ _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4);
+ _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5);
+ _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6);
+ _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7);
+}
+
+void aom_highbd_lpf_vertical_6_sse2(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+ __m128i x3, x2, x1, x0, p0, q0;
+ __m128i p1p0, q1q0;
+
+ x3 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p));
+ x2 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p));
+ x1 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p));
+ x0 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p));
+
+ highbd_transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5,
+ &d6, &d7);
+
+ highbd_lpf_internal_6_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &p1p0, &q1q0, blimit,
+ limit, thresh, bd);
+
+ p0 = _mm_srli_si128(p1p0, 8);
+ q0 = _mm_srli_si128(q1q0, 8);
+
+ highbd_transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3);
+
+ _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
+ _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
+ _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
+ _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
+}
+
+void aom_highbd_lpf_vertical_6_dual_sse2(
+ uint16_t *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+ const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+ const uint8_t *_thresh1, int bd) {
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128i p0, q0, p1, q1, p2, q2;
+
+ x0 = _mm_loadu_si128((__m128i *)((s - 3) + 0 * p));
+ x1 = _mm_loadu_si128((__m128i *)((s - 3) + 1 * p));
+ x2 = _mm_loadu_si128((__m128i *)((s - 3) + 2 * p));
+ x3 = _mm_loadu_si128((__m128i *)((s - 3) + 3 * p));
+ x4 = _mm_loadu_si128((__m128i *)((s - 3) + 4 * p));
+ x5 = _mm_loadu_si128((__m128i *)((s - 3) + 5 * p));
+ x6 = _mm_loadu_si128((__m128i *)((s - 3) + 6 * p));
+ x7 = _mm_loadu_si128((__m128i *)((s - 3) + 7 * p));
+
+ highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p2, &p1,
+ &p0, &q0, &q1, &q2, &d6, &d7);
+
+ highbd_lpf_internal_6_dual_sse2(&p2, &p1, &p0, &q0, &q1, &q2, _blimit0,
+ _limit0, _thresh0, _blimit1, _limit1,
+ _thresh1, bd);
+
+ highbd_transpose4x8_8x4_sse2(&p1, &p0, &q0, &q1, &d0, &d1, &d2, &d3, &d4, &d5,
+ &d6, &d7);
+
+ _mm_storel_epi64((__m128i *)(s - 2 + 0 * p), d0);
+ _mm_storel_epi64((__m128i *)(s - 2 + 1 * p), d1);
+ _mm_storel_epi64((__m128i *)(s - 2 + 2 * p), d2);
+ _mm_storel_epi64((__m128i *)(s - 2 + 3 * p), d3);
+ _mm_storel_epi64((__m128i *)(s - 2 + 4 * p), d4);
+ _mm_storel_epi64((__m128i *)(s - 2 + 5 * p), d5);
+ _mm_storel_epi64((__m128i *)(s - 2 + 6 * p), d6);
+ _mm_storel_epi64((__m128i *)(s - 2 + 7 * p), d7);
+}
+
+void aom_highbd_lpf_vertical_8_sse2(uint16_t *s, int p, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh,
+ int bd) {
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+ __m128i p2, p1, p0, p3, q0;
+ __m128i q1q0, p1p0;
+
+ p3 = _mm_loadu_si128((__m128i *)((s - 4) + 0 * p));
+ p2 = _mm_loadu_si128((__m128i *)((s - 4) + 1 * p));
+ p1 = _mm_loadu_si128((__m128i *)((s - 4) + 2 * p));
+ p0 = _mm_loadu_si128((__m128i *)((s - 4) + 3 * p));
+
+ highbd_transpose4x8_8x4_sse2(&p3, &p2, &p1, &p0, &d0, &d1, &d2, &d3, &d4, &d5,
+ &d6, &d7);
+
+ // Loop filtering
+ highbd_lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0,
+ &p1p0, blimit, limit, thresh, bd);
+
+ p0 = _mm_srli_si128(p1p0, 8);
+ q0 = _mm_srli_si128(q1q0, 8);
+
+ highbd_transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0,
+ &d1, &d2, &d3);
+
+ _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), d0);
+ _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), d1);
+ _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), d2);
+ _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), d3);
+}
+
+void aom_highbd_lpf_vertical_8_dual_sse2(
+ uint16_t *s, int p, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+
+ x0 = _mm_loadu_si128((__m128i *)(s - 4 + 0 * p));
+ x1 = _mm_loadu_si128((__m128i *)(s - 4 + 1 * p));
+ x2 = _mm_loadu_si128((__m128i *)(s - 4 + 2 * p));
+ x3 = _mm_loadu_si128((__m128i *)(s - 4 + 3 * p));
+ x4 = _mm_loadu_si128((__m128i *)(s - 4 + 4 * p));
+ x5 = _mm_loadu_si128((__m128i *)(s - 4 + 5 * p));
+ x6 = _mm_loadu_si128((__m128i *)(s - 4 + 6 * p));
+ x7 = _mm_loadu_si128((__m128i *)(s - 4 + 7 * p));
+
+ highbd_transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0, &d1,
+ &d2, &d3, &d4, &d5, &d6, &d7);
+
+ highbd_lpf_internal_8_dual_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4,
+ blimit0, limit0, thresh0, blimit1, limit1,
+ thresh1, bd);
+
+ highbd_transpose8x8_sse2(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &x0, &x1,
+ &x2, &x3, &x4, &x5, &x6, &x7);
+
+ _mm_storeu_si128((__m128i *)(s - 4 + 0 * p), x0);
+ _mm_storeu_si128((__m128i *)(s - 4 + 1 * p), x1);
+ _mm_storeu_si128((__m128i *)(s - 4 + 2 * p), x2);
+ _mm_storeu_si128((__m128i *)(s - 4 + 3 * p), x3);
+ _mm_storeu_si128((__m128i *)(s - 4 + 4 * p), x4);
+ _mm_storeu_si128((__m128i *)(s - 4 + 5 * p), x5);
+ _mm_storeu_si128((__m128i *)(s - 4 + 6 * p), x6);
+ _mm_storeu_si128((__m128i *)(s - 4 + 7 * p), x7);
+}
+
+void aom_highbd_lpf_vertical_14_sse2(uint16_t *s, int pitch,
+ const uint8_t *blimit,
+ const uint8_t *limit,
+ const uint8_t *thresh, int bd) {
+ __m128i q[7], p[7], pq[7];
+ __m128i p6, p5, p4, p3;
+ __m128i p6_2, p5_2, p4_2, p3_2;
+ __m128i d0, d1, d2, d3;
+ __m128i d0_2, d1_2, d2_2, d3_2, d7_2;
+
+ p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch));
+ p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch));
+ p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch));
+ p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch));
+
+ highbd_transpose4x8_8x4_sse2(&p6, &p5, &p4, &p3, &d0, &p[6], &p[5], &p[4],
+ &p[3], &p[2], &p[1], &p[0]);
+
+ p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch));
+ p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+ p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+ p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
+
+ highbd_transpose4x8_8x4_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &q[0], &q[1], &q[2],
+ &q[3], &q[4], &q[5], &q[6], &d7_2);
+
+ highbd_lpf_internal_14_sse2(p, q, pq, blimit, limit, thresh, bd);
+
+ highbd_transpose8x8_low_sse2(&d0, &p[6], &pq[5], &pq[4], &pq[3], &pq[2],
+ &pq[1], &pq[0], &d0, &d1, &d2, &d3);
+
+ q[0] = _mm_srli_si128(pq[0], 8);
+ q[1] = _mm_srli_si128(pq[1], 8);
+ q[2] = _mm_srli_si128(pq[2], 8);
+ q[3] = _mm_srli_si128(pq[3], 8);
+ q[4] = _mm_srli_si128(pq[4], 8);
+ q[5] = _mm_srli_si128(pq[5], 8);
+
+ highbd_transpose8x8_low_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6],
+ &d7_2, &d0_2, &d1_2, &d2_2, &d3_2);
+
+ _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0);
+ _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_2);
+
+ _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1);
+ _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_2);
+
+ _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2);
+ _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_2);
+
+ _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3);
+ _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_2);
+}
+
+void aom_highbd_lpf_vertical_14_dual_sse2(
+ uint16_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0,
+ const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1,
+ const uint8_t *thresh1, int bd) {
+ __m128i q[7], p[7];
+ __m128i p6, p5, p4, p3, p2, p1, p0, q0;
+ __m128i p6_2, p5_2, p4_2, p3_2, p2_2, p1_2, q0_2, p0_2;
+ __m128i d0, d7;
+ __m128i d0_out, d1_out, d2_out, d3_out, d4_out, d5_out, d6_out, d7_out;
+
+ p6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * pitch));
+ p5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * pitch));
+ p4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * pitch));
+ p3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * pitch));
+ p2 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * pitch));
+ p1 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * pitch));
+ p0 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * pitch));
+ q0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * pitch));
+
+ highbd_transpose8x8_sse2(&p6, &p5, &p4, &p3, &p2, &p1, &p0, &q0, &d0, &p[6],
+ &p[5], &p[4], &p[3], &p[2], &p[1], &p[0]);
+
+ p6_2 = _mm_loadu_si128((__m128i *)(s + 0 * pitch));
+ p5_2 = _mm_loadu_si128((__m128i *)(s + 1 * pitch));
+ p4_2 = _mm_loadu_si128((__m128i *)(s + 2 * pitch));
+ p3_2 = _mm_loadu_si128((__m128i *)(s + 3 * pitch));
+ p2_2 = _mm_loadu_si128((__m128i *)(s + 4 * pitch));
+ p1_2 = _mm_loadu_si128((__m128i *)(s + 5 * pitch));
+ p0_2 = _mm_loadu_si128((__m128i *)(s + 6 * pitch));
+ q0_2 = _mm_loadu_si128((__m128i *)(s + 7 * pitch));
+
+ highbd_transpose8x8_sse2(&p6_2, &p5_2, &p4_2, &p3_2, &p2_2, &p1_2, &p0_2,
+ &q0_2, &q[0], &q[1], &q[2], &q[3], &q[4], &q[5],
+ &q[6], &d7);
+
+ highbd_lpf_internal_14_dual_sse2(p, q, blimit0, limit0, thresh0, blimit1,
+ limit1, thresh1, bd);
+
+ highbd_transpose8x8_sse2(&d0, &p[6], &p[5], &p[4], &p[3], &p[2], &p[1], &p[0],
+ &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out,
+ &d6_out, &d7_out);
+
+ _mm_storeu_si128((__m128i *)(s - 8 + 0 * pitch), d0_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 1 * pitch), d1_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 2 * pitch), d2_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 3 * pitch), d3_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 4 * pitch), d4_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 5 * pitch), d5_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 6 * pitch), d6_out);
+ _mm_storeu_si128((__m128i *)(s - 8 + 7 * pitch), d7_out);
+
+ highbd_transpose8x8_sse2(&q[0], &q[1], &q[2], &q[3], &q[4], &q[5], &q[6], &d7,
+ &d0_out, &d1_out, &d2_out, &d3_out, &d4_out, &d5_out,
+ &d6_out, &d7_out);
+
+ _mm_storeu_si128((__m128i *)(s + 0 * pitch), d0_out);
+ _mm_storeu_si128((__m128i *)(s + 1 * pitch), d1_out);
+ _mm_storeu_si128((__m128i *)(s + 2 * pitch), d2_out);
+ _mm_storeu_si128((__m128i *)(s + 3 * pitch), d3_out);
+ _mm_storeu_si128((__m128i *)(s + 4 * pitch), d4_out);
+ _mm_storeu_si128((__m128i *)(s + 5 * pitch), d5_out);
+ _mm_storeu_si128((__m128i *)(s + 6 * pitch), d6_out);
+ _mm_storeu_si128((__m128i *)(s + 7 * pitch), d7_out);
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
new file mode 100644
index 0000000000..950465cf46
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_avx2.c
@@ -0,0 +1,294 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+
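+// Sign-extend the 8 int16_t quantizer parameters in *p to the 8 int32_t
+// lanes of *qp, preserving their order (the DC value stays in lane 0).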
+static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
+ const __m128i sign = _mm_srai_epi16(*p, 15);
+ const __m128i dc = _mm_unpacklo_epi16(*p, sign);
+ const __m128i ac = _mm_unpackhi_epi16(*p, sign);
+ *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
+}
+
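+// Duplicate the upper 128-bit lane of each parameter vector into both lanes.
+// After the first block of 8 coefficients (which contains the DC position),
+// every remaining coefficient is quantized with the AC values only.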
+static INLINE void update_qp(__m256i *qp) {
+ int i;
+ for (i = 0; i < 5; ++i) {
+ qp[i] = _mm256_permute2x128_si256(qp[i], qp[i], 0x11);
+ }
+}
+
+static INLINE void init_qp(const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr, const int16_t *dequant_ptr,
+ const int16_t *quant_shift_ptr, __m256i *qp,
+ int log_scale) {
+ const __m128i zbin = _mm_loadu_si128((const __m128i *)zbin_ptr);
+ const __m128i round = _mm_loadu_si128((const __m128i *)round_ptr);
+ const __m128i quant = _mm_loadu_si128((const __m128i *)quant_ptr);
+ const __m128i dequant = _mm_loadu_si128((const __m128i *)dequant_ptr);
+ const __m128i quant_shift = _mm_loadu_si128((const __m128i *)quant_shift_ptr);
+ init_one_qp(&zbin, &qp[0]);
+ init_one_qp(&round, &qp[1]);
+ init_one_qp(&quant, &qp[2]);
+ init_one_qp(&dequant, &qp[3]);
+ init_one_qp(&quant_shift, &qp[4]);
+ if (log_scale > 0) {
+ const __m256i rnd = _mm256_set1_epi32((int16_t)(1 << (log_scale - 1)));
+ qp[0] = _mm256_add_epi32(qp[0], rnd);
+ qp[0] = _mm256_srai_epi32(qp[0], log_scale);
+
+ qp[1] = _mm256_add_epi32(qp[1], rnd);
+ qp[1] = _mm256_srai_epi32(qp[1], log_scale);
+ }
+ // Subtracting 1 here eliminates a _mm256_cmpeq_epi32() instruction when
+ // calculating the zbin mask.
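+  // (abs_coeff >= zbin) is equivalent to (abs_coeff > zbin - 1), so a single
+  // _mm256_cmpgt_epi32() suffices.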
+ qp[0] = _mm256_sub_epi32(qp[0], _mm256_set1_epi32(1));
+}
+
+// Note:
+// Each 32-bit lane of *x is multiplied by the corresponding lane of *y
+// (8 parallel int32_t multiplications) and each product is right shifted by
+// 16. The resulting 8 int32_t values are returned.
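+//
+// Per-lane scalar equivalent for the non-negative inputs used here
+// (reference sketch):
+//   out[i] = (int32_t)(((int64_t)x[i] * (int64_t)y[i]) >> 16);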
+static INLINE __m256i mm256_mul_shift_epi32(const __m256i *x,
+ const __m256i *y) {
+ __m256i prod_lo = _mm256_mul_epi32(*x, *y);
+ __m256i prod_hi = _mm256_srli_epi64(*x, 32);
+ const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
+ prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
+
+ prod_lo = _mm256_srli_epi64(prod_lo, 16);
+ const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+ prod_lo = _mm256_and_si256(prod_lo, mask);
+ prod_hi = _mm256_srli_epi64(prod_hi, 16);
+
+ prod_hi = _mm256_slli_epi64(prod_hi, 32);
+ return _mm256_or_si256(prod_lo, prod_hi);
+}
+
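+// nz_mask lanes are -1 for nonzero quantized coefficients, so
+// iscan - nz_mask equals iscan + 1 in those lanes. Masking out the zero lanes
+// and keeping a running maximum yields 1 + the scan index of the last nonzero
+// coefficient, i.e. the eob count.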
+static AOM_FORCE_INLINE __m256i get_max_lane_eob(const int16_t *iscan_ptr,
+ __m256i eobmax,
+ __m256i nz_mask) {
+ const __m256i packed_nz_mask = _mm256_packs_epi32(nz_mask, nz_mask);
+ const __m256i packed_nz_mask_perm =
+ _mm256_permute4x64_epi64(packed_nz_mask, 0xD8);
+ const __m256i iscan =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)iscan_ptr));
+ const __m256i iscan_plus1 = _mm256_sub_epi16(iscan, packed_nz_mask_perm);
+ const __m256i nz_iscan = _mm256_and_si256(iscan_plus1, packed_nz_mask_perm);
+ return _mm256_max_epi16(eobmax, nz_iscan);
+}
+
+// Get the max eob from the lower 128 bits.
+static AOM_FORCE_INLINE uint16_t get_max_eob(__m256i eob) {
+ __m256i eob_s;
+ eob_s = _mm256_shuffle_epi32(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 0xe);
+ eob = _mm256_max_epi16(eob, eob_s);
+ eob_s = _mm256_shufflelo_epi16(eob, 1);
+ eob = _mm256_max_epi16(eob, eob_s);
+ return (uint16_t)_mm256_extract_epi16(eob, 0);
+}
+
+static AOM_FORCE_INLINE __m256i mm256_mul_shift_epi32_logscale(const __m256i *x,
+ const __m256i *y,
+ int log_scale) {
+ __m256i prod_lo = _mm256_mul_epi32(*x, *y);
+ __m256i prod_hi = _mm256_srli_epi64(*x, 32);
+ const __m256i mult_hi = _mm256_srli_epi64(*y, 32);
+ prod_hi = _mm256_mul_epi32(prod_hi, mult_hi);
+ prod_lo = _mm256_srli_epi64(prod_lo, 16 - log_scale);
+ const __m256i mask = _mm256_set_epi32(0, -1, 0, -1, 0, -1, 0, -1);
+ prod_lo = _mm256_and_si256(prod_lo, mask);
+ prod_hi = _mm256_srli_epi64(prod_hi, 16 - log_scale);
+ prod_hi = _mm256_slli_epi64(prod_hi, 32);
+ return _mm256_or_si256(prod_lo, prod_hi);
+}
+
+static AOM_FORCE_INLINE void quantize_logscale(
+ const __m256i *qp, const tran_low_t *coeff_ptr, const int16_t *iscan_ptr,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob, int log_scale) {
+ const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+ const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]);
+
+ if (UNLIKELY(_mm256_movemask_epi8(zbin_mask) == 0)) {
+ const __m256i zero = _mm256_setzero_si256();
+ _mm256_storeu_si256((__m256i *)qcoeff, zero);
+ _mm256_storeu_si256((__m256i *)dqcoeff, zero);
+ return;
+ }
+
+ const __m256i tmp_rnd =
+ _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask);
+ // const int64_t tmp2 = ((tmpw * quant_ptr[rc != 0]) >> 16) + tmpw;
+ const __m256i tmp = mm256_mul_shift_epi32_logscale(&tmp_rnd, &qp[2], 0);
+ const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd);
+ // const int abs_qcoeff = (int)((tmp2 * quant_shift_ptr[rc != 0]) >>
+ // (16 - log_scale + AOM_QM_BITS));
+ const __m256i abs_q =
+ mm256_mul_shift_epi32_logscale(&tmp2, &qp[4], log_scale);
+ const __m256i abs_dq =
+ _mm256_srli_epi32(_mm256_mullo_epi32(abs_q, qp[3]), log_scale);
+ const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+ const __m256i q = _mm256_sign_epi32(abs_q, coeff);
+ const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+
+ _mm256_storeu_si256((__m256i *)qcoeff, q);
+ _mm256_storeu_si256((__m256i *)dqcoeff, dq);
+
+ *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+}
+
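+// Per-coefficient scalar reference for quantize() (sketch; sign handling
+// omitted, qp[0] already holds zbin - 1):
+//   if (abs_coeff < zbin) { qcoeff = dqcoeff = 0; }
+//   else {
+//     tmp    = abs_coeff + round;
+//     tmp2   = ((tmp * quant) >> 16) + tmp;
+//     abs_q  = (tmp2 * quant_shift) >> 16;
+//     abs_dq = abs_q * dequant;
+//   }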
+static AOM_FORCE_INLINE void quantize(const __m256i *qp,
+ const tran_low_t *coeff_ptr,
+ const int16_t *iscan_ptr,
+ tran_low_t *qcoeff, tran_low_t *dqcoeff,
+ __m256i *eob) {
+ const __m256i coeff = _mm256_loadu_si256((const __m256i *)coeff_ptr);
+ const __m256i abs_coeff = _mm256_abs_epi32(coeff);
+ const __m256i zbin_mask = _mm256_cmpgt_epi32(abs_coeff, qp[0]);
+
+ if (UNLIKELY(_mm256_movemask_epi8(zbin_mask) == 0)) {
+ const __m256i zero = _mm256_setzero_si256();
+ _mm256_storeu_si256((__m256i *)qcoeff, zero);
+ _mm256_storeu_si256((__m256i *)dqcoeff, zero);
+ return;
+ }
+
+ const __m256i tmp_rnd =
+ _mm256_and_si256(_mm256_add_epi32(abs_coeff, qp[1]), zbin_mask);
+ const __m256i tmp = mm256_mul_shift_epi32(&tmp_rnd, &qp[2]);
+ const __m256i tmp2 = _mm256_add_epi32(tmp, tmp_rnd);
+ const __m256i abs_q = mm256_mul_shift_epi32(&tmp2, &qp[4]);
+ const __m256i abs_dq = _mm256_mullo_epi32(abs_q, qp[3]);
+ const __m256i nz_mask = _mm256_cmpgt_epi32(abs_q, _mm256_setzero_si256());
+ const __m256i q = _mm256_sign_epi32(abs_q, coeff);
+ const __m256i dq = _mm256_sign_epi32(abs_dq, coeff);
+
+ _mm256_storeu_si256((__m256i *)qcoeff, q);
+ _mm256_storeu_si256((__m256i *)dqcoeff, dq);
+
+ *eob = get_max_lane_eob(iscan_ptr, *eob, nz_mask);
+}
+
+void aom_highbd_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ (void)scan;
+ const int step = 8;
+
+ __m256i eob = _mm256_setzero_si256();
+ __m256i qp[5];
+
+ init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 0);
+
+ quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+
+ update_qp(qp);
+
+ while (n_coeffs > 0) {
+ quantize(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+ }
+
+ *eob_ptr = get_max_eob(eob);
+}
+
+void aom_highbd_quantize_b_32x32_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ (void)scan;
+ const unsigned int step = 8;
+
+ __m256i eob = _mm256_setzero_si256();
+ __m256i qp[5];
+ init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 1);
+
+ quantize_logscale(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob, 1);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+
+ update_qp(qp);
+
+ while (n_coeffs > 0) {
+ quantize_logscale(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob, 1);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+ }
+
+ *eob_ptr = get_max_eob(eob);
+}
+
+void aom_highbd_quantize_b_64x64_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ (void)scan;
+ const int step = 8;
+
+ __m256i eob = _mm256_setzero_si256();
+ __m256i qp[5];
+ init_qp(zbin_ptr, round_ptr, quant_ptr, dequant_ptr, quant_shift_ptr, qp, 2);
+
+ quantize_logscale(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob, 2);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+
+ update_qp(qp);
+
+ while (n_coeffs > 0) {
+ quantize_logscale(qp, coeff_ptr, iscan, qcoeff_ptr, dqcoeff_ptr, &eob, 2);
+
+ coeff_ptr += step;
+ qcoeff_ptr += step;
+ dqcoeff_ptr += step;
+ iscan += step;
+ n_coeffs -= step;
+ }
+
+ *eob_ptr = get_max_eob(eob);
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
new file mode 100644
index 0000000000..3b0c42c4f5
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -0,0 +1,208 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_mem/aom_mem.h"
+#include "aom_ports/mem.h"
+#include "config/aom_dsp_rtcd.h"
+
+void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
+ __m128i zbins[2];
+ __m128i nzbins[2];
+
+ zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1],
+ (int)zbin_ptr[0]);
+ zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);
+
+ nzbins[0] = _mm_setzero_si128();
+ nzbins[1] = _mm_setzero_si128();
+ nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
+ nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+ (void)scan;
+
+ memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
+
+ // Pre-scan pass
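+  // Walk the 4-coefficient groups from the end of the block. A group whose
+  // coefficients all lie strictly inside (-zbin, zbin) quantizes to zero, so
+  // non_zero_regs is trimmed until the first group that needs full
+  // quantization is found.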
+ for (i = ((int)count / 4) - 1; i >= 0; i--) {
+ __m128i coeffs, cmp1, cmp2;
+ int test;
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+ cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+ cmp1 = _mm_and_si128(cmp1, cmp2);
+ test = _mm_movemask_epi8(cmp1);
+ if (test == 0xffff)
+ non_zero_regs--;
+ else
+ break;
+ }
+
+ // Quantization pass:
+ for (i = 0; i < non_zero_regs; i++) {
+ __m128i coeffs, coeffs_sign, tmp1, tmp2;
+ int test;
+ int abs_coeff[4];
+ int coeff_sign[4];
+
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ coeffs_sign = _mm_srai_epi32(coeffs, 31);
+ coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
+ tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
+ tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
+ tmp1 = _mm_or_si128(tmp1, tmp2);
+ test = _mm_movemask_epi8(tmp1);
+ _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
+ _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
+
+ for (j = 0; j < 4; j++) {
+ if (test & (1 << (4 * j))) {
+ int k = 4 * i + j;
+ const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
+ const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
+ qcoeff_ptr[k] =
+ (int)(abs_qcoeff ^ (uint32_t)coeff_sign[j]) - coeff_sign[j];
+ dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
+ if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
+ }
+ }
+ }
+ *eob_ptr = eob_i + 1;
+}
+
+void aom_highbd_quantize_b_32x32_sse2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ __m128i zbins[2];
+ __m128i nzbins[2];
+ int idx = 0;
+ int idx_arr[1024];
+ int i, eob = -1;
+ const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
+ const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
+ (void)scan;
+ zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
+ zbins[1] = _mm_set1_epi32(zbin1_tmp);
+
+ nzbins[0] = _mm_setzero_si128();
+ nzbins[1] = _mm_setzero_si128();
+ nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
+ nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ // Pre-scan pass
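+  // Record the index of every coefficient that is not strictly inside the
+  // scaled (-zbin, zbin) interval; only those candidates are processed in
+  // the quantization pass below.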
+ for (i = 0; i < n_coeffs / 4; i++) {
+ __m128i coeffs, cmp1, cmp2;
+ int test;
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+ cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+ cmp1 = _mm_and_si128(cmp1, cmp2);
+ test = _mm_movemask_epi8(cmp1);
+ if (!(test & 0xf)) idx_arr[idx++] = i * 4;
+ if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
+ if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
+ if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
+ }
+
+  // Quantization pass: only process the coefficients selected in the
+  // pre-scan pass. Note: idx can be zero.
+ for (i = 0; i < idx; i++) {
+ const int rc = idx_arr[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
+ const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
+ qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+ if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
+ }
+ *eob_ptr = eob + 1;
+}
+
+void aom_highbd_quantize_b_64x64_sse2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ __m128i zbins[2];
+ __m128i nzbins[2];
+ int idx = 0;
+ int idx_arr[1024];
+ int i, eob = -1;
+ const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 2);
+ const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 2);
+ (void)scan;
+ zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
+ zbins[1] = _mm_set1_epi32(zbin1_tmp);
+
+ nzbins[0] = _mm_setzero_si128();
+ nzbins[1] = _mm_setzero_si128();
+ nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
+ nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ // Pre-scan pass
+ for (i = 0; i < n_coeffs / 4; i++) {
+ __m128i coeffs, cmp1, cmp2;
+ int test;
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+ cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+ cmp1 = _mm_and_si128(cmp1, cmp2);
+ test = _mm_movemask_epi8(cmp1);
+ if (!(test & 0xf)) idx_arr[idx++] = i * 4;
+ if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
+ if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
+ if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
+ }
+
+  // Quantization pass: only process the coefficients selected in the
+  // pre-scan pass. Note: idx can be zero.
+ for (i = 0; i < idx; i++) {
+ const int rc = idx_arr[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = AOMSIGN(coeff);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2);
+ const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 14);
+ qcoeff_ptr[rc] = (int)(abs_qcoeff ^ (uint32_t)coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 4;
+ if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
+ }
+ *eob_ptr = eob + 1;
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm
new file mode 100644
index 0000000000..03839b493c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_sad4d_sse2.asm
@@ -0,0 +1,344 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
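+; m1 holds the constant 0x00010001 in every dword. For unsigned 16-bit pixels,
+; |src - ref| == (src saturating-sub ref) | (ref saturating-sub src), and
+; pmaddwd against m1 sums adjacent 16-bit differences into 32-bit accumulators.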
+%macro HIGH_PROCESS_4x2x4 5-6 0
+ movh m0, [srcq +%2*2]
+%if %1 == 1
+ movu m4, [ref1q+%3*2]
+ movu m5, [ref2q+%3*2]
+ movu m6, [ref3q+%3*2]
+ movu m7, [ref4q+%3*2]
+ movhps m0, [srcq +%4*2]
+ movhps m4, [ref1q+%5*2]
+ movhps m5, [ref2q+%5*2]
+ movhps m6, [ref3q+%5*2]
+ movhps m7, [ref4q+%5*2]
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m4
+ psubusw m2, m5
+ psubusw m4, m0
+ psubusw m5, m0
+ por m4, m3
+ por m5, m2
+ pmaddwd m4, m1
+ pmaddwd m5, m1
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m6
+ psubusw m2, m7
+ psubusw m6, m0
+ psubusw m7, m0
+ por m6, m3
+ por m7, m2
+ pmaddwd m6, m1
+ pmaddwd m7, m1
+%else
+ movu m2, [ref1q+%3*2]
+ movhps m0, [srcq +%4*2]
+ movhps m2, [ref1q+%5*2]
+ mova m3, m0
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m4, m2
+
+ movu m2, [ref2q+%3*2]
+ mova m3, m0
+ movhps m2, [ref2q+%5*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m5, m2
+
+ movu m2, [ref3q+%3*2]
+ mova m3, m0
+ movhps m2, [ref3q+%5*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m6, m2
+
+ movu m2, [ref4q+%3*2]
+ mova m3, m0
+ movhps m2, [ref4q+%5*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m7, m2
+%endif
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*4]
+ lea ref1q, [ref1q+ref_strideq*4]
+ lea ref2q, [ref2q+ref_strideq*4]
+ lea ref3q, [ref3q+ref_strideq*4]
+ lea ref4q, [ref4q+ref_strideq*4]
+%endif
+%endmacro
+
+; HIGH_PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_8x2x4 5-6 0
+ ; 1st 8 px
+ mova m0, [srcq +%2*2]
+%if %1 == 1
+ movu m4, [ref1q+%3*2]
+ movu m5, [ref2q+%3*2]
+ movu m6, [ref3q+%3*2]
+ movu m7, [ref4q+%3*2]
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m4
+ psubusw m2, m5
+ psubusw m4, m0
+ psubusw m5, m0
+ por m4, m3
+ por m5, m2
+ pmaddwd m4, m1
+ pmaddwd m5, m1
+ mova m3, m0
+ mova m2, m0
+ psubusw m3, m6
+ psubusw m2, m7
+ psubusw m6, m0
+ psubusw m7, m0
+ por m6, m3
+ por m7, m2
+ pmaddwd m6, m1
+ pmaddwd m7, m1
+%else
+ mova m3, m0
+ movu m2, [ref1q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m4, m2
+ movu m2, [ref2q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m5, m2
+ movu m2, [ref3q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m6, m2
+ movu m2, [ref4q+%3*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m7, m2
+%endif
+
+ ; 2nd 8 px
+ mova m0, [srcq +(%4)*2]
+ mova m3, m0
+ movu m2, [ref1q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m4, m2
+ movu m2, [ref2q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m5, m2
+ movu m2, [ref3q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+ por m2, m3
+ mova m3, m0
+ pmaddwd m2, m1
+ paddd m6, m2
+ movu m2, [ref4q+(%5)*2]
+ psubusw m3, m2
+ psubusw m2, m0
+%if %6 == 1
+ lea srcq, [srcq +src_strideq*4]
+ lea ref1q, [ref1q+ref_strideq*4]
+ lea ref2q, [ref2q+ref_strideq*4]
+ lea ref3q, [ref3q+ref_strideq*4]
+ lea ref4q, [ref4q+ref_strideq*4]
+%endif
+ por m2, m3
+ pmaddwd m2, m1
+ paddd m7, m2
+%endmacro
+
+; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_16x2x4 5-6 0
+ HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8)
+ HIGH_PROCESS_8x2x4 0, %4, %5, (%4 + 8), (%5 + 8), %6
+%endmacro
+
+; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_32x2x4 5-6 0
+ HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16)
+ HIGH_PROCESS_16x2x4 0, %4, %5, (%4 + 16), (%5 + 16), %6
+%endmacro
+
+; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro HIGH_PROCESS_64x2x4 5-6 0
+ HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32)
+ HIGH_PROCESS_32x2x4 0, %4, %5, (%4 + 32), (%5 + 32), %6
+%endmacro
+
+; void aom_highbd_sadNxNx4d_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref[4], int ref_stride,
+; uint32_t res[4]);
+; Macro Arguments:
+; 1: Width
+; 2: Height
+; 3: If 0, then normal sad, if 2, then skip every other row
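+;      In the skip case the strides are doubled, half the rows are processed,
+;      and the final SAD is doubled (pslld m4, 1) to approximate the full sum.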
+%macro HIGH_SADNXN4D 2-3 0
+%if %3 == 0 ; normal sad
+%if AOM_ARCH_X86_64
+cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+ res, ref2, ref3, ref4
+%else
+cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+ ref2, ref3, ref4
+%endif ; AOM_ARCH_X86_64
+%else ; %3 == 2, downsample
+%if AOM_ARCH_X86_64
+cglobal highbd_sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
+ res, ref2, ref3, ref4
+%else
+cglobal highbd_sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
+ ref2, ref3, ref4
+%endif ; AOM_ARCH_X86_64
+%endif ; sad/skip
+
+; set m1
+ push srcq
+ mov srcd, 0x00010001
+ movd m1, srcd
+ pshufd m1, m1, 0x0
+ pop srcq
+
+%if %3 == 2 ; skip rows
+ lea src_strided, [2*src_strided]
+ lea ref_strided, [2*ref_strided]
+%endif ; skip rows
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+ mov ref2q, [ref1q+gprsize*1]
+ mov ref3q, [ref1q+gprsize*2]
+ mov ref4q, [ref1q+gprsize*3]
+ mov ref1q, [ref1q+gprsize*0]
+
+; convert byte pointers to short pointers
+ shl srcq, 1
+ shl ref2q, 1
+ shl ref3q, 1
+ shl ref4q, 1
+ shl ref1q, 1
+
+ HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
+%if %3 == 2 ; Downsampling by two
+%define num_rep (%2-8)/4
+%else
+%define num_rep (%2-4)/2
+%endif
+%rep num_rep
+ HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
+%endrep
+%undef num_rep
+ HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
+  ; N.B. HIGH_PROCESS outputs dwords (32 bits), so in high bit depth even the
+  ; smallest width (4) needs 128 bits, i.e. an XMM register
+ movhlps m0, m4
+ movhlps m1, m5
+ movhlps m2, m6
+ movhlps m3, m7
+ paddd m4, m0
+ paddd m5, m1
+ paddd m6, m2
+ paddd m7, m3
+ punpckldq m4, m5
+ punpckldq m6, m7
+ movhlps m0, m4
+ movhlps m1, m6
+ paddd m4, m0
+ paddd m6, m1
+ punpcklqdq m4, m6
+%if %3 == 2 ; skip rows
+ pslld m4, 1
+%endif
+ movifnidn r4, r4mp
+ movu [r4], m4
+ RET
+%endmacro
+
+
+INIT_XMM sse2
+HIGH_SADNXN4D 64, 64
+HIGH_SADNXN4D 64, 32
+HIGH_SADNXN4D 32, 64
+HIGH_SADNXN4D 32, 32
+HIGH_SADNXN4D 32, 16
+HIGH_SADNXN4D 16, 32
+HIGH_SADNXN4D 16, 16
+HIGH_SADNXN4D 16, 8
+HIGH_SADNXN4D 8, 16
+HIGH_SADNXN4D 8, 8
+HIGH_SADNXN4D 8, 4
+HIGH_SADNXN4D 4, 8
+HIGH_SADNXN4D 4, 4
+HIGH_SADNXN4D 4, 16
+HIGH_SADNXN4D 16, 4
+HIGH_SADNXN4D 8, 32
+HIGH_SADNXN4D 32, 8
+HIGH_SADNXN4D 16, 64
+HIGH_SADNXN4D 64, 16
+
+HIGH_SADNXN4D 64, 64, 2
+HIGH_SADNXN4D 64, 32, 2
+HIGH_SADNXN4D 32, 64, 2
+HIGH_SADNXN4D 32, 32, 2
+HIGH_SADNXN4D 32, 16, 2
+HIGH_SADNXN4D 16, 32, 2
+HIGH_SADNXN4D 16, 16, 2
+HIGH_SADNXN4D 16, 8, 2
+HIGH_SADNXN4D 8, 16, 2
+HIGH_SADNXN4D 8, 8, 2
+HIGH_SADNXN4D 4, 8, 2
+HIGH_SADNXN4D 4, 16, 2
+HIGH_SADNXN4D 8, 32, 2
+HIGH_SADNXN4D 32, 8, 2
+HIGH_SADNXN4D 16, 64, 2
+HIGH_SADNXN4D 64, 16, 2
+
+; Current code cannot handle the case when the height is downsampled to 2
+; HIGH_SADNXN4D 16, 4, 2
+; HIGH_SADNXN4D 8, 4, 2
+; HIGH_SADNXN4D 4, 4, 2
diff --git a/third_party/aom/aom_dsp/x86/highbd_sad_avx2.c b/third_party/aom/aom_dsp/x86/highbd_sad_avx2.c
new file mode 100644
index 0000000000..6c78eeeefb
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_sad_avx2.c
@@ -0,0 +1,720 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_ports/mem.h"
+
+// SAD
+static INLINE unsigned int get_sad_from_mm256_epi32(const __m256i *v) {
+  // Input: 8 32-bit partial sums
+ __m128i lo128, hi128;
+ __m256i u = _mm256_srli_si256(*v, 8);
+ u = _mm256_add_epi32(u, *v);
+
+ // 4 32-bit summation
+ hi128 = _mm256_extracti128_si256(u, 1);
+ lo128 = _mm256_castsi256_si128(u);
+ lo128 = _mm_add_epi32(hi128, lo128);
+
+ // 2 32-bit summation
+ hi128 = _mm_srli_si128(lo128, 4);
+ lo128 = _mm_add_epi32(lo128, hi128);
+
+ return (unsigned int)_mm_cvtsi128_si32(lo128);
+}
+
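+// Accumulate sum of |s[i] - r[i]| over 4 rows of 16 pixels. With pixel values
+// of at most 12 bits, the 16-bit differences and the sum of four of them
+// cannot overflow, so widening to 32 bits is deferred to the last step.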
+static INLINE void highbd_sad16x4_core_avx2(__m256i *s, __m256i *r,
+ __m256i *sad_acc) {
+ const __m256i zero = _mm256_setzero_si256();
+ int i;
+ for (i = 0; i < 4; i++) {
+ s[i] = _mm256_sub_epi16(s[i], r[i]);
+ s[i] = _mm256_abs_epi16(s[i]);
+ }
+
+ s[0] = _mm256_add_epi16(s[0], s[1]);
+ s[0] = _mm256_add_epi16(s[0], s[2]);
+ s[0] = _mm256_add_epi16(s[0], s[3]);
+
+ r[0] = _mm256_unpacklo_epi16(s[0], zero);
+ r[1] = _mm256_unpackhi_epi16(s[0], zero);
+
+ r[0] = _mm256_add_epi32(r[0], r[1]);
+ *sad_acc = _mm256_add_epi32(*sad_acc, r[0]);
+}
+
+// If sec_ptr = 0, calculate regular SAD. Otherwise, calculate average SAD.
+static INLINE void sad16x4(const uint16_t *src_ptr, int src_stride,
+ const uint16_t *ref_ptr, int ref_stride,
+ const uint16_t *sec_ptr, __m256i *sad_acc) {
+ __m256i s[4], r[4];
+ s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
+ s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
+ s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 2 * src_stride));
+ s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 3 * src_stride));
+
+ r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
+ r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
+ r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 2 * ref_stride));
+ r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 3 * ref_stride));
+
+ if (sec_ptr) {
+ r[0] = _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
+ r[1] = _mm256_avg_epu16(
+ r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
+ r[2] = _mm256_avg_epu16(
+ r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
+ r[3] = _mm256_avg_epu16(
+ r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
+ }
+ highbd_sad16x4_core_avx2(s, r, sad_acc);
+}
+
+static AOM_FORCE_INLINE unsigned int aom_highbd_sad16xN_avx2(int N,
+ const uint8_t *src,
+ int src_stride,
+ const uint8_t *ref,
+ int ref_stride) {
+ const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);
+ const uint16_t *ref_ptr = CONVERT_TO_SHORTPTR(ref);
+ int i;
+ __m256i sad = _mm256_setzero_si256();
+ for (i = 0; i < N; i += 4) {
+ sad16x4(src_ptr, src_stride, ref_ptr, ref_stride, NULL, &sad);
+ src_ptr += src_stride << 2;
+ ref_ptr += ref_stride << 2;
+ }
+ return (unsigned int)get_sad_from_mm256_epi32(&sad);
+}
+
+static void sad32x4(const uint16_t *src_ptr, int src_stride,
+ const uint16_t *ref_ptr, int ref_stride,
+ const uint16_t *sec_ptr, __m256i *sad_acc) {
+ __m256i s[4], r[4];
+ int row_sections = 0;
+
+ while (row_sections < 2) {
+ s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
+ s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
+ s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride));
+ s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + src_stride + 16));
+
+ r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
+ r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
+ r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride));
+ r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + ref_stride + 16));
+
+ if (sec_ptr) {
+ r[0] =
+ _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
+ r[1] = _mm256_avg_epu16(
+ r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
+ r[2] = _mm256_avg_epu16(
+ r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
+ r[3] = _mm256_avg_epu16(
+ r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
+ sec_ptr += 32 << 1;
+ }
+ highbd_sad16x4_core_avx2(s, r, sad_acc);
+
+ row_sections += 1;
+ src_ptr += src_stride << 1;
+ ref_ptr += ref_stride << 1;
+ }
+}
+
+static AOM_FORCE_INLINE unsigned int aom_highbd_sad32xN_avx2(int N,
+ const uint8_t *src,
+ int src_stride,
+ const uint8_t *ref,
+ int ref_stride) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ const int left_shift = 2;
+ int i;
+
+ for (i = 0; i < N; i += 4) {
+ sad32x4(srcp, src_stride, refp, ref_stride, NULL, &sad);
+ srcp += src_stride << left_shift;
+ refp += ref_stride << left_shift;
+ }
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+static void sad64x2(const uint16_t *src_ptr, int src_stride,
+ const uint16_t *ref_ptr, int ref_stride,
+ const uint16_t *sec_ptr, __m256i *sad_acc) {
+ __m256i s[4], r[4];
+ int i;
+ for (i = 0; i < 2; i++) {
+ s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
+ s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
+ s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32));
+ s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48));
+
+ r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
+ r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
+ r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32));
+ r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48));
+ if (sec_ptr) {
+ r[0] =
+ _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
+ r[1] = _mm256_avg_epu16(
+ r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
+ r[2] = _mm256_avg_epu16(
+ r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
+ r[3] = _mm256_avg_epu16(
+ r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
+ sec_ptr += 64;
+ }
+ highbd_sad16x4_core_avx2(s, r, sad_acc);
+ src_ptr += src_stride;
+ ref_ptr += ref_stride;
+ }
+}
+
+static AOM_FORCE_INLINE unsigned int aom_highbd_sad64xN_avx2(int N,
+ const uint8_t *src,
+ int src_stride,
+ const uint8_t *ref,
+ int ref_stride) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ const int left_shift = 1;
+ int i;
+ for (i = 0; i < N; i += 2) {
+ sad64x2(srcp, src_stride, refp, ref_stride, NULL, &sad);
+ srcp += src_stride << left_shift;
+ refp += ref_stride << left_shift;
+ }
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+static void sad128x1(const uint16_t *src_ptr, const uint16_t *ref_ptr,
+ const uint16_t *sec_ptr, __m256i *sad_acc) {
+ __m256i s[4], r[4];
+ int i;
+ for (i = 0; i < 2; i++) {
+ s[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
+ s[1] = _mm256_loadu_si256((const __m256i *)(src_ptr + 16));
+ s[2] = _mm256_loadu_si256((const __m256i *)(src_ptr + 32));
+ s[3] = _mm256_loadu_si256((const __m256i *)(src_ptr + 48));
+ r[0] = _mm256_loadu_si256((const __m256i *)ref_ptr);
+ r[1] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 16));
+ r[2] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 32));
+ r[3] = _mm256_loadu_si256((const __m256i *)(ref_ptr + 48));
+ if (sec_ptr) {
+ r[0] =
+ _mm256_avg_epu16(r[0], _mm256_loadu_si256((const __m256i *)sec_ptr));
+ r[1] = _mm256_avg_epu16(
+ r[1], _mm256_loadu_si256((const __m256i *)(sec_ptr + 16)));
+ r[2] = _mm256_avg_epu16(
+ r[2], _mm256_loadu_si256((const __m256i *)(sec_ptr + 32)));
+ r[3] = _mm256_avg_epu16(
+ r[3], _mm256_loadu_si256((const __m256i *)(sec_ptr + 48)));
+ sec_ptr += 64;
+ }
+ highbd_sad16x4_core_avx2(s, r, sad_acc);
+ src_ptr += 64;
+ ref_ptr += 64;
+ }
+}
+
+static AOM_FORCE_INLINE unsigned int aom_highbd_sad128xN_avx2(
+ int N, const uint8_t *src, int src_stride, const uint8_t *ref,
+ int ref_stride) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ int row = 0;
+ while (row < N) {
+ sad128x1(srcp, refp, NULL, &sad);
+ srcp += src_stride;
+ refp += ref_stride;
+ row++;
+ }
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+#define HIGHBD_SADMXN_AVX2(m, n) \
+ unsigned int aom_highbd_sad##m##x##n##_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return aom_highbd_sad##m##xN_avx2(n, src, src_stride, ref, ref_stride); \
+ }
+
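+// The "skip" variants compute the SAD over every other row (strides doubled,
+// half the rows) and scale the result by 2 to approximate the full SAD.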
+#define HIGHBD_SAD_SKIP_MXN_AVX2(m, n) \
+ unsigned int aom_highbd_sad_skip_##m##x##n##_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, \
+ int ref_stride) { \
+ return 2 * aom_highbd_sad##m##xN_avx2((n / 2), src, 2 * src_stride, ref, \
+ 2 * ref_stride); \
+ }
+
+HIGHBD_SADMXN_AVX2(16, 4)
+HIGHBD_SADMXN_AVX2(16, 8)
+HIGHBD_SADMXN_AVX2(16, 16)
+HIGHBD_SADMXN_AVX2(16, 32)
+HIGHBD_SADMXN_AVX2(16, 64)
+
+HIGHBD_SADMXN_AVX2(32, 8)
+HIGHBD_SADMXN_AVX2(32, 16)
+HIGHBD_SADMXN_AVX2(32, 32)
+HIGHBD_SADMXN_AVX2(32, 64)
+
+HIGHBD_SADMXN_AVX2(64, 16)
+HIGHBD_SADMXN_AVX2(64, 32)
+HIGHBD_SADMXN_AVX2(64, 64)
+HIGHBD_SADMXN_AVX2(64, 128)
+
+HIGHBD_SADMXN_AVX2(128, 64)
+HIGHBD_SADMXN_AVX2(128, 128)
+
+HIGHBD_SAD_SKIP_MXN_AVX2(16, 8)
+HIGHBD_SAD_SKIP_MXN_AVX2(16, 16)
+HIGHBD_SAD_SKIP_MXN_AVX2(16, 32)
+HIGHBD_SAD_SKIP_MXN_AVX2(16, 64)
+
+HIGHBD_SAD_SKIP_MXN_AVX2(32, 8)
+HIGHBD_SAD_SKIP_MXN_AVX2(32, 16)
+HIGHBD_SAD_SKIP_MXN_AVX2(32, 32)
+HIGHBD_SAD_SKIP_MXN_AVX2(32, 64)
+
+HIGHBD_SAD_SKIP_MXN_AVX2(64, 16)
+HIGHBD_SAD_SKIP_MXN_AVX2(64, 32)
+HIGHBD_SAD_SKIP_MXN_AVX2(64, 64)
+HIGHBD_SAD_SKIP_MXN_AVX2(64, 128)
+
+HIGHBD_SAD_SKIP_MXN_AVX2(128, 64)
+HIGHBD_SAD_SKIP_MXN_AVX2(128, 128)
+
+unsigned int aom_highbd_sad16x4_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
+ sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad);
+
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+unsigned int aom_highbd_sad16x8_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
+
+ sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad);
+
+ // Next 4 rows
+ srcp += src_stride << 2;
+ refp += ref_stride << 2;
+ secp += 64;
+ sad16x4(srcp, src_stride, refp, ref_stride, secp, &sad);
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+unsigned int aom_highbd_sad16x16_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ const int left_shift = 3;
+ uint32_t sum = aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ src += src_stride << left_shift;
+ ref += ref_stride << left_shift;
+ second_pred += 16 << left_shift;
+ sum += aom_highbd_sad16x8_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ return sum;
+}
+
+unsigned int aom_highbd_sad16x32_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ const int left_shift = 4;
+ uint32_t sum = aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ src += src_stride << left_shift;
+ ref += ref_stride << left_shift;
+ second_pred += 16 << left_shift;
+ sum += aom_highbd_sad16x16_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ return sum;
+}
+
+unsigned int aom_highbd_sad16x64_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ const int left_shift = 5;
+ uint32_t sum = aom_highbd_sad16x32_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ src += src_stride << left_shift;
+ ref += ref_stride << left_shift;
+ second_pred += 16 << left_shift;
+ sum += aom_highbd_sad16x32_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ return sum;
+}
+
+unsigned int aom_highbd_sad32x8_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
+ const int left_shift = 2;
+ int row_section = 0;
+
+ while (row_section < 2) {
+ sad32x4(srcp, src_stride, refp, ref_stride, secp, &sad);
+ srcp += src_stride << left_shift;
+ refp += ref_stride << left_shift;
+ secp += 32 << left_shift;
+ row_section += 1;
+ }
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+unsigned int aom_highbd_sad32x16_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
+ const int left_shift = 2;
+ int row_section = 0;
+
+ while (row_section < 4) {
+ sad32x4(srcp, src_stride, refp, ref_stride, secp, &sad);
+ srcp += src_stride << left_shift;
+ refp += ref_stride << left_shift;
+ secp += 32 << left_shift;
+ row_section += 1;
+ }
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+unsigned int aom_highbd_sad32x32_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ const int left_shift = 4;
+ uint32_t sum = aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ src += src_stride << left_shift;
+ ref += ref_stride << left_shift;
+ second_pred += 32 << left_shift;
+ sum += aom_highbd_sad32x16_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ return sum;
+}
+
+unsigned int aom_highbd_sad32x64_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ const int left_shift = 5;
+ uint32_t sum = aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ src += src_stride << left_shift;
+ ref += ref_stride << left_shift;
+ second_pred += 32 << left_shift;
+ sum += aom_highbd_sad32x32_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ return sum;
+}
+
+unsigned int aom_highbd_sad64x16_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
+ const int left_shift = 1;
+ int row_section = 0;
+
+ while (row_section < 8) {
+ sad64x2(srcp, src_stride, refp, ref_stride, secp, &sad);
+ srcp += src_stride << left_shift;
+ refp += ref_stride << left_shift;
+ secp += 64 << left_shift;
+ row_section += 1;
+ }
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+unsigned int aom_highbd_sad64x32_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
+ const int left_shift = 1;
+ int row_section = 0;
+
+ while (row_section < 16) {
+ sad64x2(srcp, src_stride, refp, ref_stride, secp, &sad);
+ srcp += src_stride << left_shift;
+ refp += ref_stride << left_shift;
+ secp += 64 << left_shift;
+ row_section += 1;
+ }
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+unsigned int aom_highbd_sad64x64_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ const int left_shift = 5;
+ uint32_t sum = aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ src += src_stride << left_shift;
+ ref += ref_stride << left_shift;
+ second_pred += 64 << left_shift;
+ sum += aom_highbd_sad64x32_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ return sum;
+}
+
+unsigned int aom_highbd_sad64x128_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ const int left_shift = 6;
+ uint32_t sum = aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ src += src_stride << left_shift;
+ ref += ref_stride << left_shift;
+ second_pred += 64 << left_shift;
+ sum += aom_highbd_sad64x64_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ return sum;
+}
+
+unsigned int aom_highbd_sad128x64_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ __m256i sad = _mm256_setzero_si256();
+ uint16_t *srcp = CONVERT_TO_SHORTPTR(src);
+ uint16_t *refp = CONVERT_TO_SHORTPTR(ref);
+ uint16_t *secp = CONVERT_TO_SHORTPTR(second_pred);
+ int row = 0;
+ while (row < 64) {
+ sad128x1(srcp, refp, secp, &sad);
+ srcp += src_stride;
+ refp += ref_stride;
+ secp += 16 << 3;
+ row += 1;
+ }
+ return get_sad_from_mm256_epi32(&sad);
+}
+
+unsigned int aom_highbd_sad128x128_avg_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred) {
+ unsigned int sum;
+ const int left_shift = 6;
+
+ sum = aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ src += src_stride << left_shift;
+ ref += ref_stride << left_shift;
+ second_pred += 128 << left_shift;
+ sum += aom_highbd_sad128x64_avg_avx2(src, src_stride, ref, ref_stride,
+ second_pred);
+ return sum;
+}
+
+// SAD 4D
+// Combine the 4 __m256i input vectors v into uint32_t res[4]
+static INLINE void get_4d_sad_from_mm256_epi32(const __m256i *v,
+ uint32_t *res) {
+ __m256i u0, u1, u2, u3;
+ const __m256i mask = yy_set1_64_from_32i(~0);
+ __m128i sad;
+
+ // 8 32-bit summation
+ u0 = _mm256_srli_si256(v[0], 4);
+ u1 = _mm256_srli_si256(v[1], 4);
+ u2 = _mm256_srli_si256(v[2], 4);
+ u3 = _mm256_srli_si256(v[3], 4);
+
+ u0 = _mm256_add_epi32(u0, v[0]);
+ u1 = _mm256_add_epi32(u1, v[1]);
+ u2 = _mm256_add_epi32(u2, v[2]);
+ u3 = _mm256_add_epi32(u3, v[3]);
+
+ u0 = _mm256_and_si256(u0, mask);
+ u1 = _mm256_and_si256(u1, mask);
+ u2 = _mm256_and_si256(u2, mask);
+ u3 = _mm256_and_si256(u3, mask);
+ // 4 32-bit summation, evenly positioned
+
+ u1 = _mm256_slli_si256(u1, 4);
+ u3 = _mm256_slli_si256(u3, 4);
+
+ u0 = _mm256_or_si256(u0, u1);
+ u2 = _mm256_or_si256(u2, u3);
+ // 8 32-bit summation, interleaved
+
+ u1 = _mm256_unpacklo_epi64(u0, u2);
+ u3 = _mm256_unpackhi_epi64(u0, u2);
+
+ u0 = _mm256_add_epi32(u1, u3);
+ sad = _mm_add_epi32(_mm256_extractf128_si256(u0, 1),
+ _mm256_castsi256_si128(u0));
+ _mm_storeu_si128((__m128i *)res, sad);
+}
+
+static void convert_pointers(const uint8_t *const ref8[],
+ const uint16_t *ref[]) {
+ ref[0] = CONVERT_TO_SHORTPTR(ref8[0]);
+ ref[1] = CONVERT_TO_SHORTPTR(ref8[1]);
+ ref[2] = CONVERT_TO_SHORTPTR(ref8[2]);
+ ref[3] = CONVERT_TO_SHORTPTR(ref8[3]);
+}
+
+static void init_sad(__m256i *s) {
+ s[0] = _mm256_setzero_si256();
+ s[1] = _mm256_setzero_si256();
+ s[2] = _mm256_setzero_si256();
+ s[3] = _mm256_setzero_si256();
+}
+
+static AOM_FORCE_INLINE void aom_highbd_sadMxNxD_avx2(
+ int M, int N, int D, const uint8_t *src, int src_stride,
+ const uint8_t *const ref_array[4], int ref_stride, uint32_t sad_array[4]) {
+ __m256i sad_vec[4];
+ const uint16_t *refp[4];
+ const uint16_t *keep = CONVERT_TO_SHORTPTR(src);
+ const uint16_t *srcp;
+ const int shift_for_rows = (M < 128) + (M < 64);
+ const int row_units = 1 << shift_for_rows;
+ int i, r;
+
+ init_sad(sad_vec);
+ convert_pointers(ref_array, refp);
+
+ for (i = 0; i < D; ++i) {
+ srcp = keep;
+ for (r = 0; r < N; r += row_units) {
+ if (M == 128) {
+ sad128x1(srcp, refp[i], NULL, &sad_vec[i]);
+ } else if (M == 64) {
+ sad64x2(srcp, src_stride, refp[i], ref_stride, NULL, &sad_vec[i]);
+ } else if (M == 32) {
+ sad32x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
+ } else if (M == 16) {
+ sad16x4(srcp, src_stride, refp[i], ref_stride, 0, &sad_vec[i]);
+ } else {
+ assert(0);
+ }
+ srcp += src_stride << shift_for_rows;
+ refp[i] += ref_stride << shift_for_rows;
+ }
+ }
+ get_4d_sad_from_mm256_epi32(sad_vec, sad_array);
+}
+
+#define HIGHBD_SAD_MXNX4D_AVX2(m, n) \
+ void aom_highbd_sad##m##x##n##x4d_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ aom_highbd_sadMxNxD_avx2(m, n, 4, src, src_stride, ref_array, ref_stride, \
+ sad_array); \
+ }
+#define HIGHBD_SAD_SKIP_MXNX4D_AVX2(m, n) \
+ void aom_highbd_sad_skip_##m##x##n##x4d_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ aom_highbd_sadMxNxD_avx2(m, (n / 2), 4, src, 2 * src_stride, ref_array, \
+ 2 * ref_stride, sad_array); \
+ sad_array[0] <<= 1; \
+ sad_array[1] <<= 1; \
+ sad_array[2] <<= 1; \
+ sad_array[3] <<= 1; \
+ }
+#define HIGHBD_SAD_MXNX3D_AVX2(m, n) \
+ void aom_highbd_sad##m##x##n##x3d_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ aom_highbd_sadMxNxD_avx2(m, n, 3, src, src_stride, ref_array, ref_stride, \
+ sad_array); \
+ }
+
+HIGHBD_SAD_MXNX4D_AVX2(16, 4)
+HIGHBD_SAD_MXNX4D_AVX2(16, 8)
+HIGHBD_SAD_MXNX4D_AVX2(16, 16)
+HIGHBD_SAD_MXNX4D_AVX2(16, 32)
+HIGHBD_SAD_MXNX4D_AVX2(16, 64)
+
+HIGHBD_SAD_MXNX4D_AVX2(32, 8)
+HIGHBD_SAD_MXNX4D_AVX2(32, 16)
+HIGHBD_SAD_MXNX4D_AVX2(32, 32)
+HIGHBD_SAD_MXNX4D_AVX2(32, 64)
+
+HIGHBD_SAD_MXNX4D_AVX2(64, 16)
+HIGHBD_SAD_MXNX4D_AVX2(64, 32)
+HIGHBD_SAD_MXNX4D_AVX2(64, 64)
+HIGHBD_SAD_MXNX4D_AVX2(64, 128)
+
+HIGHBD_SAD_MXNX4D_AVX2(128, 64)
+HIGHBD_SAD_MXNX4D_AVX2(128, 128)
+
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 8)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 16)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 32)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(16, 64)
+
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 8)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 16)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 32)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(32, 64)
+
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 16)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 32)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 64)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(64, 128)
+
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(128, 64)
+HIGHBD_SAD_SKIP_MXNX4D_AVX2(128, 128)
+
+HIGHBD_SAD_MXNX3D_AVX2(16, 4)
+HIGHBD_SAD_MXNX3D_AVX2(16, 8)
+HIGHBD_SAD_MXNX3D_AVX2(16, 16)
+HIGHBD_SAD_MXNX3D_AVX2(16, 32)
+HIGHBD_SAD_MXNX3D_AVX2(16, 64)
+
+HIGHBD_SAD_MXNX3D_AVX2(32, 8)
+HIGHBD_SAD_MXNX3D_AVX2(32, 16)
+HIGHBD_SAD_MXNX3D_AVX2(32, 32)
+HIGHBD_SAD_MXNX3D_AVX2(32, 64)
+
+HIGHBD_SAD_MXNX3D_AVX2(64, 16)
+HIGHBD_SAD_MXNX3D_AVX2(64, 32)
+HIGHBD_SAD_MXNX3D_AVX2(64, 64)
+HIGHBD_SAD_MXNX3D_AVX2(64, 128)
+
+HIGHBD_SAD_MXNX3D_AVX2(128, 64)
+HIGHBD_SAD_MXNX3D_AVX2(128, 128)
diff --git a/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm
new file mode 100644
index 0000000000..3dc4e4e0a2
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_sad_sse2.asm
@@ -0,0 +1,524 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; Macro Arguments
+; Arg 1: Width
+; Arg 2: Height
+; Arg 3: Number of general purpose registers: 5 or 7 (must match the
+;        %3 == 5 / %3 == 7 branches below)
+; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
+; Arg 5: Number of xmm registers. 8xh needs 8, others only need 7
+%macro HIGH_SAD_FN 4-5 7
+%if %4 == 0
+%if %3 == 5
+cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%elif %4 == 1 ; avg
+%if %3 == 5
+cglobal highbd_sad%1x%2_avg, 5, 1 + %3, %5, src, src_stride, ref, ref_stride, \
+ second_pred, n_rows
+%else ; %3 == 7
+cglobal highbd_sad%1x%2_avg, 5, AOM_ARCH_X86_64 + %3, %5, src, src_stride, \
+ ref, ref_stride, \
+ second_pred, \
+ src_stride3, ref_stride3
+%if AOM_ARCH_X86_64
+%define n_rowsd r7d
+%else ; x86-32
+%define n_rowsd dword r0m
+%endif ; x86-32/64
+%endif ; %3 == 5/7
+%else ; %4 == 2, skip rows
+%if %3 == 5
+cglobal highbd_sad_skip_%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal highbd_sad_skip_%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+%endif ; sad/avg/skip
+%if %4 == 2 ; double the stride if we are skipping rows
+ lea src_strided, [src_strided*2]
+ lea ref_strided, [ref_strided*2]
+%endif
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+%if %3 == 7
+ lea src_stride3q, [src_strideq*3]
+ lea ref_stride3q, [ref_strideq*3]
+%endif ; %3 == 7
+; convert src, ref & second_pred to short ptrs (from byte ptrs)
+ shl srcq, 1
+ shl refq, 1
+%if %4 == 1
+ shl second_predq, 1
+%endif
+%endmacro
+
+; unsigned int aom_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
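+; For each 8-pixel chunk the loop computes |src - ref| with the unsigned
+; saturating-subtract / OR trick, accumulates the 16-bit differences, and
+; widens to 32 bits before adding into the m0 accumulator.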
+%macro HIGH_SAD64XN 1-2 0
+ HIGH_SAD_FN 64, %1, 5, %2
+%if %2 == 2 ; skip rows, so divide number of rows by 2
+ mov n_rowsd, %1/2
+%else
+ mov n_rowsd, %1
+%endif
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ ; first half of each row
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1
+ psubusw m1, [srcq]
+ por m1, m5
+ mova m5, [srcq+16]
+ psubusw m5, m2
+ psubusw m2, [srcq+16]
+ por m2, m5
+ mova m5, [srcq+32]
+ psubusw m5, m3
+ psubusw m3, [srcq+32]
+ por m3, m5
+ mova m5, [srcq+48]
+ psubusw m5, m4
+ psubusw m4, [srcq+48]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ paddd m0, m1
+ paddd m0, m3
+ ; second half of each row
+ movu m1, [refq+64]
+ movu m2, [refq+80]
+ movu m3, [refq+96]
+ movu m4, [refq+112]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq+64]
+ psubusw m5, m1
+ psubusw m1, [srcq+64]
+ por m1, m5
+ mova m5, [srcq+80]
+ psubusw m5, m2
+ psubusw m2, [srcq+80]
+ por m2, m5
+ mova m5, [srcq+96]
+ psubusw m5, m3
+ psubusw m3, [srcq+96]
+ por m3, m5
+ mova m5, [srcq+112]
+ psubusw m5, m4
+ psubusw m4, [srcq+112]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*2]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*2]
+ paddd m0, m3
+
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
+HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
+HIGH_SAD64XN 16 ; highbd_sad64x16_sse2
+HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
+HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
+HIGH_SAD64XN 16, 1 ; highbd_sad64x16_avg_sse2
+HIGH_SAD64XN 64, 2 ; highbd_sad_skip_64x64_sse2
+HIGH_SAD64XN 32, 2 ; highbd_sad_skip_64x32_sse2
+HIGH_SAD64XN 16, 2 ; highbd_sad_skip_64x16_sse2
+
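The inner loops above compute |src - ref| on unsigned 16-bit lanes with two
saturating subtractions followed by an OR (one of the two differences is always
zero). A small intrinsics sketch of the same idiom (helper name illustrative):

  #include <emmintrin.h>

  /* |a - b| per unsigned 16-bit lane, mirroring the psubusw/psubusw/por
   * sequence used in the SAD loops above. */
  static __m128i abs_diff_epu16(__m128i a, __m128i b) {
    const __m128i d0 = _mm_subs_epu16(a, b);  /* max(a - b, 0) */
    const __m128i d1 = _mm_subs_epu16(b, a);  /* max(b - a, 0) */
    return _mm_or_si128(d0, d1);
  }
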
+; unsigned int aom_highbd_sad32x{8,16,32,64}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD32XN 1-2 0
+ HIGH_SAD_FN 32, %1, 5, %2
+%if %2 == 2 ; skip rows, so divide number of rows by 2
+ mov n_rowsd, %1/2
+%else
+ mov n_rowsd, %1
+%endif
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1
+ psubusw m1, [srcq]
+ por m1, m5
+ mova m5, [srcq+16]
+ psubusw m5, m2
+ psubusw m2, [srcq+16]
+ por m2, m5
+ mova m5, [srcq+32]
+ psubusw m5, m3
+ psubusw m3, [srcq+32]
+ por m3, m5
+ mova m5, [srcq+48]
+ psubusw m5, m4
+ psubusw m4, [srcq+48]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*2]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*2]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
+HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
+HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
+HIGH_SAD32XN 8 ; highbd_sad32x8_sse2
+HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
+HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
+HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
+HIGH_SAD32XN 8, 1 ; highbd_sad32x8_avg_sse2
+HIGH_SAD32XN 64, 2 ; highbd_sad_skip_32x64_sse2
+HIGH_SAD32XN 32, 2 ; highbd_sad_skip_32x32_sse2
+HIGH_SAD32XN 16, 2 ; highbd_sad_skip_32x16_sse2
+HIGH_SAD32XN 8, 2 ; highbd_sad_skip_32x8_sse2
+
+; unsigned int aom_highbd_sad16x{4,8,16,32,64}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD16XN 1-2 0
+ HIGH_SAD_FN 16, %1, 5, %2
+%if %2 == 2 ; skip rows, so divide number of rows by 2
+ mov n_rowsd, %1/4
+%else
+ mov n_rowsd, %1/2
+%endif
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+ref_strideq*2]
+ movu m4, [refq+ref_strideq*2+16]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+16]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*2+16]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m5, [srcq]
+ psubusw m5, m1
+ psubusw m1, [srcq]
+ por m1, m5
+ mova m5, [srcq+16]
+ psubusw m5, m2
+ psubusw m2, [srcq+16]
+ por m2, m5
+ mova m5, [srcq+src_strideq*2]
+ psubusw m5, m3
+ psubusw m3, [srcq+src_strideq*2]
+ por m3, m5
+ mova m5, [srcq+src_strideq*2+16]
+ psubusw m5, m4
+ psubusw m4, [srcq+src_strideq*2+16]
+ por m4, m5
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*4]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD16XN 64 ; highbd_sad16x64_sse2
+HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
+HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
+HIGH_SAD16XN 8 ; highbd_sad16x8_sse2
+HIGH_SAD16XN 4 ; highbd_sad16x4_sse2
+HIGH_SAD16XN 64, 1 ; highbd_sad16x64_avg_sse2
+HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
+HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
+HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2
+HIGH_SAD16XN 4, 1 ; highbd_sad16x4_avg_sse2
+HIGH_SAD16XN 64, 2 ; highbd_sad_skip_16x64_sse2
+HIGH_SAD16XN 32, 2 ; highbd_sad_skip_16x32_sse2
+HIGH_SAD16XN 16, 2 ; highbd_sad_skip_16x16_sse2
+HIGH_SAD16XN 8, 2 ; highbd_sad_skip_16x8_sse2
+; Current code fails when there are only 2 rows
+; HIGH_SAD16XN 4, 2 ; highbd_sad_skip_16x4_sse2
+
+; unsigned int aom_highbd_sad8x{4,8,16,32}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD8XN 1-2 0
+ HIGH_SAD_FN 8, %1, 7, %2, 8
+%if %2 == 2 ; skip rows, so divide number of rows by 2
+ mov n_rowsd, %1/8
+%else
+ mov n_rowsd, %1/4
+%endif
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+ref_strideq*2]
+ movu m3, [refq+ref_strideq*4]
+ movu m4, [refq+ref_stride3q*2]
+%if %2 == 1
+ pavgw m1, [second_predq+mmsize*0]
+ pavgw m2, [second_predq+mmsize*1]
+ pavgw m3, [second_predq+mmsize*2]
+ pavgw m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ mova m7, m1
+ movu m5, [srcq]
+ psubusw m1, m5
+ psubusw m5, m7
+ por m1, m5
+
+ mova m7, m2
+ movu m5, [srcq+src_strideq*2]
+ psubusw m2, m5
+ psubusw m5, m7
+ por m2, m5
+
+ mova m7, m3
+ movu m5, [srcq+src_strideq*4]
+ psubusw m3, m5
+ psubusw m5, m7
+ por m3, m5
+
+ mova m7, m4
+ movu m5, [srcq+src_stride3q*2]
+ psubusw m4, m5
+ psubusw m5, m7
+ por m4, m5
+
+ paddw m1, m2
+ paddw m3, m4
+ movhlps m2, m1
+ movhlps m4, m3
+ paddw m1, m2
+ paddw m3, m4
+ punpcklwd m1, m6
+ punpcklwd m3, m6
+ lea refq, [refq+ref_strideq*8]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*8]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD8XN 32 ; highbd_sad8x32_sse2
+HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
+HIGH_SAD8XN 8 ; highbd_sad8x8_sse2
+HIGH_SAD8XN 4 ; highbd_sad8x4_sse2
+HIGH_SAD8XN 32, 1 ; highbd_sad8x32_avg_sse2
+HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
+HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2
+HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2
+HIGH_SAD8XN 32, 2 ; highbd_sad_skip_8x32_sse2
+HIGH_SAD8XN 16, 2 ; highbd_sad_skip_8x16_sse2
+HIGH_SAD8XN 8, 2 ; highbd_sad_skip_8x8_sse2
+; Current code fails when there are only 2 rows
+; HIGH_SAD8XN 4, 2 ; highbd_sad_skip_8x4_sse2
+
+; unsigned int aom_highbd_sad4x{4,8,16}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro HIGH_SAD4XN 1-2 0
+ HIGH_SAD_FN 4, %1, 7, %2
+%if %2 == 2 ; skip rows, so divide number of rows by 2
+ mov n_rowsd, %1/8
+%else
+ mov n_rowsd, %1/4
+%endif
+ pxor m0, m0
+ pxor m6, m6
+
+.loop:
+ movq m1, [refq]
+ movq m2, [refq+ref_strideq*2]
+ movq m3, [refq+ref_strideq*4]
+ movq m4, [refq+ref_stride3q*2]
+ punpcklwd m1, m3
+ punpcklwd m2, m4
+%if %2 == 1
+ movq m3, [second_predq+8*0]
+ movq m5, [second_predq+8*2]
+ punpcklwd m3, m5
+ movq m4, [second_predq+8*1]
+ movq m5, [second_predq+8*3]
+ punpcklwd m4, m5
+ lea second_predq, [second_predq+8*4]
+ pavgw m1, m3
+ pavgw m2, m4
+%endif
+ movq m5, [srcq]
+ movq m3, [srcq+src_strideq*4]
+ punpcklwd m5, m3
+ movdqa m3, m1
+ psubusw m1, m5
+ psubusw m5, m3
+ por m1, m5
+ movq m5, [srcq+src_strideq*2]
+ movq m4, [srcq+src_stride3q*2]
+ punpcklwd m5, m4
+ movdqa m4, m2
+ psubusw m2, m5
+ psubusw m5, m4
+ por m2, m5
+ paddw m1, m2
+ movdqa m2, m1
+ punpcklwd m1, m6
+ punpckhwd m2, m6
+ lea refq, [refq+ref_strideq*8]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*8]
+ paddd m0, m2
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+ punpckldq m0, m6
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+HIGH_SAD4XN 16 ; highbd_sad4x16_sse2
+HIGH_SAD4XN 8 ; highbd_sad4x8_sse2
+HIGH_SAD4XN 4 ; highbd_sad4x4_sse2
+HIGH_SAD4XN 16, 1 ; highbd_sad4x16_avg_sse2
+HIGH_SAD4XN 8, 1 ; highbd_sad4x8_avg_sse2
+HIGH_SAD4XN 4, 1 ; highbd_sad4x4_avg_sse2
+HIGH_SAD4XN 16, 2 ; highbd_sad_skip_4x16_sse2
+HIGH_SAD4XN 8, 2 ; highbd_sad_skip_4x8_sse2
+; Current code fails when there are only 2 rows
+; HIGH_SAD4XN 4, 2 ; highbd_sad_skip_4x4_sse2
diff --git a/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm
new file mode 100644
index 0000000000..c0ccc182b4
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_subpel_variance_impl_sse2.asm
@@ -0,0 +1,1024 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_8: times 8 dw 8
+bilin_filter_m_sse2: times 8 dw 16
+ times 8 dw 0
+ times 8 dw 14
+ times 8 dw 2
+ times 8 dw 12
+ times 8 dw 4
+ times 8 dw 10
+ times 8 dw 6
+ times 16 dw 8
+ times 8 dw 6
+ times 8 dw 10
+ times 8 dw 4
+ times 8 dw 12
+ times 8 dw 2
+ times 8 dw 14
+
+SECTION .text
+
+; int aom_highbd_sub_pixel_varianceNxh(const uint8_t *src,
+;                                      ptrdiff_t src_stride,
+;                                      int x_offset, int y_offset,
+;                                      const uint8_t *dst,
+;                                      ptrdiff_t dst_stride,
+;                                      int height, unsigned int *sse);
+;
+; This function returns the sum of errors (SE) and stores the SSE in the
+; given pointer.
+
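Each 32-byte entry of bilin_filter_m_sse2 above holds a pair of taps (f0, f1)
with f0 + f1 == 16; the kernels apply them with pmullw/paddw, add the pw_8
rounding constant, and shift right by 4, while the half-pel fast paths below
use pavgw instead. A scalar sketch of that step (function name illustrative):

  #include <stdint.h>

  /* One 2-tap bilinear step as implemented below: (f0*a + f1*b + 8) >> 4,
   * with the tap pair (f0, f1) taken from bilin_filter_m_sse2. */
  static uint16_t highbd_bilin2(uint16_t a, uint16_t b, int f0, int f1) {
    return (uint16_t)((f0 * a + f1 * b + 8) >> 4);
  }
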
+%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
+ psubw %3, %4
+ psubw %1, %2
+ mova %4, %3 ; make copies to manipulate to calc sum
+ mova %2, %1 ; use originals for calc sse
+ pmaddwd %3, %3
+ paddw %4, %2
+ pmaddwd %1, %1
+ movhlps %2, %4
+ paddd %6, %3
+ paddw %4, %2
+ pxor %2, %2
+ pcmpgtw %2, %4 ; mask for 0 > %4 (sum)
+ punpcklwd %4, %2 ; sign-extend word to dword
+ paddd %6, %1
+ paddd %5, %4
+
+%endmacro
+
+%macro STORE_AND_RET 0
+%if mmsize == 16
+ ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
+ ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
+ ; We have to sign-extend it before adding the words within the register
+  ; and outputting to a dword.
+ movhlps m3, m7
+ movhlps m4, m6
+ paddd m7, m3
+ paddd m6, m4
+ pshufd m3, m7, 0x1
+ pshufd m4, m6, 0x1
+ paddd m7, m3
+ paddd m6, m4
+ mov r1, ssem ; r1 = unsigned int *sse
+ movd [r1], m7 ; store sse
+ movd eax, m6 ; store sum as return value
+%endif
+ RET
+%endmacro
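
Taken together, SUM_SSE and STORE_AND_RET accumulate, per block, the signed sum
of differences (returned as the SE) and the sum of squared differences (written
to *sse). A scalar model of that accumulation (names illustrative, a sketch of
the same bookkeeping rather than the library API):

  #include <stdint.h>

  static int highbd_sum_sse_model(const uint16_t *src, int src_stride,
                                  const uint16_t *dst, int dst_stride,
                                  int w, int h, uint32_t *sse) {
    int sum = 0;
    uint32_t s = 0;
    for (int r = 0; r < h; ++r) {
      for (int c = 0; c < w; ++c) {
        const int d = src[r * src_stride + c] - dst[r * dst_stride + c];
        sum += d;                /* accumulated in m6, sign-extended at the end */
        s += (uint32_t)(d * d);  /* accumulated in m7 via pmaddwd/paddd */
      }
    }
    *sse = s;
    return sum;
  }
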
+
+%macro INC_SRC_BY_SRC_STRIDE 0
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+ add srcq, src_stridemp
+ add srcq, src_stridemp
+%else
+ lea srcq, [srcq + src_strideq*2]
+%endif
+%endmacro
+
+%macro SUBPEL_VARIANCE 1-2 0 ; W
+%define bilin_filter_m bilin_filter_m_sse2
+%define filter_idx_shift 5
+
+
+%if AOM_ARCH_X86_64
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, height, sse
+ %define sec_str sec_strideq
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, height, sse
+ %endif
+ %define block_height heightd
+ %define bilin_filter sseq
+%else
+ %if CONFIG_PIC=1
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, height, sse
+ %define block_height dword heightm
+ %define sec_str sec_stridemp
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, height, sse
+ %define block_height heightd
+ %endif
+
+ ; reuse argument stack space
+ %define g_bilin_filterm x_offsetm
+ %define g_pw_8m y_offsetm
+
+ ; Store bilin_filter and pw_8 location in stack
+ %if GET_GOT_DEFINED == 1
+ GET_GOT eax
+ add esp, 4 ; restore esp
+ %endif
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %else
+ %if %2 == 1 ; avg
+ cglobal highbd_sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, height, sse
+ %define block_height dword heightm
+ %define sec_str sec_stridemp
+ %else
+ cglobal highbd_sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, height, sse
+ %define block_height heightd
+ %endif
+
+ %define bilin_filter bilin_filter_m
+ %endif
+%endif
+
+ ASSERT %1 <= 16 ; m6 overflows if w > 16
+ pxor m6, m6 ; sum
+ pxor m7, m7 ; sse
+
+%if %1 < 16
+ sar block_height, 1
+%endif
+%if %2 == 1 ; avg
+ shl sec_str, 1
+%endif
+
+ ; FIXME(rbultje) replace by jumptable?
+ test x_offsetd, x_offsetd
+ jnz .x_nonzero
+ ; x_offset == 0
+ test y_offsetd, y_offsetd
+ jnz .x_zero_y_nonzero
+
+ ; x_offset == 0 && y_offset == 0
+.x_zero_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m2, [srcq + 16]
+ mova m1, [dstq]
+ mova m3, [dstq + 16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m2, [secq+16]
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq + src_strideq*2]
+ mova m1, [dstq]
+ mova m3, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m2, [secq]
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_zero_y_zero_loop
+ STORE_AND_RET
+
+.x_zero_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_zero_y_nonhalf
+
+ ; x_offset == 0 && y_offset == 0.5
+.x_zero_y_half_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m4, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*2+16]
+ mova m2, [dstq]
+ mova m3, [dstq+16]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*4]
+ mova m2, [dstq]
+ mova m3, [dstq+dst_strideq*2]
+ pavgw m0, m1
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_zero_y_half_loop
+ STORE_AND_RET
+
+.x_zero_y_nonhalf:
+ ; x_offset == 0 && y_offset == bilin interpolation
+%if AOM_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if AOM_ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+y_offsetq]
+ mova m9, [bilin_filter+y_offsetq+16]
+ mova m10, [GLOBAL(pw_8)]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32 or mmx
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+.x_zero_y_other_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq + 16]
+ movu m4, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*2+16]
+ mova m2, [dstq]
+ mova m3, [dstq+16]
+ ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
+ ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
+ ; instructions is the same (5), but it is 1 mul instead of 2, so might be
+ ; slightly faster because of pmullw latency. It would also cut our rodata
+ ; tables in half for this function, and save 1-2 registers on x86-64.
+ pmullw m1, filter_y_a
+ pmullw m5, filter_y_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m1, m5
+ paddw m0, m4
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m5, [srcq+src_strideq*4]
+ mova m4, m1
+ mova m2, [dstq]
+ mova m3, [dstq+dst_strideq*2]
+ pmullw m1, filter_y_a
+ pmullw m5, filter_y_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m1, m5
+ paddw m0, m4
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_zero_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonzero:
+ cmp x_offsetd, 8
+ jne .x_nonhalf
+ ; x_offset == 0.5
+ test y_offsetd, y_offsetd
+ jnz .x_half_y_nonzero
+
+ ; x_offset == 0.5 && y_offset == 0
+.x_half_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq + 16]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + 18]
+ mova m2, [dstq]
+ mova m3, [dstq + 16]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq + src_strideq*2]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + src_strideq*2 + 2]
+ mova m2, [dstq]
+ mova m3, [dstq + dst_strideq*2]
+ pavgw m0, m4
+ pavgw m1, m5
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_half_y_zero_loop
+ STORE_AND_RET
+
+.x_half_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_half_y_nonhalf
+
+ ; x_offset == 0.5 && y_offset == 0.5
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+ pavgw m1, m3
+.x_half_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq + 16]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + 18]
+ pavgw m2, m4
+ pavgw m3, m5
+ pavgw m0, m2
+ pavgw m1, m3
+ mova m4, [dstq]
+ mova m5, [dstq + 16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+ mova m0, m2
+ mova m1, m3
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+.x_half_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq + src_strideq*2]
+ movu m4, [srcq + 2]
+ movu m5, [srcq + src_strideq*2 + 2]
+ pavgw m2, m4
+ pavgw m3, m5
+ pavgw m0, m2
+ pavgw m2, m3
+ mova m4, [dstq]
+ mova m5, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m2, [secq]
+%endif
+ SUM_SSE m0, m4, m2, m5, m6, m7
+ mova m0, m3
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_half_y_half_loop
+ STORE_AND_RET
+
+.x_half_y_nonhalf:
+ ; x_offset == 0.5 && y_offset == bilin interpolation
+%if AOM_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if AOM_ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+y_offsetq]
+ mova m9, [bilin_filter+y_offsetq+16]
+ mova m10, [GLOBAL(pw_8)]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+ pavgw m1, m3
+.x_half_y_other_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+16]
+ movu m4, [srcq+2]
+ movu m5, [srcq+18]
+ pavgw m2, m4
+ pavgw m3, m5
+ mova m4, m2
+ mova m5, m3
+ pmullw m1, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m1, filter_rnd
+ paddw m1, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ psrlw m1, 4
+ paddw m0, m2
+ mova m2, [dstq]
+ psrlw m0, 4
+ mova m3, [dstq+16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+ mova m0, m4
+ mova m1, m5
+
+ lea srcq, [srcq + src_strideq*2]
+ lea dstq, [dstq + dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ lea srcq, [srcq + src_strideq*2]
+ pavgw m0, m2
+.x_half_y_other_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+src_strideq*2]
+ movu m4, [srcq+2]
+ movu m5, [srcq+src_strideq*2+2]
+ pavgw m2, m4
+ pavgw m3, m5
+ mova m4, m2
+ mova m5, m3
+ pmullw m4, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m4, filter_rnd
+ paddw m4, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ psrlw m4, 4
+ paddw m0, m2
+ mova m2, [dstq]
+ psrlw m0, 4
+ mova m3, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m4, [secq]
+%endif
+ SUM_SSE m0, m2, m4, m3, m6, m7
+ mova m0, m5
+
+ lea srcq, [srcq + src_strideq*4]
+ lea dstq, [dstq + dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_half_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf:
+ test y_offsetd, y_offsetd
+ jnz .x_nonhalf_y_nonzero
+
+ ; x_offset == bilin interpolation && y_offset == 0
+%if AOM_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if AOM_ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+.x_other_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ mova m4, [dstq]
+ mova m5, [dstq+16]
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m1, m3
+ paddw m0, m2
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq*2]
+ movu m2, [srcq+2]
+ movu m3, [srcq+src_strideq*2+2]
+ mova m4, [dstq]
+ mova m5, [dstq+dst_strideq*2]
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m1, m3
+ paddw m0, m2
+ psrlw m1, 4
+ psrlw m0, 4
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m1, [secq]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+
+ lea srcq, [srcq+src_strideq*4]
+ lea dstq, [dstq+dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_other_y_zero_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_nonhalf_y_nonhalf
+
+ ; x_offset == bilin interpolation && y_offset == 0.5
+%if AOM_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if AOM_ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+2]
+ movu m3, [srcq+18]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+ lea srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+16]
+ movu m4, [srcq+2]
+ movu m5, [srcq+18]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ mova m4, [dstq]
+ mova m5, [dstq+16]
+ psrlw m2, 4
+ psrlw m3, 4
+ pavgw m0, m2
+ pavgw m1, m3
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m4, m1, m5, m6, m7
+ mova m0, m2
+ mova m1, m3
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m2
+ psrlw m0, 4
+ lea srcq, [srcq+src_strideq*2]
+.x_other_y_half_loop:
+ movu m2, [srcq]
+ movu m3, [srcq+src_strideq*2]
+ movu m4, [srcq+2]
+ movu m5, [srcq+src_strideq*2+2]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ mova m4, [dstq]
+ mova m5, [dstq+dst_strideq*2]
+ psrlw m2, 4
+ psrlw m3, 4
+ pavgw m0, m2
+ pavgw m2, m3
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m2, [secq]
+%endif
+ SUM_SSE m0, m4, m2, m5, m6, m7
+ mova m0, m3
+
+ lea srcq, [srcq+src_strideq*4]
+ lea dstq, [dstq+dst_strideq*4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_other_y_half_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf_y_nonhalf:
+; load the filters - this is the same as in the 8-bit depth case
+%if AOM_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift ; filter_idx_shift = 5
+ shl y_offsetd, filter_idx_shift
+%if AOM_ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+ mova m9, [bilin_filter+x_offsetq+16]
+ mova m10, [bilin_filter+y_offsetq]
+ mova m11, [bilin_filter+y_offsetq+16]
+ mova m12, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_y_a m10
+%define filter_y_b m11
+%define filter_rnd m12
+%else ; x86-32
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+; In this case, there is no unused register, so reuse the src_stride register.
+; Later, src_stride has to be reloaded from the stack when it is needed.
+%define tempq src_strideq
+ mov tempq, g_bilin_filterm
+ add x_offsetq, tempq
+ add y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+ add y_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+; end of load filter
+
+ ; x_offset == bilin interpolation && y_offset == bilin interpolation
+%if %1 == 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ movu m1, [srcq+16]
+ movu m3, [srcq+18]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m1, filter_rnd
+ paddw m0, m2
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movu m2, [srcq]
+ movu m4, [srcq+2]
+ movu m3, [srcq+16]
+ movu m5, [srcq+18]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ psrlw m2, 4
+ psrlw m3, 4
+ mova m4, m2
+ mova m5, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m1, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, m2
+ paddw m1, filter_rnd
+ mova m2, [dstq]
+ paddw m1, m3
+ psrlw m0, 4
+ psrlw m1, 4
+ mova m3, [dstq+16]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ pavgw m1, [secq+16]
+%endif
+ SUM_SSE m0, m2, m1, m3, m6, m7
+ mova m0, m4
+ mova m1, m5
+
+ INC_SRC_BY_SRC_STRIDE
+ lea dstq, [dstq + dst_strideq * 2]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%else ; %1 < 16
+ movu m0, [srcq]
+ movu m2, [srcq+2]
+ pmullw m0, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m2
+ psrlw m0, 4
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movu m2, [srcq]
+ movu m4, [srcq+2]
+ INC_SRC_BY_SRC_STRIDE
+ movu m3, [srcq]
+ movu m5, [srcq+2]
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m3, filter_x_a
+ pmullw m5, filter_x_b
+ paddw m3, filter_rnd
+ paddw m2, m4
+ paddw m3, m5
+ psrlw m2, 4
+ psrlw m3, 4
+ mova m4, m2
+ mova m5, m3
+ pmullw m0, filter_y_a
+ pmullw m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m4, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, m2
+ paddw m4, filter_rnd
+ mova m2, [dstq]
+ paddw m4, m3
+ psrlw m0, 4
+ psrlw m4, 4
+ mova m3, [dstq+dst_strideq*2]
+%if %2 == 1 ; avg
+ pavgw m0, [secq]
+ add secq, sec_str
+ pavgw m4, [secq]
+%endif
+ SUM_SSE m0, m2, m4, m3, m6, m7
+ mova m0, m5
+
+ INC_SRC_BY_SRC_STRIDE
+ lea dstq, [dstq + dst_strideq * 4]
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+%endif
+ dec block_height
+ jg .x_other_y_other_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+%endmacro
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 8
+SUBPEL_VARIANCE 16
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
diff --git a/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c
new file mode 100644
index 0000000000..3c3253bdf9
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_subtract_sse2.c
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <stddef.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+typedef void (*SubtractWxHFuncType)(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred,
+ ptrdiff_t pred_stride);
+
+static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3;
+ __m128i x0, x1, x2, x3;
+ int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride);
+
+ u0 = _mm_loadl_epi64((__m128i const *)(src + 0 * src_stride));
+ u1 = _mm_loadl_epi64((__m128i const *)(src + 1 * src_stride));
+ u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride));
+ u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride));
+
+ v0 = _mm_loadl_epi64((__m128i const *)(pred + 0 * pred_stride));
+ v1 = _mm_loadl_epi64((__m128i const *)(pred + 1 * pred_stride));
+ v2 = _mm_loadl_epi64((__m128i const *)(pred + 2 * pred_stride));
+ v3 = _mm_loadl_epi64((__m128i const *)(pred + 3 * pred_stride));
+
+ x0 = _mm_sub_epi16(u0, v0);
+ x1 = _mm_sub_epi16(u1, v1);
+ x2 = _mm_sub_epi16(u2, v2);
+ x3 = _mm_sub_epi16(u3, v3);
+
+ _mm_storel_epi64((__m128i *)store_diff, x0);
+ store_diff = (int64_t *)(diff + 1 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x1);
+ store_diff = (int64_t *)(diff + 2 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x2);
+ store_diff = (int64_t *)(diff + 3 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x3);
+}
+
+static void subtract_4x8(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ int64_t *store_diff = (int64_t *)(diff + 0 * diff_stride);
+
+ u0 = _mm_loadl_epi64((__m128i const *)(src + 0 * src_stride));
+ u1 = _mm_loadl_epi64((__m128i const *)(src + 1 * src_stride));
+ u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride));
+ u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride));
+ u4 = _mm_loadl_epi64((__m128i const *)(src + 4 * src_stride));
+ u5 = _mm_loadl_epi64((__m128i const *)(src + 5 * src_stride));
+ u6 = _mm_loadl_epi64((__m128i const *)(src + 6 * src_stride));
+ u7 = _mm_loadl_epi64((__m128i const *)(src + 7 * src_stride));
+
+ v0 = _mm_loadl_epi64((__m128i const *)(pred + 0 * pred_stride));
+ v1 = _mm_loadl_epi64((__m128i const *)(pred + 1 * pred_stride));
+ v2 = _mm_loadl_epi64((__m128i const *)(pred + 2 * pred_stride));
+ v3 = _mm_loadl_epi64((__m128i const *)(pred + 3 * pred_stride));
+ v4 = _mm_loadl_epi64((__m128i const *)(pred + 4 * pred_stride));
+ v5 = _mm_loadl_epi64((__m128i const *)(pred + 5 * pred_stride));
+ v6 = _mm_loadl_epi64((__m128i const *)(pred + 6 * pred_stride));
+ v7 = _mm_loadl_epi64((__m128i const *)(pred + 7 * pred_stride));
+
+ x0 = _mm_sub_epi16(u0, v0);
+ x1 = _mm_sub_epi16(u1, v1);
+ x2 = _mm_sub_epi16(u2, v2);
+ x3 = _mm_sub_epi16(u3, v3);
+ x4 = _mm_sub_epi16(u4, v4);
+ x5 = _mm_sub_epi16(u5, v5);
+ x6 = _mm_sub_epi16(u6, v6);
+ x7 = _mm_sub_epi16(u7, v7);
+
+ _mm_storel_epi64((__m128i *)store_diff, x0);
+ store_diff = (int64_t *)(diff + 1 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x1);
+ store_diff = (int64_t *)(diff + 2 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x2);
+ store_diff = (int64_t *)(diff + 3 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x3);
+ store_diff = (int64_t *)(diff + 4 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x4);
+ store_diff = (int64_t *)(diff + 5 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x5);
+ store_diff = (int64_t *)(diff + 6 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x6);
+ store_diff = (int64_t *)(diff + 7 * diff_stride);
+ _mm_storel_epi64((__m128i *)store_diff, x7);
+}
+
+static void subtract_8x4(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ __m128i u0, u1, u2, u3;
+ __m128i v0, v1, v2, v3;
+ __m128i x0, x1, x2, x3;
+
+ u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
+ u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
+ u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
+ u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+
+ v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
+ v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
+ v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
+ v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
+
+ x0 = _mm_sub_epi16(u0, v0);
+ x1 = _mm_sub_epi16(u1, v1);
+ x2 = _mm_sub_epi16(u2, v2);
+ x3 = _mm_sub_epi16(u3, v3);
+
+ _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0);
+ _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1);
+ _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2);
+ _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3);
+}
+
+static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride,
+ const uint16_t *src, ptrdiff_t src_stride,
+ const uint16_t *pred, ptrdiff_t pred_stride) {
+ __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+ __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+
+ u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
+ u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
+ u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
+ u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+ u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
+ u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
+ u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
+ u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
+
+ v0 = _mm_loadu_si128((__m128i const *)(pred + 0 * pred_stride));
+ v1 = _mm_loadu_si128((__m128i const *)(pred + 1 * pred_stride));
+ v2 = _mm_loadu_si128((__m128i const *)(pred + 2 * pred_stride));
+ v3 = _mm_loadu_si128((__m128i const *)(pred + 3 * pred_stride));
+ v4 = _mm_loadu_si128((__m128i const *)(pred + 4 * pred_stride));
+ v5 = _mm_loadu_si128((__m128i const *)(pred + 5 * pred_stride));
+ v6 = _mm_loadu_si128((__m128i const *)(pred + 6 * pred_stride));
+ v7 = _mm_loadu_si128((__m128i const *)(pred + 7 * pred_stride));
+
+ x0 = _mm_sub_epi16(u0, v0);
+ x1 = _mm_sub_epi16(u1, v1);
+ x2 = _mm_sub_epi16(u2, v2);
+ x3 = _mm_sub_epi16(u3, v3);
+ x4 = _mm_sub_epi16(u4, v4);
+ x5 = _mm_sub_epi16(u5, v5);
+ x6 = _mm_sub_epi16(u6, v6);
+ x7 = _mm_sub_epi16(u7, v7);
+
+ _mm_storeu_si128((__m128i *)(diff + 0 * diff_stride), x0);
+ _mm_storeu_si128((__m128i *)(diff + 1 * diff_stride), x1);
+ _mm_storeu_si128((__m128i *)(diff + 2 * diff_stride), x2);
+ _mm_storeu_si128((__m128i *)(diff + 3 * diff_stride), x3);
+ _mm_storeu_si128((__m128i *)(diff + 4 * diff_stride), x4);
+ _mm_storeu_si128((__m128i *)(diff + 5 * diff_stride), x5);
+ _mm_storeu_si128((__m128i *)(diff + 6 * diff_stride), x6);
+ _mm_storeu_si128((__m128i *)(diff + 7 * diff_stride), x7);
+}
+
+#define STACK_V(h, fun) \
+ do { \
+ fun(diff, diff_stride, src, src_stride, pred, pred_stride); \
+ fun(diff + diff_stride * h, diff_stride, src + src_stride * h, src_stride, \
+ pred + pred_stride * h, pred_stride); \
+ } while (0)
+
+#define STACK_H(w, fun) \
+ do { \
+ fun(diff, diff_stride, src, src_stride, pred, pred_stride); \
+ fun(diff + w, diff_stride, src + w, src_stride, pred + w, pred_stride); \
+ } while (0)
+
+#define SUBTRACT_FUN(size) \
+ static void subtract_##size(int16_t *diff, ptrdiff_t diff_stride, \
+ const uint16_t *src, ptrdiff_t src_stride, \
+ const uint16_t *pred, ptrdiff_t pred_stride)
+
+SUBTRACT_FUN(8x16) { STACK_V(8, subtract_8x8); }
+SUBTRACT_FUN(16x8) { STACK_H(8, subtract_8x8); }
+SUBTRACT_FUN(16x16) { STACK_V(8, subtract_16x8); }
+SUBTRACT_FUN(16x32) { STACK_V(16, subtract_16x16); }
+SUBTRACT_FUN(32x16) { STACK_H(16, subtract_16x16); }
+SUBTRACT_FUN(32x32) { STACK_V(16, subtract_32x16); }
+SUBTRACT_FUN(32x64) { STACK_V(32, subtract_32x32); }
+SUBTRACT_FUN(64x32) { STACK_H(32, subtract_32x32); }
+SUBTRACT_FUN(64x64) { STACK_V(32, subtract_64x32); }
+SUBTRACT_FUN(64x128) { STACK_V(64, subtract_64x64); }
+SUBTRACT_FUN(128x64) { STACK_H(64, subtract_64x64); }
+SUBTRACT_FUN(128x128) { STACK_V(64, subtract_128x64); }
+SUBTRACT_FUN(4x16) { STACK_V(8, subtract_4x8); }
+SUBTRACT_FUN(16x4) { STACK_H(8, subtract_8x4); }
+SUBTRACT_FUN(8x32) { STACK_V(16, subtract_8x16); }
+SUBTRACT_FUN(32x8) { STACK_H(16, subtract_16x8); }
+SUBTRACT_FUN(16x64) { STACK_V(32, subtract_16x32); }
+SUBTRACT_FUN(64x16) { STACK_H(32, subtract_32x16); }
+
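Each SUBTRACT_FUN above builds a larger block out of the hand-written kernels
by stacking two halves vertically (STACK_V, row offset) or side by side
(STACK_H, column offset). For illustration, SUBTRACT_FUN(32x16) expands roughly
to:

  /* Rough expansion of SUBTRACT_FUN(32x16) { STACK_H(16, subtract_16x16); } */
  static void subtract_32x16(int16_t *diff, ptrdiff_t diff_stride,
                             const uint16_t *src, ptrdiff_t src_stride,
                             const uint16_t *pred, ptrdiff_t pred_stride) {
    /* left 16x16 half */
    subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
    /* right 16x16 half, offset by 16 columns */
    subtract_16x16(diff + 16, diff_stride, src + 16, src_stride, pred + 16,
                   pred_stride);
  }

and subtract_16x16 is in turn STACK_V(8, subtract_16x8), i.e. two 16x8 kernels
at row offsets 0 and 8.
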
+static SubtractWxHFuncType getSubtractFunc(int rows, int cols) {
+ if (rows == 4) {
+ if (cols == 4) return subtract_4x4;
+ if (cols == 8) return subtract_8x4;
+ if (cols == 16) return subtract_16x4;
+ }
+ if (rows == 8) {
+ if (cols == 4) return subtract_4x8;
+ if (cols == 8) return subtract_8x8;
+ if (cols == 16) return subtract_16x8;
+ if (cols == 32) return subtract_32x8;
+ }
+ if (rows == 16) {
+ if (cols == 4) return subtract_4x16;
+ if (cols == 8) return subtract_8x16;
+ if (cols == 16) return subtract_16x16;
+ if (cols == 32) return subtract_32x16;
+ if (cols == 64) return subtract_64x16;
+ }
+ if (rows == 32) {
+ if (cols == 8) return subtract_8x32;
+ if (cols == 16) return subtract_16x32;
+ if (cols == 32) return subtract_32x32;
+ if (cols == 64) return subtract_64x32;
+ }
+ if (rows == 64) {
+ if (cols == 16) return subtract_16x64;
+ if (cols == 32) return subtract_32x64;
+ if (cols == 64) return subtract_64x64;
+ if (cols == 128) return subtract_128x64;
+ }
+ if (rows == 128) {
+ if (cols == 64) return subtract_64x128;
+ if (cols == 128) return subtract_128x128;
+ }
+ assert(0);
+ return NULL;
+}
+
+void aom_highbd_subtract_block_sse2(int rows, int cols, int16_t *diff,
+ ptrdiff_t diff_stride, const uint8_t *src8,
+ ptrdiff_t src_stride, const uint8_t *pred8,
+ ptrdiff_t pred_stride) {
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ SubtractWxHFuncType func;
+
+ func = getSubtractFunc(rows, cols);
+ func(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c
new file mode 100644
index 0000000000..b4ff91d856
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_avx2.c
@@ -0,0 +1,904 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h> // AVX2
+
+#include "config/aom_dsp_rtcd.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/synonyms.h"
+
+typedef void (*high_variance_fn_t)(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+static uint32_t aom_highbd_var_filter_block2d_bil_avx2(
+ const uint8_t *src_ptr8, unsigned int src_pixels_per_line, int pixel_step,
+ unsigned int output_height, unsigned int output_width,
+ const uint32_t xoffset, const uint32_t yoffset, const uint8_t *dst_ptr8,
+ int dst_stride, uint32_t *sse) {
+ const __m256i filter1 =
+ _mm256_set1_epi32((int)(bilinear_filters_2t[xoffset][1] << 16) |
+ bilinear_filters_2t[xoffset][0]);
+ const __m256i filter2 =
+ _mm256_set1_epi32((int)(bilinear_filters_2t[yoffset][1] << 16) |
+ bilinear_filters_2t[yoffset][0]);
+ const __m256i one = _mm256_set1_epi16(1);
+ const int bitshift = 0x40;
+ (void)pixel_step;
+ unsigned int i, j, prev = 0, curr = 2;
+ uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
+ uint16_t *dst_ptr = CONVERT_TO_SHORTPTR(dst_ptr8);
+ uint16_t *src_ptr_ref = src_ptr;
+ uint16_t *dst_ptr_ref = dst_ptr;
+ int64_t sum_long = 0;
+ uint64_t sse_long = 0;
+ unsigned int rshift = 0, inc = 1;
+ __m256i rbias = _mm256_set1_epi32(bitshift);
+ __m256i opointer[8];
+ unsigned int range;
+ if (xoffset == 0) {
+ if (yoffset == 0) { // xoffset==0 && yoffset==0
+ range = output_width / 16;
+ if (output_height == 8) inc = 2;
+ if (output_height == 4) inc = 4;
+ for (j = 0; j < range * output_height * inc / 16; j++) {
+ if (j % (output_height * inc / 16) == 0) {
+ src_ptr = src_ptr_ref;
+ src_ptr_ref += 16;
+ dst_ptr = dst_ptr_ref;
+ dst_ptr_ref += 16;
+ }
+ __m256i sum1 = _mm256_setzero_si256();
+ __m256i sse1 = _mm256_setzero_si256();
+ for (i = 0; i < 16 / inc; ++i) {
+ __m256i V_S_SRC = _mm256_loadu_si256((const __m256i *)src_ptr);
+ src_ptr += src_pixels_per_line;
+ __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+ dst_ptr += dst_stride;
+
+ __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+ __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+ sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+ sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+ }
+
+ __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ sum_long += _mm_extract_epi32(v_d, 0);
+ sse_long += _mm_extract_epi32(v_d, 1);
+ }
+
+ rshift = get_msb(output_height) + get_msb(output_width);
+
+ } else if (yoffset == 4) { // xoffset==0 && yoffset==4
+ range = output_width / 16;
+ if (output_height == 8) inc = 2;
+ if (output_height == 4) inc = 4;
+ for (j = 0; j < range * output_height * inc / 16; j++) {
+ if (j % (output_height * inc / 16) == 0) {
+ src_ptr = src_ptr_ref;
+ src_ptr_ref += 16;
+ dst_ptr = dst_ptr_ref;
+ dst_ptr_ref += 16;
+
+ opointer[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
+ src_ptr += src_pixels_per_line;
+ curr = 0;
+ }
+
+ __m256i sum1 = _mm256_setzero_si256();
+ __m256i sse1 = _mm256_setzero_si256();
+
+ for (i = 0; i < 16 / inc; ++i) {
+ prev = curr;
+ curr = (curr == 0) ? 1 : 0;
+ opointer[curr] = _mm256_loadu_si256((const __m256i *)src_ptr);
+ src_ptr += src_pixels_per_line;
+
+ __m256i V_S_SRC = _mm256_avg_epu16(opointer[curr], opointer[prev]);
+
+ __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+ dst_ptr += dst_stride;
+ __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+ __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+ sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+ sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+ }
+
+ __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ sum_long += _mm_extract_epi32(v_d, 0);
+ sse_long += _mm_extract_epi32(v_d, 1);
+ }
+
+ rshift = get_msb(output_height) + get_msb(output_width);
+
+ } else { // xoffset==0 && yoffset==1,2,3,5,6,7
+ range = output_width / 16;
+ if (output_height == 8) inc = 2;
+ if (output_height == 4) inc = 4;
+ for (j = 0; j < range * output_height * inc / 16; j++) {
+ if (j % (output_height * inc / 16) == 0) {
+ src_ptr = src_ptr_ref;
+ src_ptr_ref += 16;
+ dst_ptr = dst_ptr_ref;
+ dst_ptr_ref += 16;
+
+ opointer[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
+ src_ptr += src_pixels_per_line;
+ curr = 0;
+ }
+
+ __m256i sum1 = _mm256_setzero_si256();
+ __m256i sse1 = _mm256_setzero_si256();
+
+ for (i = 0; i < 16 / inc; ++i) {
+ prev = curr;
+ curr = (curr == 0) ? 1 : 0;
+ opointer[curr] = _mm256_loadu_si256((const __m256i *)src_ptr);
+ src_ptr += src_pixels_per_line;
+
+ __m256i V_S_M1 =
+ _mm256_unpacklo_epi16(opointer[prev], opointer[curr]);
+ __m256i V_S_M2 =
+ _mm256_unpackhi_epi16(opointer[prev], opointer[curr]);
+
+ __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2);
+ __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2);
+
+ __m256i V_S_S1 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7);
+ __m256i V_S_S2 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7);
+
+ __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2);
+
+ __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+ dst_ptr += dst_stride;
+
+ __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+ __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+ sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+ sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+ }
+
+ __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ sum_long += _mm_extract_epi32(v_d, 0);
+ sse_long += _mm_extract_epi32(v_d, 1);
+ }
+
+ rshift = get_msb(output_height) + get_msb(output_width);
+ }
+ } else if (xoffset == 4) {
+ if (yoffset == 0) { // xoffset==4 && yoffset==0
+ range = output_width / 16;
+ if (output_height == 8) inc = 2;
+ if (output_height == 4) inc = 4;
+ for (j = 0; j < range * output_height * inc / 16; j++) {
+ if (j % (output_height * inc / 16) == 0) {
+ src_ptr = src_ptr_ref;
+ src_ptr_ref += 16;
+ dst_ptr = dst_ptr_ref;
+ dst_ptr_ref += 16;
+ __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+
+ opointer[0] = _mm256_avg_epu16(V_H_D1, V_H_D2);
+
+ curr = 0;
+ }
+
+ __m256i sum1 = _mm256_setzero_si256();
+ __m256i sse1 = _mm256_setzero_si256();
+
+ for (i = 0; i < 16 / inc; ++i) {
+ prev = curr;
+ curr = (curr == 0) ? 1 : 0;
+ __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+
+ opointer[curr] = _mm256_avg_epu16(V_V_D1, V_V_D2);
+
+ __m256i V_S_M1 =
+ _mm256_unpacklo_epi16(opointer[prev], opointer[curr]);
+ __m256i V_S_M2 =
+ _mm256_unpackhi_epi16(opointer[prev], opointer[curr]);
+
+ __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2);
+ __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2);
+
+ __m256i V_S_S1 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7);
+ __m256i V_S_S2 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7);
+
+ __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2);
+
+ __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+ dst_ptr += dst_stride;
+
+ __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+ __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+ sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+ sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+ }
+
+ __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ sum_long += _mm_extract_epi32(v_d, 0);
+ sse_long += _mm_extract_epi32(v_d, 1);
+ }
+
+ rshift = get_msb(output_height) + get_msb(output_width);
+
+ } else if (yoffset == 4) { // xoffset==4 && yoffset==4
+ range = output_width / 16;
+ if (output_height == 8) inc = 2;
+ if (output_height == 4) inc = 4;
+ for (j = 0; j < range * output_height * inc / 16; j++) {
+ if (j % (output_height * inc / 16) == 0) {
+ src_ptr = src_ptr_ref;
+ src_ptr_ref += 16;
+ dst_ptr = dst_ptr_ref;
+ dst_ptr_ref += 16;
+
+ __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+ opointer[0] = _mm256_avg_epu16(V_H_D1, V_H_D2);
+ curr = 0;
+ }
+
+ __m256i sum1 = _mm256_setzero_si256();
+ __m256i sse1 = _mm256_setzero_si256();
+
+ for (i = 0; i < 16 / inc; ++i) {
+ prev = curr;
+ curr = (curr == 0) ? 1 : 0;
+ __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+ opointer[curr] = _mm256_avg_epu16(V_V_D1, V_V_D2);
+ __m256i V_S_SRC = _mm256_avg_epu16(opointer[curr], opointer[prev]);
+
+ __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+ dst_ptr += dst_stride;
+ __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+ __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+ sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+ sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+ }
+
+ __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ sum_long += _mm_extract_epi32(v_d, 0);
+ sse_long += _mm_extract_epi32(v_d, 1);
+ }
+
+ rshift = get_msb(output_height) + get_msb(output_width);
+
+ } else { // xoffset==4 && yoffset==1,2,3,5,6,7
+ range = output_width / 16;
+ if (output_height == 8) inc = 2;
+ if (output_height == 4) inc = 4;
+ for (j = 0; j < range * output_height * inc / 16; j++) {
+ if (j % (output_height * inc / 16) == 0) {
+ src_ptr = src_ptr_ref;
+ src_ptr_ref += 16;
+ dst_ptr = dst_ptr_ref;
+ dst_ptr_ref += 16;
+
+ __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+ opointer[0] = _mm256_avg_epu16(V_H_D1, V_H_D2);
+ curr = 0;
+ }
+
+ __m256i sum1 = _mm256_setzero_si256();
+ __m256i sse1 = _mm256_setzero_si256();
+
+ for (i = 0; i < 16 / inc; ++i) {
+ prev = curr;
+ curr = (curr == 0) ? 1 : 0;
+ __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+ opointer[curr] = _mm256_avg_epu16(V_V_D1, V_V_D2);
+
+ __m256i V_S_M1 =
+ _mm256_unpacklo_epi16(opointer[prev], opointer[curr]);
+ __m256i V_S_M2 =
+ _mm256_unpackhi_epi16(opointer[prev], opointer[curr]);
+
+ __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2);
+ __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2);
+
+ __m256i V_S_S1 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7);
+ __m256i V_S_S2 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7);
+
+ __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2);
+
+ __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+ dst_ptr += dst_stride;
+
+ __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+ __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+ sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+ sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+ }
+
+ __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ sum_long += _mm_extract_epi32(v_d, 0);
+ sse_long += _mm_extract_epi32(v_d, 1);
+ }
+
+ rshift = get_msb(output_height) + get_msb(output_width);
+ }
+ } else if (yoffset == 0) { // xoffset==1,2,3,5,6,7 && yoffset==0
+ range = output_width / 16;
+ if (output_height == 8) inc = 2;
+ if (output_height == 4) inc = 4;
+ for (j = 0; j < range * output_height * inc / 16; j++) {
+ if (j % (output_height * inc / 16) == 0) {
+ src_ptr = src_ptr_ref;
+ src_ptr_ref += 16;
+ dst_ptr = dst_ptr_ref;
+ dst_ptr_ref += 16;
+
+ curr = 0;
+ }
+
+ __m256i sum1 = _mm256_setzero_si256();
+ __m256i sse1 = _mm256_setzero_si256();
+
+ for (i = 0; i < 16 / inc; ++i) {
+ __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+ __m256i V_V_M1 = _mm256_unpacklo_epi16(V_V_D1, V_V_D2);
+ __m256i V_V_M2 = _mm256_unpackhi_epi16(V_V_D1, V_V_D2);
+ __m256i V_V_MAD1 = _mm256_madd_epi16(V_V_M1, filter1);
+ __m256i V_V_MAD2 = _mm256_madd_epi16(V_V_M2, filter1);
+ __m256i V_V_S1 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD1, rbias), 7);
+ __m256i V_V_S2 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD2, rbias), 7);
+ opointer[curr] = _mm256_packus_epi32(V_V_S1, V_V_S2);
+
+ __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+ dst_ptr += dst_stride;
+ __m256i V_R_SUB = _mm256_sub_epi16(opointer[curr], V_D_DST);
+ __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+ sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+ sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+ }
+
+ __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ sum_long += _mm_extract_epi32(v_d, 0);
+ sse_long += _mm_extract_epi32(v_d, 1);
+ }
+
+ rshift = get_msb(output_height) + get_msb(output_width);
+
+ } else if (yoffset == 4) { // xoffset==1,2,3,5,6,7 && yoffset==4
+ range = output_width / 16;
+ if (output_height == 8) inc = 2;
+ if (output_height == 4) inc = 4;
+ for (j = 0; j < range * output_height * inc / 16; j++) {
+ if (j % (output_height * inc / 16) == 0) {
+ src_ptr = src_ptr_ref;
+ src_ptr_ref += 16;
+ dst_ptr = dst_ptr_ref;
+ dst_ptr_ref += 16;
+
+ __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+
+ __m256i V_H_M1 = _mm256_unpacklo_epi16(V_H_D1, V_H_D2);
+ __m256i V_H_M2 = _mm256_unpackhi_epi16(V_H_D1, V_H_D2);
+
+ __m256i V_H_MAD1 = _mm256_madd_epi16(V_H_M1, filter1);
+ __m256i V_H_MAD2 = _mm256_madd_epi16(V_H_M2, filter1);
+
+ __m256i V_H_S1 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD1, rbias), 7);
+ __m256i V_H_S2 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD2, rbias), 7);
+
+ opointer[0] = _mm256_packus_epi32(V_H_S1, V_H_S2);
+
+ curr = 0;
+ }
+
+ __m256i sum1 = _mm256_setzero_si256();
+ __m256i sse1 = _mm256_setzero_si256();
+
+ for (i = 0; i < 16 / inc; ++i) {
+ prev = curr;
+ curr = (curr == 0) ? 1 : 0;
+ __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+ __m256i V_V_M1 = _mm256_unpacklo_epi16(V_V_D1, V_V_D2);
+ __m256i V_V_M2 = _mm256_unpackhi_epi16(V_V_D1, V_V_D2);
+ __m256i V_V_MAD1 = _mm256_madd_epi16(V_V_M1, filter1);
+ __m256i V_V_MAD2 = _mm256_madd_epi16(V_V_M2, filter1);
+ __m256i V_V_S1 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD1, rbias), 7);
+ __m256i V_V_S2 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD2, rbias), 7);
+ opointer[curr] = _mm256_packus_epi32(V_V_S1, V_V_S2);
+
+ __m256i V_S_SRC = _mm256_avg_epu16(opointer[prev], opointer[curr]);
+
+ __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+ dst_ptr += dst_stride;
+
+ __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+ __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+ sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+ sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+ }
+
+ __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ sum_long += _mm_extract_epi32(v_d, 0);
+ sse_long += _mm_extract_epi32(v_d, 1);
+ }
+
+ rshift = get_msb(output_height) + get_msb(output_width);
+
+ } else { // xoffset==1,2,3,5,6,7 && yoffset==1,2,3,5,6,7
+ range = output_width / 16;
+ if (output_height == 8) inc = 2;
+ if (output_height == 4) inc = 4;
+ unsigned int nloop = 16 / inc;
+ for (j = 0; j < range * output_height * inc / 16; j++) {
+ if (j % (output_height * inc / 16) == 0) {
+ src_ptr = src_ptr_ref;
+ src_ptr_ref += 16;
+ dst_ptr = dst_ptr_ref;
+ dst_ptr_ref += 16;
+
+ __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+
+ __m256i V_H_M1 = _mm256_unpacklo_epi16(V_H_D1, V_H_D2);
+ __m256i V_H_M2 = _mm256_unpackhi_epi16(V_H_D1, V_H_D2);
+
+ __m256i V_H_MAD1 = _mm256_madd_epi16(V_H_M1, filter1);
+ __m256i V_H_MAD2 = _mm256_madd_epi16(V_H_M2, filter1);
+
+ __m256i V_H_S1 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD1, rbias), 7);
+ __m256i V_H_S2 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD2, rbias), 7);
+
+ opointer[0] = _mm256_packus_epi32(V_H_S1, V_H_S2);
+
+ curr = 0;
+ }
+
+ __m256i sum1 = _mm256_setzero_si256();
+ __m256i sse1 = _mm256_setzero_si256();
+
+ for (i = 0; i < nloop; ++i) {
+ prev = curr;
+ curr = !curr;
+ __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
+ __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
+ src_ptr += src_pixels_per_line;
+ __m256i V_V_M1 = _mm256_unpacklo_epi16(V_V_D1, V_V_D2);
+ __m256i V_V_M2 = _mm256_unpackhi_epi16(V_V_D1, V_V_D2);
+ __m256i V_V_MAD1 = _mm256_madd_epi16(V_V_M1, filter1);
+ __m256i V_V_MAD2 = _mm256_madd_epi16(V_V_M2, filter1);
+ __m256i V_V_S1 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD1, rbias), 7);
+ __m256i V_V_S2 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD2, rbias), 7);
+ opointer[curr] = _mm256_packus_epi32(V_V_S1, V_V_S2);
+
+ __m256i V_S_M1 = _mm256_unpacklo_epi16(opointer[prev], opointer[curr]);
+ __m256i V_S_M2 = _mm256_unpackhi_epi16(opointer[prev], opointer[curr]);
+
+ __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2);
+ __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2);
+
+ __m256i V_S_S1 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7);
+ __m256i V_S_S2 =
+ _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7);
+
+ __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2);
+
+ __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
+ dst_ptr += dst_stride;
+
+ __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
+ __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
+
+ sum1 = _mm256_add_epi16(sum1, V_R_SUB);
+ sse1 = _mm256_add_epi32(sse1, V_R_MAD);
+ }
+
+ __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ sum_long += _mm_extract_epi32(v_d, 0);
+ sse_long += _mm_extract_epi32(v_d, 1);
+ }
+
+ rshift = get_msb(output_height) + get_msb(output_width);
+ }
+
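+  // Scale the accumulated SSE by 1/16 and the sum by 1/4 (the same 10-bit
+  // normalization used by highbd_10_variance_avx2 below), then return
+  // sse - sum^2 / (w * h); rshift == log2(w * h).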
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
+ int sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
+
+ int32_t var = *sse - (uint32_t)(((int64_t)sum * sum) >> rshift);
+
+ return (var > 0) ? var : 0;
+}
+
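+// Computes the pixel-difference sum and SSE of an 8x8 block. Two rows of src
+// and ref are packed into one 256-bit register per iteration; the differences
+// accumulate in 16-bit lanes and the squared differences in 32-bit lanes.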
+void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum) {
+ __m256i v_sum_d = _mm256_setzero_si256();
+ __m256i v_sse_d = _mm256_setzero_si256();
+ for (int i = 0; i < 8; i += 2) {
+ const __m128i v_p_a0 = _mm_loadu_si128((const __m128i *)src);
+ const __m128i v_p_a1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
+ const __m128i v_p_b0 = _mm_loadu_si128((const __m128i *)ref);
+ const __m128i v_p_b1 = _mm_loadu_si128((const __m128i *)(ref + ref_stride));
+ __m256i v_p_a = _mm256_castsi128_si256(v_p_a0);
+ __m256i v_p_b = _mm256_castsi128_si256(v_p_b0);
+ v_p_a = _mm256_inserti128_si256(v_p_a, v_p_a1, 1);
+ v_p_b = _mm256_inserti128_si256(v_p_b, v_p_b1, 1);
+ const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b);
+ const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff);
+ v_sum_d = _mm256_add_epi16(v_sum_d, v_diff);
+ v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff);
+ src += src_stride * 2;
+ ref += ref_stride * 2;
+ }
+ __m256i v_sum00 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_sum_d));
+ __m256i v_sum01 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(v_sum_d, 1));
+ __m256i v_sum0 = _mm256_add_epi32(v_sum00, v_sum01);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ *sum = _mm_extract_epi32(v_d, 0);
+ *sse = _mm_extract_epi32(v_d, 1);
+}
+
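+// Computes the pixel-difference sum and SSE of a 16x16 block, one 16-pixel
+// row per iteration, before the final horizontal reduction into *sum and
+// *sse.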
+void aom_highbd_calc16x16var_avx2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum) {
+ __m256i v_sum_d = _mm256_setzero_si256();
+ __m256i v_sse_d = _mm256_setzero_si256();
+ const __m256i one = _mm256_set1_epi16(1);
+ for (int i = 0; i < 16; ++i) {
+ const __m256i v_p_a = _mm256_loadu_si256((const __m256i *)src);
+ const __m256i v_p_b = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b);
+ const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff);
+ v_sum_d = _mm256_add_epi16(v_sum_d, v_diff);
+ v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff);
+ src += src_stride;
+ ref += ref_stride;
+ }
+ __m256i v_sum0 = _mm256_madd_epi16(v_sum_d, one);
+ __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d);
+ __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d);
+ __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
+ const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
+ const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
+ __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
+ v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
+ *sum = _mm_extract_epi32(v_d, 0);
+ *sse = _mm_extract_epi32(v_d, 1);
+}
+
+static void highbd_10_variance_avx2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride, int w,
+ int h, uint32_t *sse, int *sum,
+ high_variance_fn_t var_fn, int block_size) {
+ int i, j;
+ uint64_t sse_long = 0;
+ int32_t sum_long = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+ ref_stride, &sse0, &sum0);
+ sse_long += sse0;
+ sum_long += sum0;
+ }
+ }
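+  // 10-bit samples carry two extra bits: scale the sum by 1/4 and the SSE by
+  // 1/16 so the results are on the 8-bit scale.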
+ *sum = ROUND_POWER_OF_TWO(sum_long, 2);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
+}
+
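+// shift == log2(w * h); variance = sse - sum^2 / (w * h), clamped at zero.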
+#define VAR_FN(w, h, block_size, shift) \
+ uint32_t aom_highbd_10_variance##w##x##h##_avx2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_10_variance_avx2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+VAR_FN(128, 128, 16, 14)
+VAR_FN(128, 64, 16, 13)
+VAR_FN(64, 128, 16, 13)
+VAR_FN(64, 64, 16, 12)
+VAR_FN(64, 32, 16, 11)
+VAR_FN(32, 64, 16, 11)
+VAR_FN(32, 32, 16, 10)
+VAR_FN(32, 16, 16, 9)
+VAR_FN(16, 32, 16, 9)
+VAR_FN(16, 16, 16, 8)
+VAR_FN(16, 8, 8, 7)
+VAR_FN(8, 16, 8, 7)
+VAR_FN(8, 8, 8, 6)
+
+#if !CONFIG_REALTIME_ONLY
+VAR_FN(16, 64, 16, 10)
+VAR_FN(32, 8, 8, 8)
+VAR_FN(64, 16, 16, 10)
+VAR_FN(8, 32, 8, 8)
+#endif // !CONFIG_REALTIME_ONLY
+
+#undef VAR_FN
+
+#define SSE2_HEIGHT(H) \
+ uint32_t aom_highbd_10_sub_pixel_variance8x##H##_sse2( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr);
+
+SSE2_HEIGHT(8)
+SSE2_HEIGHT(16)
+
+#undef SSE2_HEIGHT
+
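+// 8-wide blocks use the SSE2 kernels declared above; all wider blocks take
+// the AVX2 bilinear-filter path.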
+#define HIGHBD_SUBPIX_VAR(W, H) \
+ uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_avx2( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *dst, int dst_stride, uint32_t *sse) { \
+ if (W == 8 && H == 16) \
+ return aom_highbd_10_sub_pixel_variance8x16_sse2( \
+ src, src_stride, xoffset, yoffset, dst, dst_stride, sse); \
+ else if (W == 8 && H == 8) \
+ return aom_highbd_10_sub_pixel_variance8x8_sse2( \
+ src, src_stride, xoffset, yoffset, dst, dst_stride, sse); \
+ else \
+ return aom_highbd_var_filter_block2d_bil_avx2( \
+ src, src_stride, 1, H, W, xoffset, yoffset, dst, dst_stride, sse); \
+ }
+
+HIGHBD_SUBPIX_VAR(128, 128)
+HIGHBD_SUBPIX_VAR(128, 64)
+HIGHBD_SUBPIX_VAR(64, 128)
+HIGHBD_SUBPIX_VAR(64, 64)
+HIGHBD_SUBPIX_VAR(64, 32)
+HIGHBD_SUBPIX_VAR(32, 64)
+HIGHBD_SUBPIX_VAR(32, 32)
+HIGHBD_SUBPIX_VAR(32, 16)
+HIGHBD_SUBPIX_VAR(16, 32)
+HIGHBD_SUBPIX_VAR(16, 16)
+HIGHBD_SUBPIX_VAR(16, 8)
+HIGHBD_SUBPIX_VAR(8, 16)
+HIGHBD_SUBPIX_VAR(8, 8)
+
+#undef HIGHBD_SUBPIX_VAR
+
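+// Sum of squared differences for a 4xh block of 16-bit pixels: four 4-pixel
+// rows are gathered into one 256-bit register per iteration and the squared
+// differences accumulate in 64-bit lanes.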
+uint64_t aom_mse_4xh_16bit_highbd_avx2(uint16_t *dst, int dstride,
+ uint16_t *src, int sstride, int h) {
+ uint64_t sum = 0;
+ __m128i reg0_4x16, reg1_4x16, reg2_4x16, reg3_4x16;
+ __m256i src0_8x16, src1_8x16, src_16x16;
+ __m256i dst0_8x16, dst1_8x16, dst_16x16;
+ __m256i res0_4x64, res1_4x64, res2_4x64, res3_4x64;
+ __m256i sub_result;
+ const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+ __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+ for (int i = 0; i < h; i += 4) {
+ reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 0) * dstride]));
+ reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 1) * dstride]));
+ reg2_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 2) * dstride]));
+ reg3_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 3) * dstride]));
+ dst0_8x16 =
+ _mm256_castsi128_si256(_mm_unpacklo_epi64(reg0_4x16, reg1_4x16));
+ dst1_8x16 =
+ _mm256_castsi128_si256(_mm_unpacklo_epi64(reg2_4x16, reg3_4x16));
+ dst_16x16 = _mm256_permute2x128_si256(dst0_8x16, dst1_8x16, 0x20);
+
+ reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride]));
+ reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride]));
+ reg2_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 2) * sstride]));
+ reg3_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 3) * sstride]));
+ src0_8x16 =
+ _mm256_castsi128_si256(_mm_unpacklo_epi64(reg0_4x16, reg1_4x16));
+ src1_8x16 =
+ _mm256_castsi128_si256(_mm_unpacklo_epi64(reg2_4x16, reg3_4x16));
+ src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20);
+
+ sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16));
+
+ src_16x16 = _mm256_unpacklo_epi16(sub_result, zeros);
+ dst_16x16 = _mm256_unpackhi_epi16(sub_result, zeros);
+
+ src_16x16 = _mm256_madd_epi16(src_16x16, src_16x16);
+ dst_16x16 = _mm256_madd_epi16(dst_16x16, dst_16x16);
+
+ res0_4x64 = _mm256_unpacklo_epi32(src_16x16, zeros);
+ res1_4x64 = _mm256_unpackhi_epi32(src_16x16, zeros);
+ res2_4x64 = _mm256_unpacklo_epi32(dst_16x16, zeros);
+ res3_4x64 = _mm256_unpackhi_epi32(dst_16x16, zeros);
+
+ square_result = _mm256_add_epi64(
+ square_result,
+ _mm256_add_epi64(
+ _mm256_add_epi64(_mm256_add_epi64(res0_4x64, res1_4x64), res2_4x64),
+ res3_4x64));
+ }
+ const __m128i sum_2x64 =
+ _mm_add_epi64(_mm256_castsi256_si128(square_result),
+ _mm256_extracti128_si256(square_result, 1));
+ const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+ xx_storel_64(&sum, sum_1x64);
+ return sum;
+}
+
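+// Same as above for 8xh blocks: two 8-pixel rows per 256-bit register.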
+uint64_t aom_mse_8xh_16bit_highbd_avx2(uint16_t *dst, int dstride,
+ uint16_t *src, int sstride, int h) {
+ uint64_t sum = 0;
+ __m256i src0_8x16, src1_8x16, src_16x16;
+ __m256i dst0_8x16, dst1_8x16, dst_16x16;
+ __m256i res0_4x64, res1_4x64, res2_4x64, res3_4x64;
+ __m256i sub_result;
+ const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+ __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+
+ for (int i = 0; i < h; i += 2) {
+ dst0_8x16 =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&dst[i * dstride]));
+ dst1_8x16 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)&dst[(i + 1) * dstride]));
+ dst_16x16 = _mm256_permute2x128_si256(dst0_8x16, dst1_8x16, 0x20);
+
+ src0_8x16 =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&src[i * sstride]));
+ src1_8x16 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)&src[(i + 1) * sstride]));
+ src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20);
+
+ sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16));
+
+ src_16x16 = _mm256_unpacklo_epi16(sub_result, zeros);
+ dst_16x16 = _mm256_unpackhi_epi16(sub_result, zeros);
+
+ src_16x16 = _mm256_madd_epi16(src_16x16, src_16x16);
+ dst_16x16 = _mm256_madd_epi16(dst_16x16, dst_16x16);
+
+ res0_4x64 = _mm256_unpacklo_epi32(src_16x16, zeros);
+ res1_4x64 = _mm256_unpackhi_epi32(src_16x16, zeros);
+ res2_4x64 = _mm256_unpacklo_epi32(dst_16x16, zeros);
+ res3_4x64 = _mm256_unpackhi_epi32(dst_16x16, zeros);
+
+ square_result = _mm256_add_epi64(
+ square_result,
+ _mm256_add_epi64(
+ _mm256_add_epi64(_mm256_add_epi64(res0_4x64, res1_4x64), res2_4x64),
+ res3_4x64));
+ }
+
+ const __m128i sum_2x64 =
+ _mm_add_epi64(_mm256_castsi256_si128(square_result),
+ _mm256_extracti128_si256(square_result, 1));
+ const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+ xx_storel_64(&sum, sum_1x64);
+ return sum;
+}
+
+uint64_t aom_mse_wxh_16bit_highbd_avx2(uint16_t *dst, int dstride,
+ uint16_t *src, int sstride, int w,
+ int h) {
+  assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
+         "w must be 8 or 4 and h must be 8 or 4");
+ switch (w) {
+ case 4: return aom_mse_4xh_16bit_highbd_avx2(dst, dstride, src, sstride, h);
+ case 8: return aom_mse_8xh_16bit_highbd_avx2(dst, dstride, src, sstride, h);
+ default: assert(0 && "unsupported width"); return -1;
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm b/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm
new file mode 100644
index 0000000000..ec6c7e9fa7
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_impl_sse2.asm
@@ -0,0 +1,318 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+
+%include "aom_ports/x86_abi_support.asm"
+
+SECTION .text
+
+;unsigned int aom_highbd_calc16x16var_sse2
+;(
+;    uint16_t *src_ptr,
+; int source_stride,
+;    uint16_t *ref_ptr,
+; int recon_stride,
+; unsigned int * SSE,
+; int * Sum
+;)
+globalsym(aom_highbd_calc16x16var_sse2)
+sym(aom_highbd_calc16x16var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[source_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+ add rax, rax ; source stride in bytes
+ add rdx, rdx ; recon stride in bytes
+
+ ; Prefetch data
+ prefetcht0 [rsi]
+ prefetcht0 [rsi+16]
+ prefetcht0 [rsi+rax]
+ prefetcht0 [rsi+rax+16]
+ lea rbx, [rsi+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rax]
+ prefetcht0 [rbx+rax+16]
+
+ prefetcht0 [rdi]
+ prefetcht0 [rdi+16]
+ prefetcht0 [rdi+rdx]
+ prefetcht0 [rdi+rdx+16]
+ lea rbx, [rdi+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rdx]
+ prefetcht0 [rbx+rdx+16]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
+ mov rcx, 16
+
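+    ; Each iteration processes two rows of 16 pixels (16-bit samples)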
+.var16loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rdi]
+
+ lea rbx, [rsi+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rax]
+ prefetcht0 [rbx+rax+16]
+ lea rbx, [rdi+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+16]
+ prefetcht0 [rbx+rdx]
+ prefetcht0 [rbx+rdx+16]
+
+ pxor xmm5, xmm5
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+16]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+16]
+ paddd xmm6, xmm1
+
+ psubw xmm3, xmm2
+ movdqu xmm1, XMMWORD PTR [rsi+rax]
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ movdqu xmm2, XMMWORD PTR [rdi+rdx]
+ paddd xmm6, xmm3
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+rax+16]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+rdx+16]
+ paddd xmm6, xmm1
+
+ psubw xmm3, xmm2
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ paddd xmm6, xmm3
+
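+        ; Sign-extend the 16-bit row sums in xmm5 to 32 bits and add to xmm7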
+ movdqa xmm1, xmm5
+ movdqa xmm2, xmm5
+ pcmpgtw xmm1, xmm0
+ pcmpeqw xmm2, xmm0
+ por xmm1, xmm2
+ pcmpeqw xmm1, xmm0
+ movdqa xmm2, xmm5
+ punpcklwd xmm5, xmm1
+ punpckhwd xmm2, xmm1
+ paddd xmm7, xmm5
+ paddd xmm7, xmm2
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ sub rcx, 2
+ jnz .var16loop
+
+ movdqa xmm4, xmm6
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm4, xmm0
+ movdqa xmm5, xmm7
+
+ paddd xmm6, xmm4
+ punpckldq xmm7, xmm0
+
+ punpckhdq xmm5, xmm0
+ paddd xmm7, xmm5
+
+ movdqa xmm4, xmm6
+ movdqa xmm5, xmm7
+
+ psrldq xmm4, 8
+ psrldq xmm5, 8
+
+ paddd xmm6, xmm4
+ paddd xmm7, xmm5
+
+ mov rdi, arg(4) ; [SSE]
+ mov rax, arg(5) ; [Sum]
+
+ movd DWORD PTR [rdi], xmm6
+ movd DWORD PTR [rax], xmm7
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+
+;unsigned int aom_highbd_calc8x8var_sse2
+;(
+;    uint16_t *src_ptr,
+; int source_stride,
+;    uint16_t *ref_ptr,
+; int recon_stride,
+; unsigned int * SSE,
+; int * Sum
+;)
+globalsym(aom_highbd_calc8x8var_sse2)
+sym(aom_highbd_calc8x8var_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;[src_ptr]
+ mov rdi, arg(2) ;[ref_ptr]
+
+ movsxd rax, DWORD PTR arg(1) ;[source_stride]
+ movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
+ add rax, rax ; source stride in bytes
+ add rdx, rdx ; recon stride in bytes
+
+ ; Prefetch data
+ prefetcht0 [rsi]
+ prefetcht0 [rsi+rax]
+ lea rbx, [rsi+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+
+ prefetcht0 [rdi]
+ prefetcht0 [rdi+rdx]
+ lea rbx, [rdi+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+
+ pxor xmm0, xmm0 ; clear xmm0 for unpack
+ pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
+
+ pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
+ mov rcx, 8
+
+.var8loop:
+ movdqu xmm1, XMMWORD PTR [rsi]
+ movdqu xmm2, XMMWORD PTR [rdi]
+
+ lea rbx, [rsi+rax*4]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+ lea rbx, [rbx+rax*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rax]
+ lea rbx, [rdi+rdx*4]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+ lea rbx, [rbx+rdx*2]
+ prefetcht0 [rbx]
+ prefetcht0 [rbx+rdx]
+
+ pxor xmm5, xmm5
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+rax]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+rdx]
+ paddd xmm6, xmm1
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+
+ psubw xmm3, xmm2
+ movdqu xmm1, XMMWORD PTR [rsi]
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ movdqu xmm2, XMMWORD PTR [rdi]
+ paddd xmm6, xmm3
+
+ psubw xmm1, xmm2
+ movdqu xmm3, XMMWORD PTR [rsi+rax]
+ paddw xmm5, xmm1
+ pmaddwd xmm1, xmm1
+ movdqu xmm2, XMMWORD PTR [rdi+rdx]
+ paddd xmm6, xmm1
+
+ psubw xmm3, xmm2
+ paddw xmm5, xmm3
+ pmaddwd xmm3, xmm3
+ paddd xmm6, xmm3
+
+ movdqa xmm1, xmm5
+ movdqa xmm2, xmm5
+ pcmpgtw xmm1, xmm0
+ pcmpeqw xmm2, xmm0
+ por xmm1, xmm2
+ pcmpeqw xmm1, xmm0
+ movdqa xmm2, xmm5
+ punpcklwd xmm5, xmm1
+ punpckhwd xmm2, xmm1
+ paddd xmm7, xmm5
+ paddd xmm7, xmm2
+
+ lea rsi, [rsi + 2*rax]
+ lea rdi, [rdi + 2*rdx]
+ sub rcx, 4
+ jnz .var8loop
+
+ movdqa xmm4, xmm6
+ punpckldq xmm6, xmm0
+
+ punpckhdq xmm4, xmm0
+ movdqa xmm5, xmm7
+
+ paddd xmm6, xmm4
+ punpckldq xmm7, xmm0
+
+ punpckhdq xmm5, xmm0
+ paddd xmm7, xmm5
+
+ movdqa xmm4, xmm6
+ movdqa xmm5, xmm7
+
+ psrldq xmm4, 8
+ psrldq xmm5, 8
+
+ paddd xmm6, xmm4
+ paddd xmm7, xmm5
+
+ mov rdi, arg(4) ; [SSE]
+ mov rax, arg(5) ; [Sum]
+
+ movd DWORD PTR [rdi], xmm6
+ movd DWORD PTR [rax], xmm7
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ pop rbx
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
new file mode 100644
index 0000000000..e897aab645
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse2.c
@@ -0,0 +1,735 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_ports/mem.h"
+
+#include "av1/common/filter.h"
+#include "av1/common/reconinter.h"
+
+typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+uint32_t aom_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+uint32_t aom_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride,
+ uint32_t *sse, int *sum);
+
+static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride, int w,
+ int h, uint32_t *sse, int *sum,
+ high_variance_fn_t var_fn, int block_size) {
+ int i, j;
+
+ *sse = 0;
+ *sum = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+ ref_stride, &sse0, &sum0);
+ *sse += sse0;
+ *sum += sum0;
+ }
+ }
+}
+
+static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride, int w,
+ int h, uint32_t *sse, int *sum,
+ high_variance_fn_t var_fn, int block_size) {
+ int i, j;
+ uint64_t sse_long = 0;
+ int32_t sum_long = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+ ref_stride, &sse0, &sum0);
+ sse_long += sse0;
+ sum_long += sum0;
+ }
+ }
+ *sum = ROUND_POWER_OF_TWO(sum_long, 2);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
+}
+
+static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
+ const uint16_t *ref, int ref_stride, int w,
+ int h, uint32_t *sse, int *sum,
+ high_variance_fn_t var_fn, int block_size) {
+ int i, j;
+ uint64_t sse_long = 0;
+ int32_t sum_long = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ unsigned int sse0;
+ int sum0;
+ var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
+ ref_stride, &sse0, &sum0);
+ sse_long += sse0;
+ sum_long += sum0;
+ }
+ }
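+  // 12-bit samples carry four extra bits: scale the sum by 1/16 and the SSE
+  // by 1/256 so the results are on the 8-bit scale.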
+ *sum = ROUND_POWER_OF_TWO(sum_long, 4);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
+}
+
+#define VAR_FN(w, h, block_size, shift) \
+ uint32_t aom_highbd_8_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_8_variance_sse2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \
+ } \
+ \
+ uint32_t aom_highbd_10_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_10_variance_sse2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t aom_highbd_12_variance##w##x##h##_sse2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, uint32_t *sse) { \
+ int sum; \
+ int64_t var; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ highbd_12_variance_sse2( \
+ src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+ aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+VAR_FN(128, 128, 16, 14)
+VAR_FN(128, 64, 16, 13)
+VAR_FN(64, 128, 16, 13)
+VAR_FN(64, 64, 16, 12)
+VAR_FN(64, 32, 16, 11)
+VAR_FN(32, 64, 16, 11)
+VAR_FN(32, 32, 16, 10)
+VAR_FN(32, 16, 16, 9)
+VAR_FN(16, 32, 16, 9)
+VAR_FN(16, 16, 16, 8)
+VAR_FN(16, 8, 8, 7)
+VAR_FN(8, 16, 8, 7)
+VAR_FN(8, 8, 8, 6)
+VAR_FN(8, 32, 8, 8)
+VAR_FN(32, 8, 8, 8)
+VAR_FN(16, 64, 16, 10)
+VAR_FN(64, 16, 16, 10)
+
+#undef VAR_FN
+
+unsigned int aom_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+ aom_highbd_calc16x16var_sse2, 16);
+ return *sse;
+}
+
+unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+ aom_highbd_calc16x16var_sse2, 16);
+ return *sse;
+}
+
+unsigned int aom_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
+ aom_highbd_calc16x16var_sse2, 16);
+ return *sse;
+}
+
+unsigned int aom_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
+ aom_highbd_calc8x8var_sse2, 8);
+ return *sse;
+}
+
+unsigned int aom_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
+ aom_highbd_calc8x8var_sse2, 8);
+ return *sse;
+}
+
+unsigned int aom_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
+ const uint8_t *ref8, int ref_stride,
+ unsigned int *sse) {
+ int sum;
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum,
+ aom_highbd_calc8x8var_sse2, 8);
+ return *sse;
+}
+
+// The two unused parameters are placeholders for the PIC-enabled build.
+// These declare the functions implemented in
+// highbd_subpel_variance_impl_sse2.asm
+#define DECL(w, opt) \
+ int aom_highbd_sub_pixel_variance##w##xh_##opt( \
+ const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+ const uint16_t *dst, ptrdiff_t dst_stride, int height, \
+ unsigned int *sse, void *unused0, void *unused);
+#define DECLS(opt) \
+ DECL(8, opt) \
+ DECL(16, opt)
+
+DECLS(sse2)
+
+#undef DECLS
+#undef DECL
+
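+// Wide blocks are split into wf-wide column strips, each handled by the asm
+// kernel; wlog2 + hlog2 == log2(w * h). The 12-bit variant additionally
+// processes the rows in tiles of 16 and accumulates the SSE in a 64-bit
+// total.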
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+ uint32_t aom_highbd_8_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ int se = 0; \
+ unsigned int sse = 0; \
+ unsigned int sse2; \
+ int row_rep = (w > 64) ? 2 : 1; \
+ for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \
+ src += wd_64 * 64; \
+ dst += wd_64 * 64; \
+ int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse2, \
+ NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf) { \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \
+ dst_stride, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \
+ dst_stride, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ } \
+ *sse_ptr = sse; \
+ return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \
+ } \
+ \
+ uint32_t aom_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+ int64_t var; \
+ uint32_t sse; \
+ uint64_t long_sse = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ int se = 0; \
+ int row_rep = (w > 64) ? 2 : 1; \
+ for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \
+ src += wd_64 * 64; \
+ dst += wd_64 * 64; \
+ int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \
+ NULL); \
+ se += se2; \
+ long_sse += sse; \
+ if (w > wf) { \
+ uint32_t sse2; \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, h, \
+ &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \
+ dst_stride, h, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \
+ dst_stride, h, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ } \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 2); \
+ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 4); \
+ *sse_ptr = sse; \
+ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t aom_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+ int start_row; \
+ uint32_t sse; \
+ int se = 0; \
+ int64_t var; \
+ uint64_t long_sse = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ int row_rep = (w > 64) ? 2 : 1; \
+ for (start_row = 0; start_row < h; start_row += 16) { \
+ uint32_t sse2; \
+ int height = h - start_row < 16 ? h - start_row : 16; \
+ uint16_t *src_tmp = src + (start_row * src_stride); \
+ uint16_t *dst_tmp = dst + (start_row * dst_stride); \
+ for (int wd_64 = 0; wd_64 < row_rep; wd_64++) { \
+ src_tmp += wd_64 * 64; \
+ dst_tmp += wd_64 * 64; \
+ int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src_tmp, src_stride, x_offset, y_offset, dst_tmp, dst_stride, \
+ height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf) { \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src_tmp + wf, src_stride, x_offset, y_offset, dst_tmp + wf, \
+ dst_stride, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src_tmp + 2 * wf, src_stride, x_offset, y_offset, \
+ dst_tmp + 2 * wf, dst_stride, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt( \
+ src_tmp + 3 * wf, src_stride, x_offset, y_offset, \
+ dst_tmp + 3 * wf, dst_stride, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ } \
+ } \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 4); \
+ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
+ *sse_ptr = sse; \
+ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+#define FNS(opt) \
+ FN(128, 128, 16, 7, 7, opt, (int64_t)) \
+ FN(128, 64, 16, 7, 6, opt, (int64_t)) \
+ FN(64, 128, 16, 6, 7, opt, (int64_t)) \
+ FN(64, 64, 16, 6, 6, opt, (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt, (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt, (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt, (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt, (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt, (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt, (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt, (int64_t)) \
+ FN(8, 16, 8, 3, 4, opt, (int64_t)) \
+ FN(8, 8, 8, 3, 3, opt, (int64_t)) \
+ FN(8, 4, 8, 3, 2, opt, (int64_t)) \
+ FN(16, 4, 16, 4, 2, opt, (int64_t)) \
+ FN(8, 32, 8, 3, 5, opt, (int64_t)) \
+ FN(32, 8, 16, 5, 3, opt, (int64_t)) \
+ FN(16, 64, 16, 4, 6, opt, (int64_t)) \
+ FN(64, 16, 16, 6, 4, opt, (int64_t))
+
+FNS(sse2)
+
+#undef FNS
+#undef FN
+
+// The two unused parameters are placeholders for the PIC-enabled build.
+#define DECL(w, opt) \
+ int aom_highbd_sub_pixel_avg_variance##w##xh_##opt( \
+ const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+ const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec, \
+ ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \
+ void *unused);
+#define DECLS(opt) \
+ DECL(16, opt) \
+ DECL(8, opt)
+
+DECLS(sse2)
+#undef DECL
+#undef DECLS
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+ uint32_t aom_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
+ NULL, NULL); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, \
+ sec + wf, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \
+ dst_stride, sec + 2 * wf, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \
+ dst_stride, sec + 3 * wf, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sse_ptr = sse; \
+ return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2)); \
+ } \
+ \
+ uint32_t aom_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ int64_t var; \
+ uint32_t sse; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
+ NULL, NULL); \
+ if (w > wf) { \
+ uint32_t sse2; \
+ int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, \
+ sec + wf, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf, \
+ dst_stride, sec + 2 * wf, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf, \
+ dst_stride, sec + 3 * wf, w, h, &sse2, NULL, NULL); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 2); \
+ sse = ROUND_POWER_OF_TWO(sse, 4); \
+ *sse_ptr = sse; \
+ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ uint32_t aom_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+ const uint8_t *sec8) { \
+ int start_row; \
+ int64_t var; \
+ uint32_t sse; \
+ int se = 0; \
+ uint64_t long_sse = 0; \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+ for (start_row = 0; start_row < h; start_row += 16) { \
+ uint32_t sse2; \
+ int height = h - start_row < 16 ? h - start_row : 16; \
+ int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + (start_row * src_stride), src_stride, x_offset, y_offset, \
+ dst + (start_row * dst_stride), dst_stride, sec + (start_row * w), \
+ w, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf) { \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + wf + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, dst + wf + (start_row * dst_stride), dst_stride, \
+ sec + wf + (start_row * w), w, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 2 * wf + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, dst + 2 * wf + (start_row * dst_stride), dst_stride, \
+ sec + 2 * wf + (start_row * w), w, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+ src + 3 * wf + (start_row * src_stride), src_stride, x_offset, \
+ y_offset, dst + 3 * wf + (start_row * dst_stride), dst_stride, \
+ sec + 3 * wf + (start_row * w), w, height, &sse2, NULL, NULL); \
+ se += se2; \
+ long_sse += sse2; \
+ } \
+ } \
+ } \
+ se = ROUND_POWER_OF_TWO(se, 4); \
+ sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8); \
+ *sse_ptr = sse; \
+ var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+#define FNS(opt) \
+ FN(64, 64, 16, 6, 6, opt, (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt, (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt, (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt, (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt, (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt, (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt, (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt, (int64_t)) \
+ FN(8, 16, 8, 3, 4, opt, (int64_t)) \
+ FN(8, 8, 8, 3, 3, opt, (int64_t)) \
+ FN(8, 4, 8, 3, 2, opt, (int64_t)) \
+ FN(16, 4, 16, 4, 2, opt, (int64_t)) \
+ FN(8, 32, 8, 3, 5, opt, (int64_t)) \
+ FN(32, 8, 16, 5, 3, opt, (int64_t)) \
+ FN(16, 64, 16, 4, 6, opt, (int64_t)) \
+ FN(64, 16, 16, 6, 4, opt, (int64_t))
+
+FNS(sse2)
+
+#undef FNS
+#undef FN
+
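+// Distance-weighted average of 8 pixels:
+// (p0 * w0 + p1 * w1 + round) >> DIST_PRECISION_BITS, computed with
+// saturating unsigned 16-bit adds.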
+static INLINE void highbd_compute_dist_wtd_comp_avg(__m128i *p0, __m128i *p1,
+ const __m128i *w0,
+ const __m128i *w1,
+ const __m128i *r,
+ void *const result) {
+ assert(DIST_PRECISION_BITS <= 4);
+ __m128i mult0 = _mm_mullo_epi16(*p0, *w0);
+ __m128i mult1 = _mm_mullo_epi16(*p1, *w1);
+ __m128i sum = _mm_adds_epu16(mult0, mult1);
+ __m128i round = _mm_adds_epu16(sum, *r);
+ __m128i shift = _mm_srli_epi16(round, DIST_PRECISION_BITS);
+
+ xx_storeu_128(result, shift);
+}
+
+void aom_highbd_dist_wtd_comp_avg_pred_sse2(
+ uint8_t *comp_pred8, const uint8_t *pred8, int width, int height,
+ const uint8_t *ref8, int ref_stride,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ int i;
+ const int16_t wt0 = (int16_t)jcp_param->fwd_offset;
+ const int16_t wt1 = (int16_t)jcp_param->bck_offset;
+ const __m128i w0 = _mm_set1_epi16(wt0);
+ const __m128i w1 = _mm_set1_epi16(wt1);
+ const int16_t round = (int16_t)((1 << DIST_PRECISION_BITS) >> 1);
+ const __m128i r = _mm_set1_epi16(round);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+
+ if (width >= 8) {
+ // Read 8 pixels one row at a time
+ assert(!(width & 7));
+ for (i = 0; i < height; ++i) {
+ int j;
+ for (j = 0; j < width; j += 8) {
+ __m128i p0 = xx_loadu_128(ref);
+ __m128i p1 = xx_loadu_128(pred);
+
+ highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred);
+
+ comp_pred += 8;
+ pred += 8;
+ ref += 8;
+ }
+ ref += ref_stride - width;
+ }
+ } else {
+ // Read 4 pixels two rows at a time
+ assert(!(width & 3));
+ for (i = 0; i < height; i += 2) {
+ __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride);
+ __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride);
+ __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1);
+ __m128i p1 = xx_loadu_128(pred);
+
+ highbd_compute_dist_wtd_comp_avg(&p0, &p1, &w0, &w1, &r, comp_pred);
+
+ comp_pred += 8;
+ pred += 8;
+ ref += 2 * ref_stride;
+ }
+ }
+}
+
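+// Sum of squared differences for a 4xh block of 16-bit pixels: two 4-pixel
+// rows per 128-bit register, with the squares accumulated in 64-bit lanes.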
+uint64_t aom_mse_4xh_16bit_highbd_sse2(uint16_t *dst, int dstride,
+ uint16_t *src, int sstride, int h) {
+ uint64_t sum = 0;
+ __m128i reg0_4x16, reg1_4x16;
+ __m128i src_8x16;
+ __m128i dst_8x16;
+ __m128i res0_4x32, res1_4x32, res0_4x64, res1_4x64, res2_4x64, res3_4x64;
+ __m128i sub_result_8x16;
+ const __m128i zeros = _mm_setzero_si128();
+ __m128i square_result = _mm_setzero_si128();
+ for (int i = 0; i < h; i += 2) {
+ reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 0) * dstride]));
+ reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 1) * dstride]));
+ dst_8x16 = _mm_unpacklo_epi64(reg0_4x16, reg1_4x16);
+
+ reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride]));
+ reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride]));
+ src_8x16 = _mm_unpacklo_epi64(reg0_4x16, reg1_4x16);
+
+ sub_result_8x16 = _mm_sub_epi16(src_8x16, dst_8x16);
+
+ res0_4x32 = _mm_unpacklo_epi16(sub_result_8x16, zeros);
+ res1_4x32 = _mm_unpackhi_epi16(sub_result_8x16, zeros);
+
+ res0_4x32 = _mm_madd_epi16(res0_4x32, res0_4x32);
+ res1_4x32 = _mm_madd_epi16(res1_4x32, res1_4x32);
+
+ res0_4x64 = _mm_unpacklo_epi32(res0_4x32, zeros);
+ res1_4x64 = _mm_unpackhi_epi32(res0_4x32, zeros);
+ res2_4x64 = _mm_unpacklo_epi32(res1_4x32, zeros);
+ res3_4x64 = _mm_unpackhi_epi32(res1_4x32, zeros);
+
+ square_result = _mm_add_epi64(
+ square_result,
+ _mm_add_epi64(
+ _mm_add_epi64(_mm_add_epi64(res0_4x64, res1_4x64), res2_4x64),
+ res3_4x64));
+ }
+
+ const __m128i sum_1x64 =
+ _mm_add_epi64(square_result, _mm_srli_si128(square_result, 8));
+ xx_storel_64(&sum, sum_1x64);
+ return sum;
+}
+
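+// Same as above for 8xh blocks: one 8-pixel row per 128-bit register.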
+uint64_t aom_mse_8xh_16bit_highbd_sse2(uint16_t *dst, int dstride,
+ uint16_t *src, int sstride, int h) {
+ uint64_t sum = 0;
+ __m128i src_8x16;
+ __m128i dst_8x16;
+ __m128i res0_4x32, res1_4x32, res0_4x64, res1_4x64, res2_4x64, res3_4x64;
+ __m128i sub_result_8x16;
+ const __m128i zeros = _mm_setzero_si128();
+ __m128i square_result = _mm_setzero_si128();
+
+ for (int i = 0; i < h; i++) {
+ dst_8x16 = _mm_loadu_si128((__m128i *)&dst[i * dstride]);
+ src_8x16 = _mm_loadu_si128((__m128i *)&src[i * sstride]);
+
+ sub_result_8x16 = _mm_sub_epi16(src_8x16, dst_8x16);
+
+ res0_4x32 = _mm_unpacklo_epi16(sub_result_8x16, zeros);
+ res1_4x32 = _mm_unpackhi_epi16(sub_result_8x16, zeros);
+
+ res0_4x32 = _mm_madd_epi16(res0_4x32, res0_4x32);
+ res1_4x32 = _mm_madd_epi16(res1_4x32, res1_4x32);
+
+ res0_4x64 = _mm_unpacklo_epi32(res0_4x32, zeros);
+ res1_4x64 = _mm_unpackhi_epi32(res0_4x32, zeros);
+ res2_4x64 = _mm_unpacklo_epi32(res1_4x32, zeros);
+ res3_4x64 = _mm_unpackhi_epi32(res1_4x32, zeros);
+
+ square_result = _mm_add_epi64(
+ square_result,
+ _mm_add_epi64(
+ _mm_add_epi64(_mm_add_epi64(res0_4x64, res1_4x64), res2_4x64),
+ res3_4x64));
+ }
+
+ const __m128i sum_1x64 =
+ _mm_add_epi64(square_result, _mm_srli_si128(square_result, 8));
+ xx_storel_64(&sum, sum_1x64);
+ return sum;
+}
+
+uint64_t aom_mse_wxh_16bit_highbd_sse2(uint16_t *dst, int dstride,
+ uint16_t *src, int sstride, int w,
+ int h) {
+  assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
+         "w must be 8 or 4 and h must be 8 or 4");
+ switch (w) {
+ case 4: return aom_mse_4xh_16bit_highbd_sse2(dst, dstride, src, sstride, h);
+ case 8: return aom_mse_8xh_16bit_highbd_sse2(dst, dstride, src, sstride, h);
+ default: assert(0 && "unsupported width"); return -1;
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
new file mode 100644
index 0000000000..df5449a9df
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/highbd_variance_sse4.c
@@ -0,0 +1,216 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/variance.h"
+#include "aom_dsp/aom_filter.h"
+
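+// Computes the sum and SSE of a single high-bitdepth 4x4 block using
+// horizontal adds; the 8/10/12-bit wrappers below apply the bit-depth
+// normalization.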
+static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ uint64_t *sse, int64_t *sum) {
+ __m128i u0, u1, u2, u3;
+ __m128i s0, s1, s2, s3;
+ __m128i t0, t1, x0, y0;
+ __m128i a0, a1, a2, a3;
+ __m128i b0, b1, b2, b3;
+ __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1);
+
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+
+ a0 = _mm_loadl_epi64((__m128i const *)(a + 0 * a_stride));
+ a1 = _mm_loadl_epi64((__m128i const *)(a + 1 * a_stride));
+ a2 = _mm_loadl_epi64((__m128i const *)(a + 2 * a_stride));
+ a3 = _mm_loadl_epi64((__m128i const *)(a + 3 * a_stride));
+
+ b0 = _mm_loadl_epi64((__m128i const *)(b + 0 * b_stride));
+ b1 = _mm_loadl_epi64((__m128i const *)(b + 1 * b_stride));
+ b2 = _mm_loadl_epi64((__m128i const *)(b + 2 * b_stride));
+ b3 = _mm_loadl_epi64((__m128i const *)(b + 3 * b_stride));
+
+ u0 = _mm_unpacklo_epi16(a0, a1);
+ u1 = _mm_unpacklo_epi16(a2, a3);
+ u2 = _mm_unpacklo_epi16(b0, b1);
+ u3 = _mm_unpacklo_epi16(b2, b3);
+
+ s0 = _mm_sub_epi16(u0, u2);
+ s1 = _mm_sub_epi16(u1, u3);
+
+ t0 = _mm_madd_epi16(s0, k_one_epi16);
+ t1 = _mm_madd_epi16(s1, k_one_epi16);
+
+ s2 = _mm_hadd_epi32(t0, t1);
+ s3 = _mm_hadd_epi32(s2, s2);
+ y0 = _mm_hadd_epi32(s3, s3);
+
+ t0 = _mm_madd_epi16(s0, s0);
+ t1 = _mm_madd_epi16(s1, s1);
+
+ s2 = _mm_hadd_epi32(t0, t1);
+ s3 = _mm_hadd_epi32(s2, s2);
+ x0 = _mm_hadd_epi32(s3, s3);
+
+ *sse = (uint64_t)_mm_extract_epi32(x0, 0);
+ *sum = (int64_t)_mm_extract_epi32(y0, 0);
+}
+
+uint32_t aom_highbd_8_variance4x4_sse4_1(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ int64_t sum, diff;
+ uint64_t local_sse;
+
+ variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+ *sse = (uint32_t)local_sse;
+
+ diff = (int64_t)*sse - ((sum * sum) >> 4);
+ return (diff >= 0) ? (uint32_t)diff : 0;
+}
+
+uint32_t aom_highbd_10_variance4x4_sse4_1(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ int64_t sum, diff;
+ uint64_t local_sse;
+
+ variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 4);
+ sum = ROUND_POWER_OF_TWO(sum, 2);
+
+ diff = (int64_t)*sse - ((sum * sum) >> 4);
+ return (diff >= 0) ? (uint32_t)diff : 0;
+}
+
+uint32_t aom_highbd_12_variance4x4_sse4_1(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ uint32_t *sse) {
+ int64_t sum, diff;
+ uint64_t local_sse;
+
+ variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 8);
+ sum = ROUND_POWER_OF_TWO(sum, 4);
+
+ diff = (int64_t)*sse - ((sum * sum) >> 4);
+ return diff >= 0 ? (uint32_t)diff : 0;
+}
+
+// Sub-pixel
+uint32_t aom_highbd_8_sub_pixel_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, uint32_t *sse) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+
+ aom_highbd_var_filter_block2d_bil_first_pass(
+ src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+ aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, dst_stride,
+ sse);
+}
+
+uint32_t aom_highbd_10_sub_pixel_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, uint32_t *sse) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+
+ aom_highbd_var_filter_block2d_bil_first_pass(
+ src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+ aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
+ dst_stride, sse);
+}
+
+uint32_t aom_highbd_12_sub_pixel_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, uint32_t *sse) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+
+ aom_highbd_var_filter_block2d_bil_first_pass(
+ src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+ aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
+ dst_stride, sse);
+}
+
+// Sub-pixel average
+
+uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, uint32_t *sse,
+ const uint8_t *second_pred) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+ DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
+
+ aom_highbd_var_filter_block2d_bil_first_pass(
+ src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+ aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
+ CONVERT_TO_BYTEPTR(temp2), 4);
+
+ return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride,
+ sse);
+}
+
+uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, uint32_t *sse,
+ const uint8_t *second_pred) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+ DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
+
+ aom_highbd_var_filter_block2d_bil_first_pass(
+ src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+ aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
+ CONVERT_TO_BYTEPTR(temp2), 4);
+
+ return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
+ dst_stride, sse);
+}
+
+uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1(
+ const uint8_t *src, int src_stride, int xoffset, int yoffset,
+ const uint8_t *dst, int dst_stride, uint32_t *sse,
+ const uint8_t *second_pred) {
+ uint16_t fdata3[(4 + 1) * 4];
+ uint16_t temp2[4 * 4];
+ DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
+
+ aom_highbd_var_filter_block2d_bil_first_pass(
+ src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
+ aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
+ bilinear_filters_2t[yoffset]);
+
+ aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
+ CONVERT_TO_BYTEPTR(temp2), 4);
+
+ return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
+ dst_stride, sse);
+}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_asm_sse2.asm b/third_party/aom/aom_dsp/x86/intrapred_asm_sse2.asm
new file mode 100644
index 0000000000..0eb632326b
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/intrapred_asm_sse2.asm
@@ -0,0 +1,608 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pb_1: times 16 db 1
+pw_4: times 8 dw 4
+pw_8: times 8 dw 8
+pw_16: times 8 dw 16
+pw_32: times 8 dw 32
+dc_128: times 16 db 128
+pw2_4: times 8 dw 2
+pw2_8: times 8 dw 4
+pw2_16: times 8 dw 8
+pw2_32: times 8 dw 16
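+; pw_N is the rounding constant for the full DC sum of an NxN block
+; (N above + N left samples); pw2_N rounds the single-edge (left-only or
+; top-only) sum of N samples.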
+
+SECTION .text
+
+INIT_XMM sse2
+cglobal dc_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
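+  ; psadbw against zero sums the 4 above and 4 left pixels; the DC value is
+  ; (sum + 4) >> 3, broadcast to every pixel of the 4x4 block.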
+ movd m2, [leftq]
+ movd m0, [aboveq]
+ pxor m1, m1
+ punpckldq m0, m2
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw_4)]
+ psraw m0, 3
+ pshuflw m0, m0, 0x0
+ packuswb m0, m0
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_4x4, 2, 5, 2, dst, stride, above, left, goffset
+ movifnidn leftq, leftmp
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movd m0, [leftq]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_4)]
+ psraw m0, 2
+ pshuflw m0, m0, 0x0
+ packuswb m0, m0
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_top_predictor_4x4, 3, 5, 2, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movd m0, [aboveq]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_4)]
+ psraw m0, 2
+ pshuflw m0, m0, 0x0
+ packuswb m0, m0
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movq m0, [aboveq]
+ movq m2, [leftq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw_8)]
+ psraw m0, 4
+ punpcklbw m0, m0
+ pshuflw m0, m0, 0x0
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_top_predictor_8x8, 3, 5, 2, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movq m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_8)]
+ psraw m0, 3
+ punpcklbw m0, m0
+ pshuflw m0, m0, 0x0
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_8x8, 2, 5, 2, dst, stride, above, left, goffset
+ movifnidn leftq, leftmp
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ movq m0, [leftq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ psadbw m0, m1
+ paddw m0, [GLOBAL(pw2_8)]
+ psraw m0, 3
+ punpcklbw m0, m0
+ pshuflw m0, m0, 0x0
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_4x4, 2, 5, 1, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ movd m0, [GLOBAL(dc_128)]
+ movd [dstq ], m0
+ movd [dstq+strideq ], m0
+ movd [dstq+strideq*2], m0
+ movd [dstq+stride3q ], m0
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_8x8, 2, 5, 1, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ movq m0, [GLOBAL(dc_128)]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m2, [leftq]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw_16)]
+ psraw m0, 5
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+
+INIT_XMM sse2
+cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ psadbw m0, m1
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_16)]
+ psraw m0, 4
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [leftq]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ psadbw m0, m1
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_16)]
+ psraw m0, 4
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 4
+ mova m0, [GLOBAL(dc_128)]
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+ RESTORE_GOT
+ RET
+
+
+INIT_XMM sse2
+cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m2, [aboveq+16]
+ mova m3, [leftq]
+ mova m4, [leftq+16]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ psadbw m0, m1
+ psadbw m2, m1
+ psadbw m3, m1
+ psadbw m4, m1
+ paddw m0, m2
+ paddw m0, m3
+ paddw m0, m4
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw_32)]
+ psraw m0, 6
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [aboveq]
+ mova m2, [aboveq+16]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_32)]
+ psraw m0, 5
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ pxor m1, m1
+ mova m0, [leftq]
+ mova m2, [leftq+16]
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ psadbw m0, m1
+ psadbw m2, m1
+ paddw m0, m2
+ movhlps m2, m0
+ paddw m0, m2
+ paddw m0, [GLOBAL(pw2_32)]
+ psraw m0, 5
+ pshuflw m0, m0, 0x0
+ punpcklqdq m0, m0
+ packuswb m0, m0
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+
+ RESTORE_GOT
+ REP_RET
+
+INIT_XMM sse2
+cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset
+ GET_GOT goffsetq
+
+ DEFINE_ARGS dst, stride, stride3, lines4
+ lea stride3q, [strideq*3]
+ mov lines4d, 8
+ mova m0, [GLOBAL(dc_128)]
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m0
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m0
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m0
+ lea dstq, [dstq+strideq*4]
+ dec lines4d
+ jnz .loop
+ RESTORE_GOT
+ RET
+
+INIT_XMM sse2
+cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above
+ movd m0, [aboveq]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ lea dstq, [dstq+strideq*2]
+ movd [dstq ], m0
+ movd [dstq+strideq], m0
+ RET
+
+INIT_XMM sse2
+cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above
+ movq m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3
+ lea stride3q, [strideq*3]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ movq [dstq ], m0
+ movq [dstq+strideq ], m0
+ movq [dstq+strideq*2], m0
+ movq [dstq+stride3q ], m0
+ RET
+
+INIT_XMM sse2
+cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above
+ mova m0, [aboveq]
+ DEFINE_ARGS dst, stride, stride3, nlines4
+ lea stride3q, [strideq*3]
+ mov nlines4d, 4
+.loop:
+ mova [dstq ], m0
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq*2], m0
+ mova [dstq+stride3q ], m0
+ lea dstq, [dstq+strideq*4]
+ dec nlines4d
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above
+ mova m0, [aboveq]
+ mova m1, [aboveq+16]
+ DEFINE_ARGS dst, stride, stride3, nlines4
+ lea stride3q, [strideq*3]
+ mov nlines4d, 8
+.loop:
+ mova [dstq ], m0
+ mova [dstq +16], m1
+ mova [dstq+strideq ], m0
+ mova [dstq+strideq +16], m1
+ mova [dstq+strideq*2 ], m0
+ mova [dstq+strideq*2+16], m1
+ mova [dstq+stride3q ], m0
+ mova [dstq+stride3q +16], m1
+ lea dstq, [dstq+strideq*4]
+ dec nlines4d
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal h_predictor_4x4, 2, 4, 4, dst, stride, line, left
+ movifnidn leftq, leftmp
+ movd m0, [leftq]
+ punpcklbw m0, m0
+ punpcklbw m0, m0
+ pshufd m1, m0, 0x1
+ movd [dstq ], m0
+ movd [dstq+strideq], m1
+ pshufd m2, m0, 0x2
+ lea dstq, [dstq+strideq*2]
+ pshufd m3, m0, 0x3
+ movd [dstq ], m2
+ movd [dstq+strideq], m3
+ RET
+
+INIT_XMM sse2
+cglobal h_predictor_8x8, 2, 5, 3, dst, stride, line, left
+ movifnidn leftq, leftmp
+ mov lineq, -2
+ DEFINE_ARGS dst, stride, line, left, stride3
+ lea stride3q, [strideq*3]
+ movq m0, [leftq ]
+ punpcklbw m0, m0 ; l1 l1 l2 l2 ... l8 l8
+.loop:
+ pshuflw m1, m0, 0x0 ; l1 l1 l1 l1 l1 l1 l1 l1
+ pshuflw m2, m0, 0x55 ; l2 l2 l2 l2 l2 l2 l2 l2
+ movq [dstq ], m1
+ movq [dstq+strideq], m2
+ pshuflw m1, m0, 0xaa
+ pshuflw m2, m0, 0xff
+ movq [dstq+strideq*2], m1
+ movq [dstq+stride3q ], m2
+ pshufd m0, m0, 0xe ; [63:0] l5 l5 l6 l6 l7 l7 l8 l8
+ inc lineq
+ lea dstq, [dstq+strideq*4]
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal h_predictor_16x16, 2, 5, 3, dst, stride, line, left
+ movifnidn leftq, leftmp
+ mov lineq, -4
+ DEFINE_ARGS dst, stride, line, left, stride3
+ lea stride3q, [strideq*3]
+.loop:
+ movd m0, [leftq]
+ punpcklbw m0, m0
+ punpcklbw m0, m0 ; l1 to l4 each repeated 4 times
+ pshufd m1, m0, 0x0 ; l1 repeated 16 times
+ pshufd m2, m0, 0x55 ; l2 repeated 16 times
+ mova [dstq ], m1
+ mova [dstq+strideq ], m2
+ pshufd m1, m0, 0xaa
+ pshufd m2, m0, 0xff
+ mova [dstq+strideq*2], m1
+ mova [dstq+stride3q ], m2
+ inc lineq
+ lea leftq, [leftq+4 ]
+ lea dstq, [dstq+strideq*4]
+ jnz .loop
+ REP_RET
+
+INIT_XMM sse2
+cglobal h_predictor_32x32, 2, 5, 3, dst, stride, line, left
+ movifnidn leftq, leftmp
+ mov lineq, -8
+ DEFINE_ARGS dst, stride, line, left, stride3
+ lea stride3q, [strideq*3]
+.loop:
+ movd m0, [leftq]
+ punpcklbw m0, m0
+ punpcklbw m0, m0 ; l1 to l4 each repeated 4 times
+ pshufd m1, m0, 0x0 ; l1 repeated 16 times
+ pshufd m2, m0, 0x55 ; l2 repeated 16 times
+ mova [dstq ], m1
+ mova [dstq+16 ], m1
+ mova [dstq+strideq ], m2
+ mova [dstq+strideq+16 ], m2
+ pshufd m1, m0, 0xaa
+ pshufd m2, m0, 0xff
+ mova [dstq+strideq*2 ], m1
+ mova [dstq+strideq*2+16], m1
+ mova [dstq+stride3q ], m2
+ mova [dstq+stride3q+16 ], m2
+ inc lineq
+ lea leftq, [leftq+4 ]
+ lea dstq, [dstq+strideq*4]
+ jnz .loop
+ REP_RET
diff --git a/third_party/aom/aom_dsp/x86/intrapred_avx2.c b/third_party/aom/aom_dsp/x86/intrapred_avx2.c
new file mode 100644
index 0000000000..242a548df9
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/intrapred_avx2.c
@@ -0,0 +1,4707 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/av1_rtcd.h"
+#include "aom_dsp/x86/intrapred_x86.h"
+#include "aom_dsp/x86/intrapred_utils.h"
+#include "aom_dsp/x86/lpf_common_sse2.h"
+
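+// Sum the 64 boundary bytes: _mm256_sad_epu8 against zero yields one partial
+// sum per group of 8 bytes, and the partials are folded so the total lands in
+// the low word of the returned vector.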
+static INLINE __m256i dc_sum_64(const uint8_t *ref) {
+ const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i y0 = _mm256_sad_epu8(x0, zero);
+ __m256i y1 = _mm256_sad_epu8(x1, zero);
+ y0 = _mm256_add_epi64(y0, y1);
+ __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1);
+ y0 = _mm256_add_epi64(u0, y0);
+ u0 = _mm256_unpackhi_epi64(y0, y0);
+ return _mm256_add_epi16(y0, u0);
+}
+
+static INLINE __m256i dc_sum_32(const uint8_t *ref) {
+ const __m256i x = _mm256_loadu_si256((const __m256i *)ref);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i y = _mm256_sad_epu8(x, zero);
+ __m256i u = _mm256_permute2x128_si256(y, y, 1);
+ y = _mm256_add_epi64(u, y);
+ u = _mm256_unpackhi_epi64(y, y);
+ return _mm256_add_epi16(y, u);
+}
+
+static INLINE void row_store_32xh(const __m256i *r, int height, uint8_t *dst,
+ ptrdiff_t stride) {
+ for (int i = 0; i < height; ++i) {
+ _mm256_storeu_si256((__m256i *)dst, *r);
+ dst += stride;
+ }
+}
+
+static INLINE void row_store_32x2xh(const __m256i *r0, const __m256i *r1,
+ int height, uint8_t *dst,
+ ptrdiff_t stride) {
+ for (int i = 0; i < height; ++i) {
+ _mm256_storeu_si256((__m256i *)dst, *r0);
+ _mm256_storeu_si256((__m256i *)(dst + 32), *r1);
+ dst += stride;
+ }
+}
+
+static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst,
+ ptrdiff_t stride) {
+ for (int i = 0; i < height; ++i) {
+ _mm256_storeu_si256((__m256i *)dst, *r);
+ _mm256_storeu_si256((__m256i *)(dst + 32), *r);
+ dst += stride;
+ }
+}
+
+static DECLARE_ALIGNED(16, uint8_t, HighbdLoadMaskx[8][16]) = {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
+ { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
+};
+
+static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx4[4][16]) = {
+ { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 },
+ { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 4, 5, 8, 9, 12, 13 },
+ { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 6, 7, 10, 11 },
+ { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 0, 1, 8, 9 }
+};
+
+static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx[8][32]) = {
+ { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29,
+ 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 },
+ { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27,
+ 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 },
+ { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25,
+ 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27 },
+ { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23,
+ 0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21,
+ 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19, 22, 23 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19,
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17, 20, 21 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17,
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15, 18, 19 },
+ { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15,
+ 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 16, 17 }
+};
+
+static DECLARE_ALIGNED(32, uint16_t, HighbdBaseMask[17][16]) = {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0,
+ 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0,
+ 0, 0, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0, 0, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0 },
+ { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
+ 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff }
+};
+
+static INLINE void highbd_transpose16x4_8x8_sse2(__m128i *x, __m128i *d) {
+ __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;
+
+ r0 = _mm_unpacklo_epi16(x[0], x[1]);
+ r1 = _mm_unpacklo_epi16(x[2], x[3]);
+ r2 = _mm_unpacklo_epi16(x[4], x[5]);
+ r3 = _mm_unpacklo_epi16(x[6], x[7]);
+
+ r4 = _mm_unpacklo_epi16(x[8], x[9]);
+ r5 = _mm_unpacklo_epi16(x[10], x[11]);
+ r6 = _mm_unpacklo_epi16(x[12], x[13]);
+ r7 = _mm_unpacklo_epi16(x[14], x[15]);
+
+ r8 = _mm_unpacklo_epi32(r0, r1);
+ r9 = _mm_unpackhi_epi32(r0, r1);
+ r10 = _mm_unpacklo_epi32(r2, r3);
+ r11 = _mm_unpackhi_epi32(r2, r3);
+
+ r12 = _mm_unpacklo_epi32(r4, r5);
+ r13 = _mm_unpackhi_epi32(r4, r5);
+ r14 = _mm_unpacklo_epi32(r6, r7);
+ r15 = _mm_unpackhi_epi32(r6, r7);
+
+ r0 = _mm_unpacklo_epi64(r8, r9);
+ r1 = _mm_unpackhi_epi64(r8, r9);
+ r2 = _mm_unpacklo_epi64(r10, r11);
+ r3 = _mm_unpackhi_epi64(r10, r11);
+
+ r4 = _mm_unpacklo_epi64(r12, r13);
+ r5 = _mm_unpackhi_epi64(r12, r13);
+ r6 = _mm_unpacklo_epi64(r14, r15);
+ r7 = _mm_unpackhi_epi64(r14, r15);
+
+ d[0] = _mm_unpacklo_epi64(r0, r2);
+ d[1] = _mm_unpacklo_epi64(r4, r6);
+ d[2] = _mm_unpacklo_epi64(r1, r3);
+ d[3] = _mm_unpacklo_epi64(r5, r7);
+
+ d[4] = _mm_unpackhi_epi64(r0, r2);
+ d[5] = _mm_unpackhi_epi64(r4, r6);
+ d[6] = _mm_unpackhi_epi64(r1, r3);
+ d[7] = _mm_unpackhi_epi64(r5, r7);
+}
+
+static INLINE void highbd_transpose4x16_avx2(__m256i *x, __m256i *d) {
+ __m256i w0, w1, w2, w3, ww0, ww1;
+
+ w0 = _mm256_unpacklo_epi16(x[0], x[1]); // 00 10 01 11 02 12 03 13
+ w1 = _mm256_unpacklo_epi16(x[2], x[3]); // 20 30 21 31 22 32 23 33
+ w2 = _mm256_unpackhi_epi16(x[0], x[1]); // 40 50 41 51 42 52 43 53
+ w3 = _mm256_unpackhi_epi16(x[2], x[3]); // 60 70 61 71 62 72 63 73
+
+ ww0 = _mm256_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31
+ ww1 = _mm256_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71
+
+ d[0] = _mm256_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70
+ d[1] = _mm256_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71
+
+ ww0 = _mm256_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33
+ ww1 = _mm256_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73
+
+ d[2] = _mm256_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72
+ d[3] = _mm256_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73
+}
+
+static INLINE void highbd_transpose8x16_16x8_avx2(__m256i *x, __m256i *d) {
+ __m256i w0, w1, w2, w3, ww0, ww1;
+
+ w0 = _mm256_unpacklo_epi16(x[0], x[1]); // 00 10 01 11 02 12 03 13
+ w1 = _mm256_unpacklo_epi16(x[2], x[3]); // 20 30 21 31 22 32 23 33
+ w2 = _mm256_unpacklo_epi16(x[4], x[5]); // 40 50 41 51 42 52 43 53
+ w3 = _mm256_unpacklo_epi16(x[6], x[7]); // 60 70 61 71 62 72 63 73
+
+ ww0 = _mm256_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31
+ ww1 = _mm256_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71
+
+ d[0] = _mm256_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70
+ d[1] = _mm256_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71
+
+ ww0 = _mm256_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33
+ ww1 = _mm256_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73
+
+ d[2] = _mm256_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72
+ d[3] = _mm256_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73
+
+ w0 = _mm256_unpackhi_epi16(x[0], x[1]); // 04 14 05 15 06 16 07 17
+ w1 = _mm256_unpackhi_epi16(x[2], x[3]); // 24 34 25 35 26 36 27 37
+ w2 = _mm256_unpackhi_epi16(x[4], x[5]); // 44 54 45 55 46 56 47 57
+ w3 = _mm256_unpackhi_epi16(x[6], x[7]); // 64 74 65 75 66 76 67 77
+
+ ww0 = _mm256_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35
+ ww1 = _mm256_unpacklo_epi32(w2, w3); // 44 54 64 74 45 55 65 75
+
+ d[4] = _mm256_unpacklo_epi64(ww0, ww1); // 04 14 24 34 44 54 64 74
+ d[5] = _mm256_unpackhi_epi64(ww0, ww1); // 05 15 25 35 45 55 65 75
+
+ ww0 = _mm256_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37
+ ww1 = _mm256_unpackhi_epi32(w2, w3); // 46 56 66 76 47 57 67 77
+
+ d[6] = _mm256_unpacklo_epi64(ww0, ww1); // 06 16 26 36 46 56 66 76
+ d[7] = _mm256_unpackhi_epi64(ww0, ww1); // 07 17 27 37 47 57 67 77
+}
+
+static INLINE void highbd_transpose16x16_avx2(__m256i *x, __m256i *d) {
+ __m256i w0, w1, w2, w3, ww0, ww1;
+ __m256i dd[16];
+ w0 = _mm256_unpacklo_epi16(x[0], x[1]);
+ w1 = _mm256_unpacklo_epi16(x[2], x[3]);
+ w2 = _mm256_unpacklo_epi16(x[4], x[5]);
+ w3 = _mm256_unpacklo_epi16(x[6], x[7]);
+
+ ww0 = _mm256_unpacklo_epi32(w0, w1); //
+ ww1 = _mm256_unpacklo_epi32(w2, w3); //
+
+ dd[0] = _mm256_unpacklo_epi64(ww0, ww1);
+ dd[1] = _mm256_unpackhi_epi64(ww0, ww1);
+
+ ww0 = _mm256_unpackhi_epi32(w0, w1); //
+ ww1 = _mm256_unpackhi_epi32(w2, w3); //
+
+ dd[2] = _mm256_unpacklo_epi64(ww0, ww1);
+ dd[3] = _mm256_unpackhi_epi64(ww0, ww1);
+
+ w0 = _mm256_unpackhi_epi16(x[0], x[1]);
+ w1 = _mm256_unpackhi_epi16(x[2], x[3]);
+ w2 = _mm256_unpackhi_epi16(x[4], x[5]);
+ w3 = _mm256_unpackhi_epi16(x[6], x[7]);
+
+ ww0 = _mm256_unpacklo_epi32(w0, w1); //
+ ww1 = _mm256_unpacklo_epi32(w2, w3); //
+
+ dd[4] = _mm256_unpacklo_epi64(ww0, ww1);
+ dd[5] = _mm256_unpackhi_epi64(ww0, ww1);
+
+ ww0 = _mm256_unpackhi_epi32(w0, w1); //
+ ww1 = _mm256_unpackhi_epi32(w2, w3); //
+
+ dd[6] = _mm256_unpacklo_epi64(ww0, ww1);
+ dd[7] = _mm256_unpackhi_epi64(ww0, ww1);
+
+ w0 = _mm256_unpacklo_epi16(x[8], x[9]);
+ w1 = _mm256_unpacklo_epi16(x[10], x[11]);
+ w2 = _mm256_unpacklo_epi16(x[12], x[13]);
+ w3 = _mm256_unpacklo_epi16(x[14], x[15]);
+
+ ww0 = _mm256_unpacklo_epi32(w0, w1);
+ ww1 = _mm256_unpacklo_epi32(w2, w3);
+
+ dd[8] = _mm256_unpacklo_epi64(ww0, ww1);
+ dd[9] = _mm256_unpackhi_epi64(ww0, ww1);
+
+ ww0 = _mm256_unpackhi_epi32(w0, w1);
+ ww1 = _mm256_unpackhi_epi32(w2, w3);
+
+ dd[10] = _mm256_unpacklo_epi64(ww0, ww1);
+ dd[11] = _mm256_unpackhi_epi64(ww0, ww1);
+
+ w0 = _mm256_unpackhi_epi16(x[8], x[9]);
+ w1 = _mm256_unpackhi_epi16(x[10], x[11]);
+ w2 = _mm256_unpackhi_epi16(x[12], x[13]);
+ w3 = _mm256_unpackhi_epi16(x[14], x[15]);
+
+ ww0 = _mm256_unpacklo_epi32(w0, w1);
+ ww1 = _mm256_unpacklo_epi32(w2, w3);
+
+ dd[12] = _mm256_unpacklo_epi64(ww0, ww1);
+ dd[13] = _mm256_unpackhi_epi64(ww0, ww1);
+
+ ww0 = _mm256_unpackhi_epi32(w0, w1);
+ ww1 = _mm256_unpackhi_epi32(w2, w3);
+
+ dd[14] = _mm256_unpacklo_epi64(ww0, ww1);
+ dd[15] = _mm256_unpackhi_epi64(ww0, ww1);
+
+ for (int i = 0; i < 8; i++) {
+ d[i] = _mm256_insertf128_si256(dd[i], _mm256_castsi256_si128(dd[i + 8]), 1);
+ d[i + 8] = _mm256_insertf128_si256(dd[i + 8],
+ _mm256_extracti128_si256(dd[i], 1), 0);
+ }
+}
+
+void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i sum_above = dc_sum_32(above);
+ __m256i sum_left = dc_sum_32(left);
+ sum_left = _mm256_add_epi16(sum_left, sum_above);
+ const __m256i thirtytwo = _mm256_set1_epi16(32);
+ sum_left = _mm256_add_epi16(sum_left, thirtytwo);
+ sum_left = _mm256_srai_epi16(sum_left, 6);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum_left, zero);
+ row_store_32xh(&row, 32, dst, stride);
+}
+
+void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_32(above);
+ (void)left;
+
+ const __m256i sixteen = _mm256_set1_epi16(16);
+ sum = _mm256_add_epi16(sum, sixteen);
+ sum = _mm256_srai_epi16(sum, 5);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_32xh(&row, 32, dst, stride);
+}
+
+void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_32(left);
+ (void)above;
+
+ const __m256i sixteen = _mm256_set1_epi16(16);
+ sum = _mm256_add_epi16(sum, sixteen);
+ sum = _mm256_srai_epi16(sum, 5);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_32xh(&row, 32, dst, stride);
+}
+
+void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m256i row = _mm256_set1_epi8((int8_t)0x80);
+ row_store_32xh(&row, 32, dst, stride);
+}
+
+void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i row = _mm256_loadu_si256((const __m256i *)above);
+ (void)left;
+ row_store_32xh(&row, 32, dst, stride);
+}
+
+// There are 32 rows in total. This function writes lines 0,1,2,3 and
+// 16,17,18,19; the next call writes lines 4,5,6,7 and 20,21,22,23, so
+// four calls cover all 32 rows.
+static INLINE void h_predictor_32x8line(const __m256i *row, uint8_t *dst,
+ ptrdiff_t stride) {
+ __m256i t[4];
+ __m256i m = _mm256_setzero_si256();
+ const __m256i inc = _mm256_set1_epi8(4);
+ int i;
+
+ for (i = 0; i < 4; i++) {
+ t[i] = _mm256_shuffle_epi8(*row, m);
+ __m256i r0 = _mm256_permute2x128_si256(t[i], t[i], 0);
+ __m256i r1 = _mm256_permute2x128_si256(t[i], t[i], 0x11);
+ _mm256_storeu_si256((__m256i *)dst, r0);
+ _mm256_storeu_si256((__m256i *)(dst + (stride << 4)), r1);
+ dst += stride;
+ m = _mm256_add_epi8(m, inc);
+ }
+}
+
+void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ const __m256i left_col = _mm256_loadu_si256((__m256i const *)left);
+
+ __m256i u = _mm256_unpacklo_epi8(left_col, left_col);
+
+ __m256i v = _mm256_unpacklo_epi8(u, u);
+ h_predictor_32x8line(&v, dst, stride);
+ dst += stride << 2;
+
+ v = _mm256_unpackhi_epi8(u, u);
+ h_predictor_32x8line(&v, dst, stride);
+ dst += stride << 2;
+
+ u = _mm256_unpackhi_epi8(left_col, left_col);
+
+ v = _mm256_unpacklo_epi8(u, u);
+ h_predictor_32x8line(&v, dst, stride);
+ dst += stride << 2;
+
+ v = _mm256_unpackhi_epi8(u, u);
+ h_predictor_32x8line(&v, dst, stride);
+}
+
+// -----------------------------------------------------------------------------
+// Rectangle
+void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i top_sum = dc_sum_32_sse2(above);
+ __m128i left_sum = dc_sum_16_sse2(left);
+ left_sum = _mm_add_epi16(top_sum, left_sum);
+ uint16_t sum = (uint16_t)_mm_cvtsi128_si32(left_sum);
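+  // Round-to-nearest division by the 48 (= 32 above + 16 left) edge samples.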
+ sum += 24;
+ sum /= 48;
+ const __m256i row = _mm256_set1_epi8((int8_t)sum);
+ row_store_32xh(&row, 16, dst, stride);
+}
+
+void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i sum_above = dc_sum_32(above);
+ __m256i sum_left = dc_sum_64(left);
+ sum_left = _mm256_add_epi16(sum_left, sum_above);
+ uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
+ sum += 48;
+ sum /= 96;
+ const __m256i row = _mm256_set1_epi8((int8_t)sum);
+ row_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i sum_above = dc_sum_64(above);
+ __m256i sum_left = dc_sum_64(left);
+ sum_left = _mm256_add_epi16(sum_left, sum_above);
+ uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
+ sum += 64;
+ sum /= 128;
+ const __m256i row = _mm256_set1_epi8((int8_t)sum);
+ row_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i sum_above = dc_sum_64(above);
+ __m256i sum_left = dc_sum_32(left);
+ sum_left = _mm256_add_epi16(sum_left, sum_above);
+ uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
+ sum += 48;
+ sum /= 96;
+ const __m256i row = _mm256_set1_epi8((int8_t)sum);
+ row_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i sum_above = dc_sum_64(above);
+ __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left));
+ sum_left = _mm256_add_epi16(sum_left, sum_above);
+ uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
+ sum += 40;
+ sum /= 80;
+ const __m256i row = _mm256_set1_epi8((int8_t)sum);
+ row_store_64xh(&row, 16, dst, stride);
+}
+
+void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_32(above);
+ (void)left;
+
+ const __m256i sixteen = _mm256_set1_epi16(16);
+ sum = _mm256_add_epi16(sum, sixteen);
+ sum = _mm256_srai_epi16(sum, 5);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_32xh(&row, 16, dst, stride);
+}
+
+void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_32(above);
+ (void)left;
+
+ const __m256i sixteen = _mm256_set1_epi16(16);
+ sum = _mm256_add_epi16(sum, sixteen);
+ sum = _mm256_srai_epi16(sum, 5);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_64(above);
+ (void)left;
+
+ const __m256i thirtytwo = _mm256_set1_epi16(32);
+ sum = _mm256_add_epi16(sum, thirtytwo);
+ sum = _mm256_srai_epi16(sum, 6);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_64(above);
+ (void)left;
+
+ const __m256i thirtytwo = _mm256_set1_epi16(32);
+ sum = _mm256_add_epi16(sum, thirtytwo);
+ sum = _mm256_srai_epi16(sum, 6);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_64(above);
+ (void)left;
+
+ const __m256i thirtytwo = _mm256_set1_epi16(32);
+ sum = _mm256_add_epi16(sum, thirtytwo);
+ sum = _mm256_srai_epi16(sum, 6);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_64xh(&row, 16, dst, stride);
+}
+
+void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i sum = dc_sum_16_sse2(left);
+ (void)above;
+
+ const __m128i eight = _mm_set1_epi16(8);
+ sum = _mm_add_epi16(sum, eight);
+ sum = _mm_srai_epi16(sum, 4);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i r = _mm_shuffle_epi8(sum, zero);
+ const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
+ row_store_32xh(&row, 16, dst, stride);
+}
+
+void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_64(left);
+ (void)above;
+
+ const __m256i thirtytwo = _mm256_set1_epi16(32);
+ sum = _mm256_add_epi16(sum, thirtytwo);
+ sum = _mm256_srai_epi16(sum, 6);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_64(left);
+ (void)above;
+
+ const __m256i thirtytwo = _mm256_set1_epi16(32);
+ sum = _mm256_add_epi16(sum, thirtytwo);
+ sum = _mm256_srai_epi16(sum, 6);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_32(left);
+ (void)above;
+
+ const __m256i sixteen = _mm256_set1_epi16(16);
+ sum = _mm256_add_epi16(sum, sixteen);
+ sum = _mm256_srai_epi16(sum, 5);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);
+ row_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i sum = dc_sum_16_sse2(left);
+ (void)above;
+
+ const __m128i eight = _mm_set1_epi16(8);
+ sum = _mm_add_epi16(sum, eight);
+ sum = _mm_srai_epi16(sum, 4);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i r = _mm_shuffle_epi8(sum, zero);
+ const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
+ row_store_64xh(&row, 16, dst, stride);
+}
+
+void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m256i row = _mm256_set1_epi8((int8_t)0x80);
+ row_store_32xh(&row, 16, dst, stride);
+}
+
+void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m256i row = _mm256_set1_epi8((int8_t)0x80);
+ row_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m256i row = _mm256_set1_epi8((int8_t)0x80);
+ row_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m256i row = _mm256_set1_epi8((int8_t)0x80);
+ row_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m256i row = _mm256_set1_epi8((int8_t)0x80);
+ row_store_64xh(&row, 16, dst, stride);
+}
+
+void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i row = _mm256_loadu_si256((const __m256i *)above);
+ (void)left;
+ row_store_32xh(&row, 16, dst, stride);
+}
+
+void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i row = _mm256_loadu_si256((const __m256i *)above);
+ (void)left;
+ row_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
+ const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
+ (void)left;
+ row_store_32x2xh(&row0, &row1, 64, dst, stride);
+}
+
+void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
+ const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
+ (void)left;
+ row_store_32x2xh(&row0, &row1, 32, dst, stride);
+}
+
+void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
+ const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
+ (void)left;
+ row_store_32x2xh(&row0, &row1, 16, dst, stride);
+}
+
+// -----------------------------------------------------------------------------
+// PAETH_PRED
+
+// Return 16 16-bit pixels in one row (__m256i)
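+// Paeth rule: base = left + top - topleft; the prediction is whichever of
+// left, top and topleft is closest to base, preferring left, then top, on
+// ties.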
+static INLINE __m256i paeth_pred(const __m256i *left, const __m256i *top,
+ const __m256i *topleft) {
+ const __m256i base =
+ _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft);
+
+ __m256i pl = _mm256_abs_epi16(_mm256_sub_epi16(base, *left));
+ __m256i pt = _mm256_abs_epi16(_mm256_sub_epi16(base, *top));
+ __m256i ptl = _mm256_abs_epi16(_mm256_sub_epi16(base, *topleft));
+
+ __m256i mask1 = _mm256_cmpgt_epi16(pl, pt);
+ mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl));
+ __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl);
+
+ pl = _mm256_andnot_si256(mask1, *left);
+
+ ptl = _mm256_and_si256(mask2, *topleft);
+ pt = _mm256_andnot_si256(mask2, *top);
+ pt = _mm256_or_si256(pt, ptl);
+ pt = _mm256_and_si256(mask1, pt);
+
+ return _mm256_or_si256(pt, pl);
+}
+
+// Return 16 8-bit pixels in one row (__m128i)
+static INLINE __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top,
+ const __m256i *topleft) {
+ const __m256i p0 = paeth_pred(left, top, topleft);
+ const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
+ const __m256i p = _mm256_packus_epi16(p0, p1);
+ return _mm256_castsi256_si128(p);
+}
+
+static INLINE __m256i get_top_vector(const uint8_t *above) {
+ const __m128i x = _mm_load_si128((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i t0 = _mm_unpacklo_epi8(x, zero);
+ const __m128i t1 = _mm_unpackhi_epi8(x, zero);
+ return _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t1, 1);
+}
+
+void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i x = _mm_loadl_epi64((const __m128i *)left);
+ const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
+ const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
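+  // rep is a pshufb control that replicates one left pixel into every 16-bit
+  // lane: the 0x80 byte has its high bit set, so the shuffle writes zero
+  // there, zero-extending the selected byte; adding 1 per row advances to the
+  // next left pixel.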
+ __m256i rep = _mm256_set1_epi16((short)0x8000);
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i top = get_top_vector(above);
+
+ int i;
+ for (i = 0; i < 8; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+}
+
+static INLINE __m256i get_left_vector(const uint8_t *left) {
+ const __m128i x = _mm_load_si128((const __m128i *)left);
+ return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
+}
+
+void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i l = get_left_vector(left);
+ const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
+ __m256i rep = _mm256_set1_epi16((short)0x8000);
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i top = get_top_vector(above);
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m256i l = get_left_vector(left);
+ const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
+ __m256i rep = _mm256_set1_epi16((short)0x8000);
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i top = get_top_vector(above);
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+
+ l = get_left_vector(left + 16);
+ rep = _mm256_set1_epi16((short)0x8000);
+ for (i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i top = get_top_vector(above);
+
+ for (int j = 0; j < 4; ++j) {
+ const __m256i l = get_left_vector(left + j * 16);
+ __m256i rep = _mm256_set1_epi16((short)0x8000);
+ for (int i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+ }
+}
+
+// Return 32 8-bit pixels in one row (__m256i)
+static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0,
+ const __m256i *top1,
+ const __m256i *topleft) {
+ __m256i p0 = paeth_pred(left, top0, topleft);
+ __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
+ const __m256i x0 = _mm256_packus_epi16(p0, p1);
+
+ p0 = paeth_pred(left, top1, topleft);
+ p1 = _mm256_permute4x64_epi64(p0, 0xe);
+ const __m256i x1 = _mm256_packus_epi16(p0, p1);
+
+ return _mm256_permute2x128_si256(x0, x1, 0x20);
+}
+
+void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i l = get_left_vector(left);
+ const __m256i t0 = get_top_vector(above);
+ const __m256i t1 = get_top_vector(above + 16);
+ const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
+ __m256i rep = _mm256_set1_epi16((short)0x8000);
+ const __m256i one = _mm256_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+
+ const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl);
+
+ _mm256_storeu_si256((__m256i *)dst, r);
+
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m256i l = get_left_vector(left);
+ const __m256i t0 = get_top_vector(above);
+ const __m256i t1 = get_top_vector(above + 16);
+ const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
+ __m256i rep = _mm256_set1_epi16((short)0x8000);
+ const __m256i one = _mm256_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+
+ const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
+ const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
+
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r1);
+
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+
+ l = get_left_vector(left + 16);
+ rep = _mm256_set1_epi16((short)0x8000);
+ for (i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+
+ const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
+ const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
+
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r1);
+
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i t0 = get_top_vector(above);
+ const __m256i t1 = get_top_vector(above + 16);
+ const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
+ const __m256i one = _mm256_set1_epi16(1);
+
+ int i, j;
+ for (j = 0; j < 4; ++j) {
+ const __m256i l = get_left_vector(left + j * 16);
+ __m256i rep = _mm256_set1_epi16((short)0x8000);
+ for (i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+
+ const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
+ const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
+
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r1);
+
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+ }
+}
+
+void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i t0 = get_top_vector(above);
+ const __m256i t1 = get_top_vector(above + 16);
+ const __m256i t2 = get_top_vector(above + 32);
+ const __m256i t3 = get_top_vector(above + 48);
+ const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
+ const __m256i one = _mm256_set1_epi16(1);
+
+ int i, j;
+ for (j = 0; j < 2; ++j) {
+ const __m256i l = get_left_vector(left + j * 16);
+ __m256i rep = _mm256_set1_epi16((short)0x8000);
+ for (i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+
+ const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
+ const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
+ const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
+ const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
+
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r1);
+ _mm_store_si128((__m128i *)(dst + 32), r2);
+ _mm_store_si128((__m128i *)(dst + 48), r3);
+
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+ }
+}
+
+void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i t0 = get_top_vector(above);
+ const __m256i t1 = get_top_vector(above + 16);
+ const __m256i t2 = get_top_vector(above + 32);
+ const __m256i t3 = get_top_vector(above + 48);
+ const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
+ const __m256i one = _mm256_set1_epi16(1);
+
+ int i, j;
+ for (j = 0; j < 4; ++j) {
+ const __m256i l = get_left_vector(left + j * 16);
+ __m256i rep = _mm256_set1_epi16((short)0x8000);
+ for (i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+
+ const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
+ const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
+ const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
+ const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
+
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r1);
+ _mm_store_si128((__m128i *)(dst + 32), r2);
+ _mm_store_si128((__m128i *)(dst + 48), r3);
+
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+ }
+}
+
+void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i t0 = get_top_vector(above);
+ const __m256i t1 = get_top_vector(above + 16);
+ const __m256i t2 = get_top_vector(above + 32);
+ const __m256i t3 = get_top_vector(above + 48);
+ const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
+ const __m256i one = _mm256_set1_epi16(1);
+
+ int i;
+ const __m256i l = get_left_vector(left);
+ __m256i rep = _mm256_set1_epi16((short)0x8000);
+ for (i = 0; i < 16; ++i) {
+ const __m256i l16 = _mm256_shuffle_epi8(l, rep);
+
+ const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
+ const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
+ const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
+ const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);
+
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r1);
+ _mm_store_si128((__m128i *)(dst + 32), r2);
+ _mm_store_si128((__m128i *)(dst + 48), r3);
+
+ dst += stride;
+ rep = _mm256_add_epi16(rep, one);
+ }
+}
+
+#define PERM4x64(c0, c1, c2, c3) c0 + (c1 << 2) + (c2 << 4) + (c3 << 6)
+#define PERM2x128(c0, c1) c0 + (c1 << 4)
+
+static AOM_FORCE_INLINE void highbd_dr_prediction_z1_4xN_internal_avx2(
+ int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
+ const int frac_bits = 6 - upsample_above;
+ const int max_base_x = ((N + 4) - 1) << upsample_above;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
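+  // Positions whose base index reaches max_base_x are filled with the last
+  // available above sample (a_mbase_x) via the compare/blend at the end of
+  // the loop.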
+ __m256i a0, a1, a32, a16;
+ __m256i diff, c3f;
+ __m128i a_mbase_x, max_base_x128, base_inc128, mask128;
+ __m128i a0_128, a1_128;
+ a16 = _mm256_set1_epi16(16);
+ a_mbase_x = _mm_set1_epi16(above[max_base_x]);
+ max_base_x128 = _mm_set1_epi16(max_base_x);
+ c3f = _mm256_set1_epi16(0x3f);
+
+ int x = dx;
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, shift;
+ __m128i res1;
+
+ int base = x >> frac_bits;
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ dst[i] = a_mbase_x; // save 4 values
+ }
+ return;
+ }
+
+ a0_128 = _mm_loadu_si128((__m128i *)(above + base));
+ a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1));
+
+ if (upsample_above) {
+ a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)HighbdEvenOddMaskx4[0]);
+ a1_128 = _mm_srli_si128(a0_128, 8);
+
+ base_inc128 = _mm_setr_epi16(base, base + 2, base + 4, base + 6, base + 8,
+ base + 10, base + 12, base + 14);
+ shift = _mm256_srli_epi16(
+ _mm256_and_si256(
+ _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above),
+ _mm256_set1_epi16(0x3f)),
+ 1);
+ } else {
+ base_inc128 = _mm_setr_epi16(base, base + 1, base + 2, base + 3, base + 4,
+ base + 5, base + 6, base + 7);
+ shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
+ }
+ a0 = _mm256_castsi128_si256(a0_128);
+ a1 = _mm256_castsi128_si256(a1_128);
+ diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shift);
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+ res1 = _mm256_castsi256_si128(res);
+
+ mask128 = _mm_cmpgt_epi16(max_base_x128, base_inc128);
+ dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
+ x += dx;
+ }
+}
+
+static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_4xN_internal_avx2(
+ int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
+ const int frac_bits = 6 - upsample_above;
+ const int max_base_x = ((N + 4) - 1) << upsample_above;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a1, a32, a16;
+ __m256i diff;
+ __m128i a_mbase_x, max_base_x128, base_inc128, mask128;
+
+ a16 = _mm256_set1_epi32(16);
+ a_mbase_x = _mm_set1_epi16(above[max_base_x]);
+ max_base_x128 = _mm_set1_epi32(max_base_x);
+
+ int x = dx;
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, shift;
+ __m128i res1;
+
+ int base = x >> frac_bits;
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ dst[i] = a_mbase_x; // save 4 values
+ }
+ return;
+ }
+
+ a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
+ a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
+
+ if (upsample_above) {
+ a0 = _mm256_permutevar8x32_epi32(
+ a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
+ a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
+ base_inc128 = _mm_setr_epi32(base, base + 2, base + 4, base + 6);
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(
+ _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
+ _mm256_set1_epi32(0x3f)),
+ 1);
+ } else {
+ base_inc128 = _mm_setr_epi32(base, base + 1, base + 2, base + 3);
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
+ }
+
+ diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi32(diff, shift);
+ res = _mm256_add_epi32(a32, b);
+ res = _mm256_srli_epi32(res, 5);
+
+ res1 = _mm256_castsi256_si128(res);
+ res1 = _mm_packus_epi32(res1, res1);
+
+ mask128 = _mm_cmpgt_epi32(max_base_x128, base_inc128);
+ mask128 = _mm_packs_epi32(mask128, mask128); // goto 16 bit
+ dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
+ x += dx;
+ }
+}
+
+static void highbd_dr_prediction_z1_4xN_avx2(int N, uint16_t *dst,
+ ptrdiff_t stride,
+ const uint16_t *above,
+ int upsample_above, int dx,
+ int bd) {
+ __m128i dstvec[16];
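+  // 12-bit content needs 32-bit intermediates; 8- and 10-bit content stays
+  // within 16-bit arithmetic.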
+ if (bd < 12) {
+ highbd_dr_prediction_z1_4xN_internal_avx2(N, dstvec, above, upsample_above,
+ dx);
+ } else {
+ highbd_dr_prediction_32bit_z1_4xN_internal_avx2(N, dstvec, above,
+ upsample_above, dx);
+ }
+ for (int i = 0; i < N; i++) {
+ _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
+ }
+}
+
+static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_8xN_internal_avx2(
+ int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
+ const int frac_bits = 6 - upsample_above;
+ const int max_base_x = ((8 + N) - 1) << upsample_above;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a1, a0_1, a1_1, a32, a16;
+ __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
+
+ a16 = _mm256_set1_epi32(16);
+ a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
+ max_base_x256 = _mm256_set1_epi32(max_base_x);
+
+ int x = dx;
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, res1, shift;
+
+ int base = x >> frac_bits;
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ dst[i] = _mm256_castsi256_si128(a_mbase_x); // save 8 values
+ }
+ return;
+ }
+
+ a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
+ a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
+
+ if (upsample_above) {
+ a0 = _mm256_permutevar8x32_epi32(
+ a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
+ a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
+
+ a0_1 =
+ _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
+ a0_1 = _mm256_permutevar8x32_epi32(
+ a0_1, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
+ a1_1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0_1, 1));
+
+ a0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a0_1), 1);
+ a1 = _mm256_inserti128_si256(a1, _mm256_castsi256_si128(a1_1), 1);
+ base_inc256 =
+ _mm256_setr_epi32(base, base + 2, base + 4, base + 6, base + 8,
+ base + 10, base + 12, base + 14);
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(
+ _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
+ _mm256_set1_epi32(0x3f)),
+ 1);
+ } else {
+ base_inc256 = _mm256_setr_epi32(base, base + 1, base + 2, base + 3,
+ base + 4, base + 5, base + 6, base + 7);
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
+ }
+
+ diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi32(diff, shift);
+ res = _mm256_add_epi32(a32, b);
+ res = _mm256_srli_epi32(res, 5);
+
+ res1 = _mm256_packus_epi32(
+ res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
+
+ mask256 = _mm256_cmpgt_epi32(max_base_x256, base_inc256);
+ mask256 = _mm256_packs_epi32(
+ mask256, _mm256_castsi128_si256(
+ _mm256_extracti128_si256(mask256, 1))); // goto 16 bit
+ res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
+ dst[r] = _mm256_castsi256_si128(res1);
+ x += dx;
+ }
+}
+
+static AOM_FORCE_INLINE void highbd_dr_prediction_z1_8xN_internal_avx2(
+ int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
+ const int frac_bits = 6 - upsample_above;
+ const int max_base_x = ((8 + N) - 1) << upsample_above;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a1, a32, a16, c3f;
+ __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
+ __m128i a0_x128, a1_x128;
+
+ a16 = _mm256_set1_epi16(16);
+ a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
+ max_base_x256 = _mm256_set1_epi16(max_base_x);
+ c3f = _mm256_set1_epi16(0x3f);
+
+ int x = dx;
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, res1, shift;
+
+ int base = x >> frac_bits;
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ dst[i] = _mm256_castsi256_si128(a_mbase_x); // save 8 values
+ }
+ return;
+ }
+
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base));
+ if (upsample_above) {
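+      // The upsampled taps span two 128-bit loads; shuffle each load into
+      // even/odd order and blend on the mask index to build a0 (even) and
+      // a1 (odd).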
+ __m128i mask, atmp0, atmp1, atmp2, atmp3;
+ a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 8));
+ atmp0 = _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdEvenOddMaskx[0]);
+ atmp1 = _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdEvenOddMaskx[0]);
+ atmp2 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16));
+ atmp3 =
+ _mm_shuffle_epi8(a1_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16));
+ mask =
+ _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[0], _mm_set1_epi8(15));
+ a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
+ mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[0] + 16),
+ _mm_set1_epi8(15));
+ a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
+
+ base_inc256 = _mm256_setr_epi16(base, base + 2, base + 4, base + 6,
+ base + 8, base + 10, base + 12, base + 14,
+ 0, 0, 0, 0, 0, 0, 0, 0);
+ shift = _mm256_srli_epi16(
+ _mm256_and_si256(
+ _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
+ 1);
+ } else {
+ a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 1));
+ base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
+ base + 4, base + 5, base + 6, base + 7, 0,
+ 0, 0, 0, 0, 0, 0, 0);
+ shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
+ }
+ a0 = _mm256_castsi128_si256(a0_x128);
+ a1 = _mm256_castsi128_si256(a1_x128);
+
+ diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shift);
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+
+ mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
+ res1 = _mm256_blendv_epi8(a_mbase_x, res, mask256);
+ dst[r] = _mm256_castsi256_si128(res1);
+ x += dx;
+ }
+}
+
+static void highbd_dr_prediction_z1_8xN_avx2(int N, uint16_t *dst,
+ ptrdiff_t stride,
+ const uint16_t *above,
+ int upsample_above, int dx,
+ int bd) {
+ __m128i dstvec[32];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_8xN_internal_avx2(N, dstvec, above, upsample_above,
+ dx);
+ } else {
+ highbd_dr_prediction_32bit_z1_8xN_internal_avx2(N, dstvec, above,
+ upsample_above, dx);
+ }
+ for (int i = 0; i < N; i++) {
+ _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
+ }
+}
+
+static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_16xN_internal_avx2(
+ int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
+ // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+ (void)upsample_above;
+ const int frac_bits = 6;
+ const int max_base_x = ((16 + N) - 1);
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a0_1, a1, a1_1, a32, a16;
+ __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
+
+ a16 = _mm256_set1_epi32(16);
+ a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
+ max_base_x256 = _mm256_set1_epi16(max_base_x);
+
+ int x = dx;
+ for (int r = 0; r < N; r++) {
+ __m256i b, res[2], res1;
+
+ int base = x >> frac_bits;
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ dstvec[i] = a_mbase_x; // save 16 values
+ }
+ return;
+ }
+ __m256i shift = _mm256_srli_epi32(
+ _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
+
+ a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
+ a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
+
+ diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi32(diff, shift);
+
+ res[0] = _mm256_add_epi32(a32, b);
+ res[0] = _mm256_srli_epi32(res[0], 5);
+ res[0] = _mm256_packus_epi32(
+ res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
+
+ int mdif = max_base_x - base;
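+    // The upper 8 lanes are only computed while more than 8 valid above
+    // samples remain; otherwise they all resolve to the clamped edge value.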
+ if (mdif > 8) {
+ a0_1 =
+ _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
+ a1_1 =
+ _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 9)));
+
+ diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi32(diff, shift);
+
+ res[1] = _mm256_add_epi32(a32, b);
+ res[1] = _mm256_srli_epi32(res[1], 5);
+ res[1] = _mm256_packus_epi32(
+ res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
+ } else {
+ res[1] = a_mbase_x;
+ }
+ res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
+ 1); // 16 16bit values
+
+ base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
+ base + 4, base + 5, base + 6, base + 7,
+ base + 8, base + 9, base + 10, base + 11,
+ base + 12, base + 13, base + 14, base + 15);
+ mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
+ dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
+ x += dx;
+ }
+}
+
+static AOM_FORCE_INLINE void highbd_dr_prediction_z1_16xN_internal_avx2(
+ int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
+ // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+ (void)upsample_above;
+ const int frac_bits = 6;
+ const int max_base_x = ((16 + N) - 1);
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a1, a32, a16, c3f;
+ __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
+
+ a16 = _mm256_set1_epi16(16);
+ a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
+ max_base_x256 = _mm256_set1_epi16(max_base_x);
+ c3f = _mm256_set1_epi16(0x3f);
+
+ int x = dx;
+ for (int r = 0; r < N; r++) {
+ __m256i b, res;
+
+ int base = x >> frac_bits;
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ dstvec[i] = a_mbase_x; // save 16 values
+ }
+ return;
+ }
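+    // (x & 0x3f) >> 1 maps the 6-bit sub-pel position to a 0..31 weight.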
+ __m256i shift =
+ _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
+
+ a0 = _mm256_loadu_si256((__m256i *)(above + base));
+ a1 = _mm256_loadu_si256((__m256i *)(above + base + 1));
+
+ diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi16(diff, shift);
+
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5); // 16 16bit values
+
+ base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
+ base + 4, base + 5, base + 6, base + 7,
+ base + 8, base + 9, base + 10, base + 11,
+ base + 12, base + 13, base + 14, base + 15);
+ mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
+ dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res, mask256);
+ x += dx;
+ }
+}
+
+static void highbd_dr_prediction_z1_16xN_avx2(int N, uint16_t *dst,
+ ptrdiff_t stride,
+ const uint16_t *above,
+ int upsample_above, int dx,
+ int bd) {
+ __m256i dstvec[64];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_16xN_internal_avx2(N, dstvec, above, upsample_above,
+ dx);
+ } else {
+ highbd_dr_prediction_32bit_z1_16xN_internal_avx2(N, dstvec, above,
+ upsample_above, dx);
+ }
+ for (int i = 0; i < N; i++) {
+ _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
+ }
+}
+
+static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_32xN_internal_avx2(
+ int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
+ // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+ (void)upsample_above;
+ const int frac_bits = 6;
+ const int max_base_x = ((32 + N) - 1);
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a0_1, a1, a1_1, a32, a16, c3f;
+ __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
+
+ a16 = _mm256_set1_epi32(16);
+ a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
+ max_base_x256 = _mm256_set1_epi16(max_base_x);
+ c3f = _mm256_set1_epi16(0x3f);
+
+ int x = dx;
+ for (int r = 0; r < N; r++) {
+ __m256i b, res[2], res1;
+
+ int base = x >> frac_bits;
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ dstvec[i] = a_mbase_x; // save 32 values
+ dstvec[i + N] = a_mbase_x;
+ }
+ return;
+ }
+
+ __m256i shift =
+ _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
+
+ for (int j = 0; j < 32; j += 16) {
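+      // Each 32-wide row is produced as two 16-pixel halves; the second half
+      // is stored N entries later and written by the caller at column 16.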
+ int mdif = max_base_x - (base + j);
+ if (mdif <= 0) {
+ res1 = a_mbase_x;
+ } else {
+ a0 = _mm256_cvtepu16_epi32(
+ _mm_loadu_si128((__m128i *)(above + base + j)));
+ a1 = _mm256_cvtepu16_epi32(
+ _mm_loadu_si128((__m128i *)(above + base + 1 + j)));
+
+ diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi32(diff, shift);
+
+ res[0] = _mm256_add_epi32(a32, b);
+ res[0] = _mm256_srli_epi32(res[0], 5);
+ res[0] = _mm256_packus_epi32(
+ res[0],
+ _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
+ if (mdif > 8) {
+ a0_1 = _mm256_cvtepu16_epi32(
+ _mm_loadu_si128((__m128i *)(above + base + 8 + j)));
+ a1_1 = _mm256_cvtepu16_epi32(
+ _mm_loadu_si128((__m128i *)(above + base + 9 + j)));
+
+ diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi32(diff, shift);
+
+ res[1] = _mm256_add_epi32(a32, b);
+ res[1] = _mm256_srli_epi32(res[1], 5);
+ res[1] = _mm256_packus_epi32(
+ res[1],
+ _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
+ } else {
+ res[1] = a_mbase_x;
+ }
+ res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
+ 1); // 16 16bit values
+ base_inc256 = _mm256_setr_epi16(
+ base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
+ base + j + 5, base + j + 6, base + j + 7, base + j + 8,
+ base + j + 9, base + j + 10, base + j + 11, base + j + 12,
+ base + j + 13, base + j + 14, base + j + 15);
+
+ mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
+ res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
+ }
+ if (!j) {
+ dstvec[r] = res1;
+ } else {
+ dstvec[r + N] = res1;
+ }
+ }
+ x += dx;
+ }
+}
+
+static AOM_FORCE_INLINE void highbd_dr_prediction_z1_32xN_internal_avx2(
+ int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
+ // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+ (void)upsample_above;
+ const int frac_bits = 6;
+ const int max_base_x = ((32 + N) - 1);
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a1, a32, a16, c3f;
+ __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
+
+ a16 = _mm256_set1_epi16(16);
+ a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
+ max_base_x256 = _mm256_set1_epi16(max_base_x);
+ c3f = _mm256_set1_epi16(0x3f);
+
+ int x = dx;
+ for (int r = 0; r < N; r++) {
+ __m256i b, res;
+
+ int base = x >> frac_bits;
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ dstvec[i] = a_mbase_x; // save 32 values
+ dstvec[i + N] = a_mbase_x;
+ }
+ return;
+ }
+
+ __m256i shift =
+ _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
+
+ for (int j = 0; j < 32; j += 16) {
+ int mdif = max_base_x - (base + j);
+ if (mdif <= 0) {
+ res = a_mbase_x;
+ } else {
+ a0 = _mm256_loadu_si256((__m256i *)(above + base + j));
+ a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j));
+
+ diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi16(diff, shift);
+
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+
+ base_inc256 = _mm256_setr_epi16(
+ base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
+ base + j + 5, base + j + 6, base + j + 7, base + j + 8,
+ base + j + 9, base + j + 10, base + j + 11, base + j + 12,
+ base + j + 13, base + j + 14, base + j + 15);
+
+ mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
+ res = _mm256_blendv_epi8(a_mbase_x, res, mask256);
+ }
+ if (!j) {
+ dstvec[r] = res;
+ } else {
+ dstvec[r + N] = res;
+ }
+ }
+ x += dx;
+ }
+}
+
+static void highbd_dr_prediction_z1_32xN_avx2(int N, uint16_t *dst,
+ ptrdiff_t stride,
+ const uint16_t *above,
+ int upsample_above, int dx,
+ int bd) {
+ __m256i dstvec[128];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above,
+ dx);
+ } else {
+ highbd_dr_prediction_32bit_z1_32xN_internal_avx2(N, dstvec, above,
+ upsample_above, dx);
+ }
+ for (int i = 0; i < N; i++) {
+ _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
+ _mm256_storeu_si256((__m256i *)(dst + stride * i + 16), dstvec[i + N]);
+ }
+}
+
+static void highbd_dr_prediction_32bit_z1_64xN_avx2(int N, uint16_t *dst,
+ ptrdiff_t stride,
+ const uint16_t *above,
+ int upsample_above,
+ int dx) {
+ // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+ (void)upsample_above;
+ const int frac_bits = 6;
+ const int max_base_x = ((64 + N) - 1);
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
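+  // The 64-wide kernels store each 16-pixel chunk straight to dst rather
+  // than staging whole rows in a dstvec array.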
+ __m256i a0, a0_1, a1, a1_1, a32, a16;
+ __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
+
+ a16 = _mm256_set1_epi32(16);
+ a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
+ max_base_x256 = _mm256_set1_epi16(max_base_x);
+
+ int x = dx;
+ for (int r = 0; r < N; r++, dst += stride) {
+ __m256i b, res[2], res1;
+
+ int base = x >> frac_bits;
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 32 values
+ _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x);
+ _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
+ _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x);
+ dst += stride;
+ }
+ return;
+ }
+
+ __m256i shift = _mm256_srli_epi32(
+ _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
+
+ __m128i a0_128, a0_1_128, a1_128, a1_1_128;
+ for (int j = 0; j < 64; j += 16) {
+ int mdif = max_base_x - (base + j);
+ if (mdif <= 0) {
+ _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x);
+ } else {
+ a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
+ a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
+ a0 = _mm256_cvtepu16_epi32(a0_128);
+ a1 = _mm256_cvtepu16_epi32(a1_128);
+
+ diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi32(diff, shift);
+
+ res[0] = _mm256_add_epi32(a32, b);
+ res[0] = _mm256_srli_epi32(res[0], 5);
+ res[0] = _mm256_packus_epi32(
+ res[0],
+ _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
+ if (mdif > 8) {
+ a0_1_128 = _mm_loadu_si128((__m128i *)(above + base + 8 + j));
+ a1_1_128 = _mm_loadu_si128((__m128i *)(above + base + 9 + j));
+ a0_1 = _mm256_cvtepu16_epi32(a0_1_128);
+ a1_1 = _mm256_cvtepu16_epi32(a1_1_128);
+
+ diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi32(diff, shift);
+
+ res[1] = _mm256_add_epi32(a32, b);
+ res[1] = _mm256_srli_epi32(res[1], 5);
+ res[1] = _mm256_packus_epi32(
+ res[1],
+ _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
+ } else {
+ res[1] = a_mbase_x;
+ }
+ res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
+ 1); // 16 16bit values
+ base_inc256 = _mm256_setr_epi16(
+ base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
+ base + j + 5, base + j + 6, base + j + 7, base + j + 8,
+ base + j + 9, base + j + 10, base + j + 11, base + j + 12,
+ base + j + 13, base + j + 14, base + j + 15);
+
+ mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
+ res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
+ _mm256_storeu_si256((__m256i *)(dst + j), res1);
+ }
+ }
+ x += dx;
+ }
+}
+
+static void highbd_dr_prediction_z1_64xN_avx2(int N, uint16_t *dst,
+ ptrdiff_t stride,
+ const uint16_t *above,
+ int upsample_above, int dx) {
+ // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+ (void)upsample_above;
+ const int frac_bits = 6;
+ const int max_base_x = ((64 + N) - 1);
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a1, a32, a16, c3f;
+ __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
+
+ a16 = _mm256_set1_epi16(16);
+ a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
+ max_base_x256 = _mm256_set1_epi16(max_base_x);
+ c3f = _mm256_set1_epi16(0x3f);
+
+ int x = dx;
+ for (int r = 0; r < N; r++, dst += stride) {
+ __m256i b, res;
+
+ int base = x >> frac_bits;
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 32 values
+ _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x);
+ _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
+ _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x);
+ dst += stride;
+ }
+ return;
+ }
+
+ __m256i shift =
+ _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
+
+ for (int j = 0; j < 64; j += 16) {
+ int mdif = max_base_x - (base + j);
+ if (mdif <= 0) {
+ _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x);
+ } else {
+ a0 = _mm256_loadu_si256((__m256i *)(above + base + j));
+ a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j));
+
+ diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi16(diff, shift);
+
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+
+ base_inc256 = _mm256_setr_epi16(
+ base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
+ base + j + 5, base + j + 6, base + j + 7, base + j + 8,
+ base + j + 9, base + j + 10, base + j + 11, base + j + 12,
+ base + j + 13, base + j + 14, base + j + 15);
+
+ mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
+ res = _mm256_blendv_epi8(a_mbase_x, res, mask256);
+ _mm256_storeu_si256((__m256i *)(dst + j), res); // 16 16bit values
+ }
+ }
+ x += dx;
+ }
+}
+
+// Directional prediction, zone 1: 0 < angle < 90
+void av1_highbd_dr_prediction_z1_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint16_t *above,
+ const uint16_t *left, int upsample_above,
+ int dx, int dy, int bd) {
+ (void)left;
+ (void)dy;
+
+ switch (bw) {
+ case 4:
+ highbd_dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above,
+ dx, bd);
+ break;
+ case 8:
+ highbd_dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above,
+ dx, bd);
+ break;
+ case 16:
+ highbd_dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above,
+ dx, bd);
+ break;
+ case 32:
+ highbd_dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above,
+ dx, bd);
+ break;
+ case 64:
+ if (bd < 12) {
+ highbd_dr_prediction_z1_64xN_avx2(bh, dst, stride, above,
+ upsample_above, dx);
+ } else {
+ highbd_dr_prediction_32bit_z1_64xN_avx2(bh, dst, stride, above,
+ upsample_above, dx);
+ }
+ break;
+ default: break;
+ }
+}
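+
+// For reference, a rough scalar sketch of what the z1 kernels above compute
+// (illustrative only):
+//   for (int r = 0, x = dx; r < bh; ++r, x += dx) {
+//     const int base = x >> (6 - upsample_above);
+//     const int shift = ((x << upsample_above) & 0x3f) >> 1;
+//     for (int c = 0; c < bw; ++c) {
+//       const int p = base + (c << upsample_above);
+//       if (p < max_base_x)
+//         dst[r * stride + c] =
+//             (above[p] * 32 + 16 + (above[p + 1] - above[p]) * shift) >> 5;
+//       else
+//         dst[r * stride + c] = above[max_base_x];
+//     }
+//   }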
+
+static void highbd_transpose_TX_16X16(const uint16_t *src, ptrdiff_t pitchSrc,
+ uint16_t *dst, ptrdiff_t pitchDst) {
+ __m256i r[16];
+ __m256i d[16];
+ for (int j = 0; j < 16; j++) {
+ r[j] = _mm256_loadu_si256((__m256i *)(src + j * pitchSrc));
+ }
+ highbd_transpose16x16_avx2(r, d);
+ for (int j = 0; j < 16; j++) {
+ _mm256_storeu_si256((__m256i *)(dst + j * pitchDst), d[j]);
+ }
+}
+
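+// Transpose a width x height block (both multiples of 16) by tiling it into
+// 16x16 transposes.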
+static void highbd_transpose(const uint16_t *src, ptrdiff_t pitchSrc,
+ uint16_t *dst, ptrdiff_t pitchDst, int width,
+ int height) {
+ for (int j = 0; j < height; j += 16)
+ for (int i = 0; i < width; i += 16)
+ highbd_transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
+ dst + j * pitchDst + i, pitchDst);
+}
+
+static void highbd_dr_prediction_32bit_z2_Nx4_avx2(
+ int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+ const uint16_t *left, int upsample_above, int upsample_left, int dx,
+ int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
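+  // Zone 2 rows mix two candidates: lanes whose projected position still
+  // falls inside the above array use the horizontal (x) interpolation, while
+  // lanes to the left of it are re-derived from the left array via dy; the
+  // HighbdBaseMask blend at the bottom of the loop picks per lane.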
+ __m256i a0_x, a1_x, a32, a16;
+ __m256i diff;
+ __m128i c3f, min_base_y128;
+
+ a16 = _mm256_set1_epi32(16);
+ c3f = _mm_set1_epi32(0x3f);
+ min_base_y128 = _mm_set1_epi32(min_base_y);
+
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, shift;
+ __m128i resx, resy, resxy;
+ __m128i a0_x128, a1_x128;
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ int base_shift = 0;
+ if (base_x < (min_base_x - 1)) {
+ base_shift = (min_base_x - base_x - 1) >> upsample_above;
+ }
+ int base_min_diff =
+ (min_base_x - base_x + upsample_above) >> upsample_above;
+ if (base_min_diff > 4) {
+ base_min_diff = 4;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
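+    // base_shift counts leading taps that fall before the start of the above
+    // array (handled by HighbdLoadMaskx); base_min_diff is how many output
+    // pixels must come from the left-edge (y) prediction instead.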
+ if (base_shift > 3) {
+ a0_x = _mm256_setzero_si256();
+ a1_x = _mm256_setzero_si256();
+ shift = _mm256_setzero_si256();
+ } else {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ if (upsample_above) {
+ a0_x128 = _mm_shuffle_epi8(a0_x128,
+ *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
+ a1_x128 = _mm_srli_si128(a0_x128, 8);
+
+ shift = _mm256_castsi128_si256(_mm_srli_epi32(
+ _mm_and_si128(
+ _mm_slli_epi32(
+ _mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
+ (2 << 6) - y * dx, (3 << 6) - y * dx),
+ upsample_above),
+ c3f),
+ 1));
+ } else {
+ a0_x128 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+ a1_x128 = _mm_srli_si128(a0_x128, 2);
+
+ shift = _mm256_castsi128_si256(_mm_srli_epi32(
+ _mm_and_si128(_mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
+ (2 << 6) - y * dx, (3 << 6) - y * dx),
+ c3f),
+ 1));
+ }
+ a0_x = _mm256_cvtepu16_epi32(a0_x128);
+ a1_x = _mm256_cvtepu16_epi32(a1_x128);
+ }
+ // y calc
+ __m128i a0_y, a1_y, shifty;
+ if (base_x < min_base_x) {
+ __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
+ DECLARE_ALIGNED(32, int, base_y_c[4]);
+ r6 = _mm_set1_epi32(r << 6);
+ dy128 = _mm_set1_epi32(dy);
+ c1234 = _mm_setr_epi32(1, 2, 3, 4);
+ y_c128 = _mm_sub_epi32(r6, _mm_mullo_epi32(c1234, dy128));
+ base_y_c128 = _mm_srai_epi32(y_c128, frac_bits_y);
+ mask128 = _mm_cmpgt_epi32(min_base_y128, base_y_c128);
+ base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
+ _mm_store_si128((__m128i *)base_y_c, base_y_c128);
+
+ a0_y = _mm_setr_epi32(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]]);
+ a1_y = _mm_setr_epi32(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
+ left[base_y_c[2] + 1], left[base_y_c[3] + 1]);
+
+ if (upsample_left) {
+ shifty = _mm_srli_epi32(
+ _mm_and_si128(_mm_slli_epi32(y_c128, upsample_left), c3f), 1);
+ } else {
+ shifty = _mm_srli_epi32(_mm_and_si128(y_c128, c3f), 1);
+ }
+ a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
+ a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
+ shift = _mm256_inserti128_si256(shift, shifty, 1);
+ }
+
+ diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi32(diff, shift);
+ res = _mm256_add_epi32(a32, b);
+ res = _mm256_srli_epi32(res, 5);
+
+ resx = _mm256_castsi256_si128(res);
+ resx = _mm_packus_epi32(resx, resx);
+
+ resy = _mm256_extracti128_si256(res, 1);
+ resy = _mm_packus_epi32(resy, resy);
+
+ resxy =
+ _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
+ _mm_storel_epi64((__m128i *)(dst), resxy);
+ dst += stride;
+ }
+}
+
+static void highbd_dr_prediction_z2_Nx4_avx2(
+ int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+ const uint16_t *left, int upsample_above, int upsample_left, int dx,
+ int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0_x, a1_x, a32, a16;
+ __m256i diff;
+ __m128i c3f, min_base_y128;
+
+ a16 = _mm256_set1_epi16(16);
+ c3f = _mm_set1_epi16(0x3f);
+ min_base_y128 = _mm_set1_epi16(min_base_y);
+
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, shift;
+ __m128i resx, resy, resxy;
+ __m128i a0_x128, a1_x128;
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ int base_shift = 0;
+ if (base_x < (min_base_x - 1)) {
+ base_shift = (min_base_x - base_x - 1) >> upsample_above;
+ }
+ int base_min_diff =
+ (min_base_x - base_x + upsample_above) >> upsample_above;
+ if (base_min_diff > 4) {
+ base_min_diff = 4;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift > 3) {
+ a0_x = _mm256_setzero_si256();
+ a1_x = _mm256_setzero_si256();
+ shift = _mm256_setzero_si256();
+ } else {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ if (upsample_above) {
+ a0_x128 = _mm_shuffle_epi8(a0_x128,
+ *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
+ a1_x128 = _mm_srli_si128(a0_x128, 8);
+
+ shift = _mm256_castsi128_si256(_mm_srli_epi16(
+ _mm_and_si128(
+ _mm_slli_epi16(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
+ (2 << 6) - y * dx,
+ (3 << 6) - y * dx, 0, 0, 0, 0),
+ upsample_above),
+ c3f),
+ 1));
+ } else {
+ a0_x128 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+ a1_x128 = _mm_srli_si128(a0_x128, 2);
+
+ shift = _mm256_castsi128_si256(_mm_srli_epi16(
+ _mm_and_si128(
+ _mm_setr_epi16(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
+ (3 << 6) - y * dx, 0, 0, 0, 0),
+ c3f),
+ 1));
+ }
+ a0_x = _mm256_castsi128_si256(a0_x128);
+ a1_x = _mm256_castsi128_si256(a1_x128);
+ }
+ // y calc
+ __m128i a0_y, a1_y, shifty;
+ if (base_x < min_base_x) {
+ __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
+ DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
+ r6 = _mm_set1_epi16(r << 6);
+ dy128 = _mm_set1_epi16(dy);
+ c1234 = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
+ y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
+ base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
+ mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
+ base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
+ _mm_store_si128((__m128i *)base_y_c, base_y_c128);
+
+ a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
+ a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
+ left[base_y_c[2] + 1], left[base_y_c[3] + 1], 0, 0,
+ 0, 0);
+
+ if (upsample_left) {
+ shifty = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
+ } else {
+ shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
+ }
+ a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
+ a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
+ shift = _mm256_inserti128_si256(shift, shifty, 1);
+ }
+
+ diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shift);
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+
+ resx = _mm256_castsi256_si128(res);
+ resy = _mm256_extracti128_si256(res, 1);
+ resxy =
+ _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
+ _mm_storel_epi64((__m128i *)(dst), resxy);
+ dst += stride;
+ }
+}
+
+static void highbd_dr_prediction_32bit_z2_Nx8_avx2(
+ int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+ const uint16_t *left, int upsample_above, int upsample_left, int dx,
+ int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c3f, min_base_y256;
+ __m256i diff;
+ __m128i a0_x128, a1_x128;
+
+ a16 = _mm256_set1_epi32(16);
+ c3f = _mm256_set1_epi32(0x3f);
+ min_base_y256 = _mm256_set1_epi32(min_base_y);
+
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, shift;
+ __m128i resx, resy, resxy;
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ int base_shift = 0;
+ if (base_x < (min_base_x - 1)) {
+ base_shift = (min_base_x - base_x - 1) >> upsample_above;
+ }
+ int base_min_diff =
+ (min_base_x - base_x + upsample_above) >> upsample_above;
+ if (base_min_diff > 8) {
+ base_min_diff = 8;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift > 7) {
+ resx = _mm_setzero_si128();
+ } else {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ if (upsample_above) {
+ __m128i mask, atmp0, atmp1, atmp2, atmp3;
+ a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
+ atmp0 = _mm_shuffle_epi8(a0_x128,
+ *(__m128i *)HighbdEvenOddMaskx[base_shift]);
+ atmp1 = _mm_shuffle_epi8(a1_x128,
+ *(__m128i *)HighbdEvenOddMaskx[base_shift]);
+ atmp2 = _mm_shuffle_epi8(
+ a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
+ atmp3 = _mm_shuffle_epi8(
+ a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
+ mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
+ _mm_set1_epi8(15));
+ a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
+ mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
+ _mm_set1_epi8(15));
+ a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(
+ _mm256_slli_epi32(
+ _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx,
+ (2 << 6) - y * dx, (3 << 6) - y * dx,
+ (4 << 6) - y * dx, (5 << 6) - y * dx,
+ (6 << 6) - y * dx, (7 << 6) - y * dx),
+ upsample_above),
+ c3f),
+ 1);
+ } else {
+ a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
+ a0_x128 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+ a1_x128 =
+ _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(
+ _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
+ (3 << 6) - y * dx, (4 << 6) - y * dx,
+ (5 << 6) - y * dx, (6 << 6) - y * dx,
+ (7 << 6) - y * dx),
+ c3f),
+ 1);
+ }
+ a0_x = _mm256_cvtepu16_epi32(a0_x128);
+ a1_x = _mm256_cvtepu16_epi32(a1_x128);
+
+ diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi32(diff, shift);
+ res = _mm256_add_epi32(a32, b);
+ res = _mm256_srli_epi32(res, 5);
+
+ resx = _mm256_castsi256_si128(_mm256_packus_epi32(
+ res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
+ }
+ // y calc
+ if (base_x < min_base_x) {
+ DECLARE_ALIGNED(32, int, base_y_c[8]);
+ __m256i r6, c256, dy256, y_c256, base_y_c256, mask256;
+ r6 = _mm256_set1_epi32(r << 6);
+ dy256 = _mm256_set1_epi32(dy);
+ c256 = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
+ y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
+ base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
+ mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
+ base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
+ _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+
+ a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
+ left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+ left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]]));
+ a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
+ left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
+ left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
+ left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
+
+ if (upsample_left) {
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(_mm256_slli_epi32((y_c256), upsample_left), c3f),
+ 1);
+ } else {
+ shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
+ }
+ diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi32(diff, shift);
+ res = _mm256_add_epi32(a32, b);
+ res = _mm256_srli_epi32(res, 5);
+
+ resy = _mm256_castsi256_si128(_mm256_packus_epi32(
+ res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
+ } else {
+ resy = resx;
+ }
+ resxy =
+ _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
+ _mm_storeu_si128((__m128i *)(dst), resxy);
+ dst += stride;
+ }
+}
+
+static void highbd_dr_prediction_z2_Nx8_avx2(
+ int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+ const uint16_t *left, int upsample_above, int upsample_left, int dx,
+ int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m128i c3f, min_base_y128;
+ __m256i a0_x, a1_x, diff, a32, a16;
+ __m128i a0_x128, a1_x128;
+
+ a16 = _mm256_set1_epi16(16);
+ c3f = _mm_set1_epi16(0x3f);
+ min_base_y128 = _mm_set1_epi16(min_base_y);
+
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, shift;
+ __m128i resx, resy, resxy;
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ int base_shift = 0;
+ if (base_x < (min_base_x - 1)) {
+ base_shift = (min_base_x - base_x - 1) >> upsample_above;
+ }
+ int base_min_diff =
+ (min_base_x - base_x + upsample_above) >> upsample_above;
+ if (base_min_diff > 8) {
+ base_min_diff = 8;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift > 7) {
+ a0_x = _mm256_setzero_si256();
+ a1_x = _mm256_setzero_si256();
+ shift = _mm256_setzero_si256();
+ } else {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ if (upsample_above) {
+ __m128i mask, atmp0, atmp1, atmp2, atmp3;
+ a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
+ atmp0 = _mm_shuffle_epi8(a0_x128,
+ *(__m128i *)HighbdEvenOddMaskx[base_shift]);
+ atmp1 = _mm_shuffle_epi8(a1_x128,
+ *(__m128i *)HighbdEvenOddMaskx[base_shift]);
+ atmp2 = _mm_shuffle_epi8(
+ a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
+ atmp3 = _mm_shuffle_epi8(
+ a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
+ mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
+ _mm_set1_epi8(15));
+ a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
+ mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
+ _mm_set1_epi8(15));
+ a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
+
+ shift = _mm256_castsi128_si256(_mm_srli_epi16(
+ _mm_and_si128(
+ _mm_slli_epi16(
+ _mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
+ (2 << 6) - y * dx, (3 << 6) - y * dx,
+ (4 << 6) - y * dx, (5 << 6) - y * dx,
+ (6 << 6) - y * dx, (7 << 6) - y * dx),
+ upsample_above),
+ c3f),
+ 1));
+ } else {
+ a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
+ a0_x128 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+ a1_x128 =
+ _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+
+ shift = _mm256_castsi128_si256(_mm_srli_epi16(
+ _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
+ (2 << 6) - y * dx, (3 << 6) - y * dx,
+ (4 << 6) - y * dx, (5 << 6) - y * dx,
+ (6 << 6) - y * dx, (7 << 6) - y * dx),
+ c3f),
+ 1));
+ }
+ a0_x = _mm256_castsi128_si256(a0_x128);
+ a1_x = _mm256_castsi128_si256(a1_x128);
+ }
+
+ // y calc
+ __m128i a0_y, a1_y, shifty;
+ if (base_x < min_base_x) {
+ DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
+ __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
+ r6 = _mm_set1_epi16(r << 6);
+ dy128 = _mm_set1_epi16(dy);
+ c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+ y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
+ base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
+ mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
+ base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
+ _mm_store_si128((__m128i *)base_y_c, base_y_c128);
+
+ a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]],
+ left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]]);
+ a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
+ left[base_y_c[2] + 1], left[base_y_c[3] + 1],
+ left[base_y_c[4] + 1], left[base_y_c[5] + 1],
+ left[base_y_c[6] + 1], left[base_y_c[7] + 1]);
+
+ if (upsample_left) {
+ shifty = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16((y_c128), upsample_left), c3f), 1);
+ } else {
+ shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
+ }
+ a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
+ a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
+ shift = _mm256_inserti128_si256(shift, shifty, 1);
+ }
+
+ diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shift);
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+
+ resx = _mm256_castsi256_si128(res);
+ resy = _mm256_extracti128_si256(res, 1);
+
+ resxy =
+ _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
+ _mm_storeu_si128((__m128i *)(dst), resxy);
+ dst += stride;
+ }
+}
+
+static void highbd_dr_prediction_32bit_z2_HxW_avx2(
+ int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+ const uint16_t *left, int upsample_above, int upsample_left, int dx,
+ int dy) {
+ // here upsample_above and upsample_left are 0 by design of
+ // av1_use_intra_edge_upsample
+ const int min_base_x = -1;
+ const int min_base_y = -1;
+ (void)upsample_above;
+ (void)upsample_left;
+ const int frac_bits_x = 6;
+ const int frac_bits_y = 6;
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0_x, a1_x, a0_y, a1_y, a32, a0_1_x, a1_1_x, a16, c1;
+ __m256i diff, min_base_y256, c3f, dy256, c1234, c0123, c8;
+ __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
+ DECLARE_ALIGNED(32, int, base_y_c[16]);
+
+ a16 = _mm256_set1_epi32(16);
+ c1 = _mm256_srli_epi32(a16, 4);
+ c8 = _mm256_srli_epi32(a16, 1);
+ min_base_y256 = _mm256_set1_epi32(min_base_y);
+ c3f = _mm256_set1_epi32(0x3f);
+ dy256 = _mm256_set1_epi32(dy);
+ c0123 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ c1234 = _mm256_add_epi32(c0123, c1);
+
+ for (int r = 0; r < H; r++) {
+ __m256i b, res, shift, ydx;
+ __m256i resx[2], resy[2];
+ __m256i resxy, j256, r6;
+ for (int j = 0; j < W; j += 16) {
+ j256 = _mm256_set1_epi32(j);
+ int y = r + 1;
+ ydx = _mm256_set1_epi32(y * dx);
+
+ int base_x = ((j << 6) - y * dx) >> frac_bits_x;
+ int base_shift = 0;
+ if ((base_x) < (min_base_x - 1)) {
+ base_shift = (min_base_x - base_x - 1);
+ }
+ int base_min_diff = (min_base_x - base_x);
+ if (base_min_diff > 16) {
+ base_min_diff = 16;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift > 7) {
+ resx[0] = _mm256_setzero_si256();
+ } else {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1));
+ a0_x128 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+ a1_x128 =
+ _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+
+ a0_x = _mm256_cvtepu16_epi32(a0_x128);
+ a1_x = _mm256_cvtepu16_epi32(a1_x128);
+
+ r6 = _mm256_slli_epi32(_mm256_add_epi32(c0123, j256), 6);
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1);
+
+ diff = _mm256_sub_epi32(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi32(diff, shift);
+ res = _mm256_add_epi32(a32, b);
+ res = _mm256_srli_epi32(res, 5);
+
+ resx[0] = _mm256_packus_epi32(
+ res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
+ }
+ int base_shift8 = 0;
+ if ((base_x + 8) < (min_base_x - 1)) {
+ base_shift8 = (min_base_x - (base_x + 8) - 1);
+ }
+ if (base_shift8 > 7) {
+ resx[1] = _mm256_setzero_si256();
+ } else {
+ a0_1_x128 =
+ _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 8));
+ a1_1_x128 =
+ _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 9));
+ a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
+ *(__m128i *)HighbdLoadMaskx[base_shift8]);
+ a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
+ *(__m128i *)HighbdLoadMaskx[base_shift8]);
+
+ a0_1_x = _mm256_cvtepu16_epi32(a0_1_x128);
+ a1_1_x = _mm256_cvtepu16_epi32(a1_1_x128);
+
+ r6 = _mm256_slli_epi32(
+ _mm256_add_epi32(c0123, _mm256_add_epi32(j256, c8)), 6);
+ shift = _mm256_srli_epi32(
+ _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1);
+
+ diff = _mm256_sub_epi32(a1_1_x, a0_1_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_1_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi32(diff, shift);
+
+ resx[1] = _mm256_add_epi32(a32, b);
+ resx[1] = _mm256_srli_epi32(resx[1], 5);
+ resx[1] = _mm256_packus_epi32(
+ resx[1],
+ _mm256_castsi128_si256(_mm256_extracti128_si256(resx[1], 1)));
+ }
+ resx[0] =
+ _mm256_inserti128_si256(resx[0], _mm256_castsi256_si128(resx[1]),
+ 1); // 16 16bit values
+
+ // y calc
+ resy[0] = _mm256_setzero_si256();
+ if ((base_x < min_base_x)) {
+ __m256i c256, y_c256, y_c_1_256, base_y_c256, mask256;
+ r6 = _mm256_set1_epi32(r << 6);
+ c256 = _mm256_add_epi32(j256, c1234);
+ y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
+ base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
+ mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
+ base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
+ _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+ c256 = _mm256_add_epi32(c256, c8);
+ y_c_1_256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
+ base_y_c256 = _mm256_srai_epi32(y_c_1_256, frac_bits_y);
+ mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
+ base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
+ _mm256_store_si256((__m256i *)(base_y_c + 8), base_y_c256);
+
+ a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
+ left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+ left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]]));
+ a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
+ left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
+ left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
+ left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
+
+ shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
+
+ diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi32(diff, shift);
+ res = _mm256_add_epi32(a32, b);
+ res = _mm256_srli_epi32(res, 5);
+
+ resy[0] = _mm256_packus_epi32(
+ res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
+
+ a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
+ left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]],
+ left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]],
+ left[base_y_c[14]], left[base_y_c[15]]));
+ a1_y = _mm256_cvtepu16_epi32(
+ _mm_setr_epi16(left[base_y_c[8] + 1], left[base_y_c[9] + 1],
+ left[base_y_c[10] + 1], left[base_y_c[11] + 1],
+ left[base_y_c[12] + 1], left[base_y_c[13] + 1],
+ left[base_y_c[14] + 1], left[base_y_c[15] + 1]));
+ shift = _mm256_srli_epi32(_mm256_and_si256(y_c_1_256, c3f), 1);
+
+ diff = _mm256_sub_epi32(a1_y, a0_y); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi32(a0_y, 5); // a[x] * 32
+ a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi32(diff, shift);
+ res = _mm256_add_epi32(a32, b);
+ res = _mm256_srli_epi32(res, 5);
+
+ resy[1] = _mm256_packus_epi32(
+ res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
+
+ resy[0] =
+ _mm256_inserti128_si256(resy[0], _mm256_castsi256_si128(resy[1]),
+ 1); // 16 16bit values
+ }
+
+ resxy = _mm256_blendv_epi8(resx[0], resy[0],
+ *(__m256i *)HighbdBaseMask[base_min_diff]);
+ _mm256_storeu_si256((__m256i *)(dst + j), resxy);
+ } // for j
+ dst += stride;
+ }
+}
+
+static void highbd_dr_prediction_z2_HxW_avx2(
+ int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
+ const uint16_t *left, int upsample_above, int upsample_left, int dx,
+ int dy) {
+ // here upsample_above and upsample_left are 0 by design of
+ // av1_use_intra_edge_upsample
+ const int min_base_x = -1;
+ const int min_base_y = -1;
+ (void)upsample_above;
+ (void)upsample_left;
+ const int frac_bits_x = 6;
+ const int frac_bits_y = 6;
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0_x, a1_x, a32, a16, c3f, c1;
+ __m256i diff, min_base_y256, dy256, c1234, c0123;
+ DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
+
+ a16 = _mm256_set1_epi16(16);
+ c1 = _mm256_srli_epi16(a16, 4);
+ min_base_y256 = _mm256_set1_epi16(min_base_y);
+ c3f = _mm256_set1_epi16(0x3f);
+ dy256 = _mm256_set1_epi16(dy);
+ c0123 =
+ _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ c1234 = _mm256_add_epi16(c0123, c1);
+
+ for (int r = 0; r < H; r++) {
+ __m256i b, res, shift;
+ __m256i resx, resy, ydx;
+ __m256i resxy, j256, r6;
+ __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
+ int y = r + 1;
+ ydx = _mm256_set1_epi16((short)(y * dx));
+
+ for (int j = 0; j < W; j += 16) {
+ j256 = _mm256_set1_epi16(j);
+ int base_x = ((j << 6) - y * dx) >> frac_bits_x;
+ int base_shift = 0;
+ if ((base_x) < (min_base_x - 1)) {
+        base_shift = (min_base_x - base_x - 1);
+ }
+ int base_min_diff = (min_base_x - base_x);
+ if (base_min_diff > 16) {
+ base_min_diff = 16;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift < 8) {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1));
+ a0_x128 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+ a1_x128 =
+ _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
+
+ a0_x = _mm256_castsi128_si256(a0_x128);
+ a1_x = _mm256_castsi128_si256(a1_x128);
+ } else {
+ a0_x = _mm256_setzero_si256();
+ a1_x = _mm256_setzero_si256();
+ }
+
+ int base_shift1 = 0;
+ if (base_shift > 8) {
+ base_shift1 = base_shift - 8;
+ }
+ if (base_shift1 < 8) {
+ a0_1_x128 =
+ _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 8));
+ a1_1_x128 =
+ _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 9));
+ a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
+ *(__m128i *)HighbdLoadMaskx[base_shift1]);
+ a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
+ *(__m128i *)HighbdLoadMaskx[base_shift1]);
+
+ a0_x = _mm256_inserti128_si256(a0_x, a0_1_x128, 1);
+ a1_x = _mm256_inserti128_si256(a1_x, a1_1_x128, 1);
+ }
+ r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
+ shift = _mm256_srli_epi16(
+ _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
+
+ diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shift);
+ res = _mm256_add_epi16(a32, b);
+ resx = _mm256_srli_epi16(res, 5); // 16 16-bit values
+
+ // y calc
+ resy = _mm256_setzero_si256();
+ __m256i a0_y, a1_y, shifty;
+ if ((base_x < min_base_x)) {
+ __m256i c256, y_c256, base_y_c256, mask256, mul16;
+ r6 = _mm256_set1_epi16(r << 6);
+ c256 = _mm256_add_epi16(j256, c1234);
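+        // min_base_y256 is all ones, so the shift below yields 0x7fff; the
+        // product c * dy is clamped to it so the 16-bit subtraction cannot
+        // wrap.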
+ mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
+ _mm256_srli_epi16(min_base_y256, 1));
+ y_c256 = _mm256_sub_epi16(r6, mul16);
+ base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
+ mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
+ base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
+ _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+
+ a0_y = _mm256_setr_epi16(
+ left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+ left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
+ left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
+ left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
+ left[base_y_c[15]]);
+ base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
+ _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+
+ a1_y = _mm256_setr_epi16(
+ left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+ left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
+ left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
+ left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
+ left[base_y_c[15]]);
+
+ shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
+
+ diff = _mm256_sub_epi16(a1_y, a0_y); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0_y, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shifty);
+ res = _mm256_add_epi16(a32, b);
+ resy = _mm256_srli_epi16(res, 5);
+ }
+
+ resxy = _mm256_blendv_epi8(resx, resy,
+ *(__m256i *)HighbdBaseMask[base_min_diff]);
+ _mm256_storeu_si256((__m256i *)(dst + j), resxy);
+ } // for j
+ dst += stride;
+ }
+}
+
+// Directional prediction, zone 2: 90 < angle < 180
+void av1_highbd_dr_prediction_z2_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint16_t *above,
+ const uint16_t *left, int upsample_above,
+ int upsample_left, int dx, int dy,
+ int bd) {
+ (void)bd;
+ assert(dx > 0);
+ assert(dy > 0);
+ switch (bw) {
+ case 4:
+ if (bd < 12) {
+ highbd_dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left,
+ upsample_above, upsample_left, dx, dy);
+ } else {
+ highbd_dr_prediction_32bit_z2_Nx4_avx2(bh, dst, stride, above, left,
+ upsample_above, upsample_left,
+ dx, dy);
+ }
+ break;
+ case 8:
+ if (bd < 12) {
+ highbd_dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left,
+ upsample_above, upsample_left, dx, dy);
+ } else {
+ highbd_dr_prediction_32bit_z2_Nx8_avx2(bh, dst, stride, above, left,
+ upsample_above, upsample_left,
+ dx, dy);
+ }
+ break;
+ default:
+ if (bd < 12) {
+ highbd_dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
+ upsample_above, upsample_left, dx, dy);
+ } else {
+ highbd_dr_prediction_32bit_z2_HxW_avx2(bh, bw, dst, stride, above, left,
+ upsample_above, upsample_left,
+ dx, dy);
+ }
+ break;
+ }
+}
+
+// Directional prediction, zone 3 functions
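+// Zone 3 (180 < angle < 270) reads only the left edge: each kernel runs the
+// matching z1 kernel on `left` (stepping by dy) and then transposes the
+// result into dst.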
+static void highbd_dr_prediction_z3_4x4_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ __m128i dstvec[4], d[4];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_4xN_internal_avx2(4, dstvec, left, upsample_left,
+ dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_4xN_internal_avx2(4, dstvec, left,
+ upsample_left, dy);
+ }
+ highbd_transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2],
+ &dstvec[3], &d[0], &d[1], &d[2], &d[3]);
+ _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
+ _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
+ _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
+ _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
+ return;
+}
+
+static void highbd_dr_prediction_z3_8x8_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ __m128i dstvec[8], d[8];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_8xN_internal_avx2(8, dstvec, left, upsample_left,
+ dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_8xN_internal_avx2(8, dstvec, left,
+ upsample_left, dy);
+ }
+ highbd_transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
+ &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
+ &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
+ &d[7]);
+ for (int i = 0; i < 8; i++) {
+ _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+ }
+}
+
+static void highbd_dr_prediction_z3_4x8_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ __m128i dstvec[4], d[8];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_8xN_internal_avx2(4, dstvec, left, upsample_left,
+ dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_8xN_internal_avx2(4, dstvec, left,
+ upsample_left, dy);
+ }
+
+ highbd_transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
+ &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
+ &d[7]);
+ for (int i = 0; i < 8; i++) {
+ _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
+ }
+}
+
+static void highbd_dr_prediction_z3_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ __m128i dstvec[8], d[4];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_4xN_internal_avx2(8, dstvec, left, upsample_left,
+ dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_4xN_internal_avx2(8, dstvec, left,
+ upsample_left, dy);
+ }
+
+ highbd_transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
+ &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
+ &d[0], &d[1], &d[2], &d[3]);
+ _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
+ _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[1]);
+ _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[2]);
+ _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[3]);
+}
+
+static void highbd_dr_prediction_z3_8x16_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ __m256i dstvec[8], d[8];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left,
+ dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_16xN_internal_avx2(8, dstvec, left,
+ upsample_left, dy);
+ }
+ highbd_transpose8x16_16x8_avx2(dstvec, d);
+ for (int i = 0; i < 8; i++) {
+ _mm_storeu_si128((__m128i *)(dst + i * stride),
+ _mm256_castsi256_si128(d[i]));
+ }
+ for (int i = 8; i < 16; i++) {
+ _mm_storeu_si128((__m128i *)(dst + i * stride),
+ _mm256_extracti128_si256(d[i - 8], 1));
+ }
+}
+
+static void highbd_dr_prediction_z3_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ __m128i dstvec[16], d[16];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_8xN_internal_avx2(16, dstvec, left, upsample_left,
+ dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_8xN_internal_avx2(16, dstvec, left,
+ upsample_left, dy);
+ }
+ for (int i = 0; i < 16; i += 8) {
+ highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i],
+ &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i],
+ &dstvec[6 + i], &dstvec[7 + i], &d[0 + i],
+ &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i],
+ &d[5 + i], &d[6 + i], &d[7 + i]);
+ }
+ for (int i = 0; i < 8; i++) {
+ _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+ _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]);
+ }
+}
+
+static void highbd_dr_prediction_z3_4x16_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ __m256i dstvec[4], d[4], d1;
+ if (bd < 12) {
+ highbd_dr_prediction_z1_16xN_internal_avx2(4, dstvec, left, upsample_left,
+ dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_16xN_internal_avx2(4, dstvec, left,
+ upsample_left, dy);
+ }
+ highbd_transpose4x16_avx2(dstvec, d);
+ for (int i = 0; i < 4; i++) {
+ _mm_storel_epi64((__m128i *)(dst + i * stride),
+ _mm256_castsi256_si128(d[i]));
+ d1 = _mm256_bsrli_epi128(d[i], 8);
+ _mm_storel_epi64((__m128i *)(dst + (i + 4) * stride),
+ _mm256_castsi256_si128(d1));
+ _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
+ _mm256_extracti128_si256(d[i], 1));
+ _mm_storel_epi64((__m128i *)(dst + (i + 12) * stride),
+ _mm256_extracti128_si256(d1, 1));
+ }
+}
+
+static void highbd_dr_prediction_z3_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ __m128i dstvec[16], d[8];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_4xN_internal_avx2(16, dstvec, left, upsample_left,
+ dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_4xN_internal_avx2(16, dstvec, left,
+ upsample_left, dy);
+ }
+ highbd_transpose16x4_8x8_sse2(dstvec, d);
+
+ _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
+ _mm_storeu_si128((__m128i *)(dst + 0 * stride + 8), d[1]);
+ _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[2]);
+ _mm_storeu_si128((__m128i *)(dst + 1 * stride + 8), d[3]);
+ _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[4]);
+ _mm_storeu_si128((__m128i *)(dst + 2 * stride + 8), d[5]);
+ _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[6]);
+ _mm_storeu_si128((__m128i *)(dst + 3 * stride + 8), d[7]);
+}
+
+static void highbd_dr_prediction_z3_8x32_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ __m256i dstvec[16], d[16];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left,
+ dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_32xN_internal_avx2(8, dstvec, left,
+ upsample_left, dy);
+ }
+
+ for (int i = 0; i < 16; i += 8) {
+ highbd_transpose8x16_16x8_avx2(dstvec + i, d + i);
+ }
+
+ for (int i = 0; i < 8; i++) {
+ _mm_storeu_si128((__m128i *)(dst + i * stride),
+ _mm256_castsi256_si128(d[i]));
+ }
+ for (int i = 0; i < 8; i++) {
+ _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride),
+ _mm256_extracti128_si256(d[i], 1));
+ }
+ for (int i = 8; i < 16; i++) {
+ _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride),
+ _mm256_castsi256_si128(d[i]));
+ }
+ for (int i = 8; i < 16; i++) {
+ _mm_storeu_si128((__m128i *)(dst + (i + 16) * stride),
+ _mm256_extracti128_si256(d[i], 1));
+ }
+}
+
+static void highbd_dr_prediction_z3_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ __m128i dstvec[32], d[32];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_8xN_internal_avx2(32, dstvec, left, upsample_left,
+ dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_8xN_internal_avx2(32, dstvec, left,
+ upsample_left, dy);
+ }
+
+ for (int i = 0; i < 32; i += 8) {
+ highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i],
+ &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i],
+ &dstvec[6 + i], &dstvec[7 + i], &d[0 + i],
+ &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i],
+ &d[5 + i], &d[6 + i], &d[7 + i]);
+ }
+ for (int i = 0; i < 8; i++) {
+ _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+ _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]);
+ _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 16]);
+ _mm_storeu_si128((__m128i *)(dst + i * stride + 24), d[i + 24]);
+ }
+}
+
+static void highbd_dr_prediction_z3_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ __m256i dstvec[16], d[16];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_16xN_internal_avx2(16, dstvec, left, upsample_left,
+ dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_16xN_internal_avx2(16, dstvec, left,
+ upsample_left, dy);
+ }
+
+ highbd_transpose16x16_avx2(dstvec, d);
+
+ for (int i = 0; i < 16; i++) {
+ _mm256_storeu_si256((__m256i *)(dst + i * stride), d[i]);
+ }
+}
+
+static void highbd_dr_prediction_z3_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ __m256i dstvec[64], d[16];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left,
+ dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_32xN_internal_avx2(32, dstvec, left,
+ upsample_left, dy);
+ }
+ highbd_transpose16x16_avx2(dstvec, d);
+ for (int j = 0; j < 16; j++) {
+ _mm256_storeu_si256((__m256i *)(dst + j * stride), d[j]);
+ }
+ highbd_transpose16x16_avx2(dstvec + 16, d);
+ for (int j = 0; j < 16; j++) {
+ _mm256_storeu_si256((__m256i *)(dst + j * stride + 16), d[j]);
+ }
+ highbd_transpose16x16_avx2(dstvec + 32, d);
+ for (int j = 0; j < 16; j++) {
+ _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride), d[j]);
+ }
+ highbd_transpose16x16_avx2(dstvec + 48, d);
+ for (int j = 0; j < 16; j++) {
+ _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride + 16), d[j]);
+ }
+}
+
+static void highbd_dr_prediction_z3_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ DECLARE_ALIGNED(16, uint16_t, dstT[64 * 64]);
+ if (bd < 12) {
+ highbd_dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_64xN_avx2(64, dstT, 64, left, upsample_left,
+ dy);
+ }
+ highbd_transpose(dstT, 64, dst, stride, 64, 64);
+}
+
+static void highbd_dr_prediction_z3_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ __m256i dstvec[32], d[32];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left,
+ dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_32xN_internal_avx2(16, dstvec, left,
+ upsample_left, dy);
+ }
+ for (int i = 0; i < 32; i += 8) {
+ highbd_transpose8x16_16x8_avx2(dstvec + i, d + i);
+ }
+ // store
+ for (int j = 0; j < 32; j += 16) {
+ for (int i = 0; i < 8; i++) {
+ _mm_storeu_si128((__m128i *)(dst + (i + j) * stride),
+ _mm256_castsi256_si128(d[(i + j)]));
+ }
+ for (int i = 0; i < 8; i++) {
+ _mm_storeu_si128((__m128i *)(dst + (i + j) * stride + 8),
+ _mm256_castsi256_si128(d[(i + j) + 8]));
+ }
+ for (int i = 8; i < 16; i++) {
+ _mm256_storeu_si256(
+ (__m256i *)(dst + (i + j) * stride),
+ _mm256_inserti128_si256(
+ d[(i + j)], _mm256_extracti128_si256(d[(i + j) - 8], 1), 0));
+ }
+ }
+}
+
+static void highbd_dr_prediction_z3_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ __m256i dstvec[32], d[16];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_16xN_internal_avx2(32, dstvec, left, upsample_left,
+ dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_16xN_internal_avx2(32, dstvec, left,
+ upsample_left, dy);
+ }
+ for (int i = 0; i < 32; i += 16) {
+ highbd_transpose16x16_avx2((dstvec + i), d);
+ for (int j = 0; j < 16; j++) {
+ _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]);
+ }
+ }
+}
+
+static void highbd_dr_prediction_z3_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ uint16_t dstT[64 * 32];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_64xN_avx2(32, dstT, 64, left, upsample_left,
+ dy);
+ }
+ highbd_transpose(dstT, 64, dst, stride, 32, 64);
+}
+
+static void highbd_dr_prediction_z3_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ DECLARE_ALIGNED(16, uint16_t, dstT[32 * 64]);
+ highbd_dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy, bd);
+ highbd_transpose(dstT, 32, dst, stride, 64, 32);
+ return;
+}
+
+static void highbd_dr_prediction_z3_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ DECLARE_ALIGNED(16, uint16_t, dstT[64 * 16]);
+ if (bd < 12) {
+ highbd_dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_64xN_avx2(16, dstT, 64, left, upsample_left,
+ dy);
+ }
+ highbd_transpose(dstT, 64, dst, stride, 16, 64);
+}
+
+static void highbd_dr_prediction_z3_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
+ const uint16_t *left,
+ int upsample_left, int dy,
+ int bd) {
+ __m256i dstvec[64], d[16];
+ if (bd < 12) {
+ highbd_dr_prediction_z1_16xN_internal_avx2(64, dstvec, left, upsample_left,
+ dy);
+ } else {
+ highbd_dr_prediction_32bit_z1_16xN_internal_avx2(64, dstvec, left,
+ upsample_left, dy);
+ }
+ for (int i = 0; i < 64; i += 16) {
+ highbd_transpose16x16_avx2((dstvec + i), d);
+ for (int j = 0; j < 16; j++) {
+ _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]);
+ }
+ }
+}
+
+void av1_highbd_dr_prediction_z3_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
+ int bh, const uint16_t *above,
+ const uint16_t *left, int upsample_left,
+ int dx, int dy, int bd) {
+ (void)above;
+ (void)dx;
+
+ assert(dx == 1);
+ assert(dy > 0);
+ if (bw == bh) {
+ switch (bw) {
+ case 4:
+ highbd_dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy,
+ bd);
+ break;
+ case 8:
+ highbd_dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy,
+ bd);
+ break;
+ case 16:
+ highbd_dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy,
+ bd);
+ break;
+ case 32:
+ highbd_dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy,
+ bd);
+ break;
+ case 64:
+ highbd_dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy,
+ bd);
+ break;
+ }
+ } else {
+ if (bw < bh) {
+ if (bw + bw == bh) {
+ switch (bw) {
+ case 4:
+ highbd_dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left,
+ dy, bd);
+ break;
+ case 8:
+ highbd_dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left,
+ dy, bd);
+ break;
+ case 16:
+ highbd_dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left,
+ dy, bd);
+ break;
+ case 32:
+ highbd_dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left,
+ dy, bd);
+ break;
+ }
+ } else {
+ switch (bw) {
+ case 4:
+ highbd_dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left,
+ dy, bd);
+ break;
+ case 8:
+ highbd_dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left,
+ dy, bd);
+ break;
+ case 16:
+ highbd_dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left,
+ dy, bd);
+ break;
+ }
+ }
+ } else {
+ if (bh + bh == bw) {
+ switch (bh) {
+ case 4:
+ highbd_dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left,
+ dy, bd);
+ break;
+ case 8:
+ highbd_dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left,
+ dy, bd);
+ break;
+ case 16:
+ highbd_dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left,
+ dy, bd);
+ break;
+ case 32:
+ highbd_dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left,
+ dy, bd);
+ break;
+ }
+ } else {
+ switch (bh) {
+ case 4:
+ highbd_dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left,
+ dy, bd);
+ break;
+ case 8:
+ highbd_dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left,
+ dy, bd);
+ break;
+ case 16:
+ highbd_dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left,
+ dy, bd);
+ break;
+ }
+ }
+ }
+ }
+ return;
+}
+
+// Low bit depth functions
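+// BaseMask[k] has its first k bytes set to 0xff and the remaining bytes set
+// to zero. Used with blendv, it picks the first k lanes from the second
+// operand and the rest from the first, e.g. keeping k freshly computed
+// pixels of a row and padding the tail with the max-base pixel in zone 1.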
+static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
+};
+
+/* clang-format on */
+static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_avx2(
+ int H, int W, __m128i *dst, const uint8_t *above, int upsample_above,
+ int dx) {
+ const int frac_bits = 6 - upsample_above;
+ const int max_base_x = ((W + H) - 1) << upsample_above;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
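+ // Scalar sketch of one output row in the non-upsampled case (the vector
+ // code below computes up to 16 such pixels at once):
+ //   base   = x >> frac_bits;
+ //   shift  = (x & 0x3f) >> 1;  // fractional position in 1/32 units
+ //   dst[c] = (above[base + c] * 32 + 16 +
+ //             (above[base + c + 1] - above[base + c]) * shift) >> 5;
+ // Columns that would read past max_base_x are replaced with
+ // above[max_base_x] through the BaseMask blend.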
+ __m256i a0, a1, a32, a16;
+ __m256i diff, c3f;
+ __m128i a_mbase_x;
+
+ a16 = _mm256_set1_epi16(16);
+ a_mbase_x = _mm_set1_epi8((int8_t)above[max_base_x]);
+ c3f = _mm256_set1_epi16(0x3f);
+
+ int x = dx;
+ for (int r = 0; r < W; r++) {
+ __m256i b, res, shift;
+ __m128i res1, a0_128, a1_128;
+
+ int base = x >> frac_bits;
+ int base_max_diff = (max_base_x - base) >> upsample_above;
+ if (base_max_diff <= 0) {
+ for (int i = r; i < W; ++i) {
+ dst[i] = a_mbase_x; // save H values
+ }
+ return;
+ }
+ if (base_max_diff > H) base_max_diff = H;
+ a0_128 = _mm_loadu_si128((__m128i *)(above + base));
+ a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1));
+
+ if (upsample_above) {
+ a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)EvenOddMaskx[0]);
+ a1_128 = _mm_srli_si128(a0_128, 8);
+
+ shift = _mm256_srli_epi16(
+ _mm256_and_si256(
+ _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
+ 1);
+ } else {
+ shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
+ }
+ a0 = _mm256_cvtepu8_epi16(a0_128);
+ a1 = _mm256_cvtepu8_epi16(a1_128);
+
+ diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shift);
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+
+ res = _mm256_packus_epi16(
+ res, _mm256_castsi128_si256(
+ _mm256_extracti128_si256(res, 1))); // convert to 8 bit
+ res1 = _mm256_castsi256_si128(res); // 16 8bit values
+
+ dst[r] =
+ _mm_blendv_epi8(a_mbase_x, res1, *(__m128i *)BaseMask[base_max_diff]);
+ x += dx;
+ }
+}
+
+static void dr_prediction_z1_4xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int upsample_above,
+ int dx) {
+ __m128i dstvec[16];
+
+ dr_prediction_z1_HxW_internal_avx2(4, N, dstvec, above, upsample_above, dx);
+ for (int i = 0; i < N; i++) {
+ *(int *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]);
+ }
+}
+
+static void dr_prediction_z1_8xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int upsample_above,
+ int dx) {
+ __m128i dstvec[32];
+
+ dr_prediction_z1_HxW_internal_avx2(8, N, dstvec, above, upsample_above, dx);
+ for (int i = 0; i < N; i++) {
+ _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
+ }
+}
+
+static void dr_prediction_z1_16xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int upsample_above,
+ int dx) {
+ __m128i dstvec[64];
+
+ dr_prediction_z1_HxW_internal_avx2(16, N, dstvec, above, upsample_above, dx);
+ for (int i = 0; i < N; i++) {
+ _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
+ }
+}
+
+static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_avx2(
+ int N, __m256i *dstvec, const uint8_t *above, int upsample_above, int dx) {
+ // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+ (void)upsample_above;
+ const int frac_bits = 6;
+ const int max_base_x = ((32 + N) - 1);
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a1, a32, a16;
+ __m256i a_mbase_x, diff, c3f;
+
+ a16 = _mm256_set1_epi16(16);
+ a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]);
+ c3f = _mm256_set1_epi16(0x3f);
+
+ int x = dx;
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, res16[2];
+ __m128i a0_128, a1_128;
+
+ int base = x >> frac_bits;
+ int base_max_diff = (max_base_x - base);
+ if (base_max_diff <= 0) {
+ for (int i = r; i < N; ++i) {
+ dstvec[i] = a_mbase_x; // save 32 values
+ }
+ return;
+ }
+ if (base_max_diff > 32) base_max_diff = 32;
+ __m256i shift =
+ _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
+
+ for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
+ int mdiff = base_max_diff - j;
+ if (mdiff <= 0) {
+ res16[jj] = a_mbase_x;
+ } else {
+ a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
+ a1_128 = _mm_loadu_si128((__m128i *)(above + base + j + 1));
+ a0 = _mm256_cvtepu8_epi16(a0_128);
+ a1 = _mm256_cvtepu8_epi16(a1_128);
+
+ diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi16(diff, shift);
+
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+ res16[jj] = _mm256_packus_epi16(
+ res, _mm256_castsi128_si256(
+ _mm256_extracti128_si256(res, 1))); // 16 8bit values
+ }
+ }
+ res16[1] =
+ _mm256_inserti128_si256(res16[0], _mm256_castsi256_si128(res16[1]),
+ 1); // 32 8bit values
+
+ dstvec[r] = _mm256_blendv_epi8(
+ a_mbase_x, res16[1],
+ *(__m256i *)BaseMask[base_max_diff]); // 32 8bit values
+ x += dx;
+ }
+}
+
+static void dr_prediction_z1_32xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int upsample_above,
+ int dx) {
+ __m256i dstvec[64];
+ dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, dx);
+ for (int i = 0; i < N; i++) {
+ _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
+ }
+}
+
+static void dr_prediction_z1_64xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int upsample_above,
+ int dx) {
+ // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+ (void)upsample_above;
+ const int frac_bits = 6;
+ const int max_base_x = ((64 + N) - 1);
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i a0, a1, a32, a16;
+ __m256i a_mbase_x, diff, c3f;
+ __m128i max_base_x128, base_inc128, mask128;
+
+ a16 = _mm256_set1_epi16(16);
+ a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]);
+ max_base_x128 = _mm_set1_epi8(max_base_x);
+ c3f = _mm256_set1_epi16(0x3f);
+
+ int x = dx;
+ for (int r = 0; r < N; r++, dst += stride) {
+ __m256i b, res;
+ int base = x >> frac_bits;
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 32 values
+ _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
+ dst += stride;
+ }
+ return;
+ }
+
+ __m256i shift =
+ _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
+
+ __m128i a0_128, a1_128, res128;
+ for (int j = 0; j < 64; j += 16) {
+ int mdif = max_base_x - (base + j);
+ if (mdif <= 0) {
+ _mm_storeu_si128((__m128i *)(dst + j),
+ _mm256_castsi256_si128(a_mbase_x));
+ } else {
+ a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
+ a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
+ a0 = _mm256_cvtepu8_epi16(a0_128);
+ a1 = _mm256_cvtepu8_epi16(a1_128);
+
+ diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+ b = _mm256_mullo_epi16(diff, shift);
+
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+ res = _mm256_packus_epi16(
+ res, _mm256_castsi128_si256(
+ _mm256_extracti128_si256(res, 1))); // 16 8bit values
+
+ base_inc128 =
+ _mm_setr_epi8((int8_t)(base + j), (int8_t)(base + j + 1),
+ (int8_t)(base + j + 2), (int8_t)(base + j + 3),
+ (int8_t)(base + j + 4), (int8_t)(base + j + 5),
+ (int8_t)(base + j + 6), (int8_t)(base + j + 7),
+ (int8_t)(base + j + 8), (int8_t)(base + j + 9),
+ (int8_t)(base + j + 10), (int8_t)(base + j + 11),
+ (int8_t)(base + j + 12), (int8_t)(base + j + 13),
+ (int8_t)(base + j + 14), (int8_t)(base + j + 15));
+
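+ // The unsigned saturating subtract clamps (max_base_x - index) to zero
+ // once the index reaches max_base_x, so the signed byte compare with
+ // zero yields 0xff exactly for the lanes still inside above[].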
+ mask128 = _mm_cmpgt_epi8(_mm_subs_epu8(max_base_x128, base_inc128),
+ _mm_setzero_si128());
+ res128 = _mm_blendv_epi8(_mm256_castsi256_si128(a_mbase_x),
+ _mm256_castsi256_si128(res), mask128);
+ _mm_storeu_si128((__m128i *)(dst + j), res128);
+ }
+ }
+ x += dx;
+ }
+}
+
+// Directional prediction, zone 1: 0 < angle < 90
+void av1_dr_prediction_z1_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int dx, int dy) {
+ (void)left;
+ (void)dy;
+ switch (bw) {
+ case 4:
+ dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above, dx);
+ break;
+ case 8:
+ dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above, dx);
+ break;
+ case 16:
+ dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above, dx);
+ break;
+ case 32:
+ dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above, dx);
+ break;
+ case 64:
+ dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above, dx);
+ break;
+ default: break;
+ }
+ return;
+}
+
+static void dr_prediction_z2_Nx4_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int upsample_left,
+ int dx, int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
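+ // Zone 2 draws from both edges: columns whose projected position lands at
+ // or beyond min_base_x are interpolated from `above` (the x path), the
+ // leading columns before it are interpolated from `left` (the y path),
+ // and the two partial rows are merged with a BaseMask blend.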
+ __m128i a0_x, a1_x, a32, a16, diff;
+ __m128i c3f, min_base_y128, c1234, dy128;
+
+ a16 = _mm_set1_epi16(16);
+ c3f = _mm_set1_epi16(0x3f);
+ min_base_y128 = _mm_set1_epi16(min_base_y);
+ c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0);
+ dy128 = _mm_set1_epi16(dy);
+
+ for (int r = 0; r < N; r++) {
+ __m128i b, res, shift, r6, ydx;
+ __m128i resx, resy, resxy;
+ __m128i a0_x128, a1_x128;
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ int base_shift = 0;
+ if (base_x < (min_base_x - 1)) {
+ base_shift = (min_base_x - base_x - 1) >> upsample_above;
+ }
+ int base_min_diff =
+ (min_base_x - base_x + upsample_above) >> upsample_above;
+ if (base_min_diff > 4) {
+ base_min_diff = 4;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift > 3) {
+ a0_x = _mm_setzero_si128();
+ a1_x = _mm_setzero_si128();
+ shift = _mm_setzero_si128();
+ } else {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ ydx = _mm_set1_epi16(y * dx);
+ r6 = _mm_slli_epi16(c1234, 6);
+
+ if (upsample_above) {
+ a0_x128 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
+ a1_x128 = _mm_srli_si128(a0_x128, 8);
+
+ shift = _mm_srli_epi16(
+ _mm_and_si128(
+ _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
+ 1);
+ } else {
+ a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
+ a1_x128 = _mm_srli_si128(a0_x128, 1);
+
+ shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
+ }
+ a0_x = _mm_cvtepu8_epi16(a0_x128);
+ a1_x = _mm_cvtepu8_epi16(a1_x128);
+ }
+ // y calc
+ __m128i a0_y, a1_y, shifty;
+ if (base_x < min_base_x) {
+ DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
+ __m128i y_c128, base_y_c128, mask128, c1234_;
+ c1234_ = _mm_srli_si128(c1234, 2);
+ r6 = _mm_set1_epi16(r << 6);
+ y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy128));
+ base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
+ mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
+ base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
+ _mm_store_si128((__m128i *)base_y_c, base_y_c128);
+
+ a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
+ base_y_c128 = _mm_add_epi16(base_y_c128, _mm_srli_epi16(a16, 4));
+ _mm_store_si128((__m128i *)base_y_c, base_y_c128);
+ a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
+
+ if (upsample_left) {
+ shifty = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
+ } else {
+ shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
+ }
+ a0_x = _mm_unpacklo_epi64(a0_x, a0_y);
+ a1_x = _mm_unpacklo_epi64(a1_x, a1_y);
+ shift = _mm_unpacklo_epi64(shift, shifty);
+ }
+
+ diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shift);
+ res = _mm_add_epi16(a32, b);
+ res = _mm_srli_epi16(res, 5);
+
+ resx = _mm_packus_epi16(res, res);
+ resy = _mm_srli_si128(resx, 4);
+
+ resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
+ *(int *)(dst) = _mm_cvtsi128_si32(resxy);
+ dst += stride;
+ }
+}
+
+static void dr_prediction_z2_Nx8_avx2(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int upsample_left,
+ int dx, int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m256i diff, a32, a16;
+ __m256i a0_x, a1_x;
+ __m128i a0_x128, a1_x128, min_base_y128, c3f;
+ __m128i c1234, dy128;
+
+ a16 = _mm256_set1_epi16(16);
+ c3f = _mm_set1_epi16(0x3f);
+ min_base_y128 = _mm_set1_epi16(min_base_y);
+ dy128 = _mm_set1_epi16(dy);
+ c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+
+ for (int r = 0; r < N; r++) {
+ __m256i b, res, shift;
+ __m128i resx, resy, resxy, r6, ydx;
+
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ int base_shift = 0;
+ if (base_x < (min_base_x - 1)) {
+ base_shift = (min_base_x - base_x - 1) >> upsample_above;
+ }
+ int base_min_diff =
+ (min_base_x - base_x + upsample_above) >> upsample_above;
+ if (base_min_diff > 8) {
+ base_min_diff = 8;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift > 7) {
+ a0_x = _mm256_setzero_si256();
+ a1_x = _mm256_setzero_si256();
+ shift = _mm256_setzero_si256();
+ } else {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ ydx = _mm_set1_epi16(y * dx);
+ r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6);
+ if (upsample_above) {
+ a0_x128 =
+ _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
+ a1_x128 = _mm_srli_si128(a0_x128, 8);
+
+ shift = _mm256_castsi128_si256(_mm_srli_epi16(
+ _mm_and_si128(
+ _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
+ 1));
+ } else {
+ a1_x128 = _mm_srli_si128(a0_x128, 1);
+ a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
+ a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
+
+ shift = _mm256_castsi128_si256(
+ _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1));
+ }
+ a0_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a0_x128));
+ a1_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a1_x128));
+ }
+
+ // y calc
+ __m128i a0_y, a1_y, shifty;
+ if (base_x < min_base_x) {
+ DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
+ __m128i y_c128, base_y_c128, mask128;
+ r6 = _mm_set1_epi16(r << 6);
+ y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
+ base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
+ mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
+ base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
+ _mm_store_si128((__m128i *)base_y_c, base_y_c128);
+
+ a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]],
+ left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]]);
+ base_y_c128 = _mm_add_epi16(
+ base_y_c128, _mm_srli_epi16(_mm256_castsi256_si128(a16), 4));
+ _mm_store_si128((__m128i *)base_y_c, base_y_c128);
+
+ a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]],
+ left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]]);
+
+ if (upsample_left) {
+ shifty = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
+ } else {
+ shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
+ }
+
+ a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
+ a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
+ shift = _mm256_inserti128_si256(shift, shifty, 1);
+ }
+
+ diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shift);
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5);
+
+ resx = _mm_packus_epi16(_mm256_castsi256_si128(res),
+ _mm256_castsi256_si128(res));
+ resy = _mm256_extracti128_si256(res, 1);
+ resy = _mm_packus_epi16(resy, resy);
+
+ resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
+ _mm_storel_epi64((__m128i *)(dst), resxy);
+ dst += stride;
+ }
+}
+
+static void dr_prediction_z2_HxW_avx2(int H, int W, uint8_t *dst,
+ ptrdiff_t stride, const uint8_t *above,
+ const uint8_t *left, int upsample_above,
+ int upsample_left, int dx, int dy) {
+ // here upsample_above and upsample_left are 0 by design of
+ // av1_use_intra_edge_upsample
+ const int min_base_x = -1;
+ const int min_base_y = -1;
+ (void)upsample_above;
+ (void)upsample_left;
+ const int frac_bits_x = 6;
+ const int frac_bits_y = 6;
+
+ __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c1234, c0123;
+ __m256i diff, min_base_y256, c3f, shifty, dy256, c1;
+ __m128i a0_x128, a1_x128;
+
+ DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
+ a16 = _mm256_set1_epi16(16);
+ c1 = _mm256_srli_epi16(a16, 4);
+ min_base_y256 = _mm256_set1_epi16(min_base_y);
+ c3f = _mm256_set1_epi16(0x3f);
+ dy256 = _mm256_set1_epi16(dy);
+ c0123 =
+ _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
+ c1234 = _mm256_add_epi16(c0123, c1);
+
+ for (int r = 0; r < H; r++) {
+ __m256i b, res, shift, j256, r6, ydx;
+ __m128i resx, resy;
+ __m128i resxy;
+ int y = r + 1;
+ ydx = _mm256_set1_epi16((int16_t)(y * dx));
+
+ int base_x = (-y * dx) >> frac_bits_x;
+ for (int j = 0; j < W; j += 16) {
+ j256 = _mm256_set1_epi16(j);
+ int base_shift = 0;
+ if ((base_x + j) < (min_base_x - 1)) {
+ base_shift = (min_base_x - (base_x + j) - 1);
+ }
+ int base_min_diff = (min_base_x - base_x - j);
+ if (base_min_diff > 16) {
+ base_min_diff = 16;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift < 16) {
+ a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
+ a1_x128 =
+ _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
+ a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
+ a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
+
+ a0_x = _mm256_cvtepu8_epi16(a0_x128);
+ a1_x = _mm256_cvtepu8_epi16(a1_x128);
+
+ r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
+ shift = _mm256_srli_epi16(
+ _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
+
+ diff = _mm256_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shift);
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5); // 16 16-bit values
+ resx = _mm256_castsi256_si128(_mm256_packus_epi16(
+ res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
+ } else {
+ resx = _mm_setzero_si128();
+ }
+
+ // y calc
+ if (base_x < min_base_x) {
+ __m256i c256, y_c256, base_y_c256, mask256, mul16;
+ r6 = _mm256_set1_epi16(r << 6);
+ c256 = _mm256_add_epi16(j256, c1234);
+ mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
+ _mm256_srli_epi16(min_base_y256, 1));
+ y_c256 = _mm256_sub_epi16(r6, mul16);
+
+ base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
+ mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
+
+ base_y_c256 = _mm256_blendv_epi8(base_y_c256, min_base_y256, mask256);
+ int16_t min_y = (int16_t)_mm_extract_epi16(
+ _mm256_extracti128_si256(base_y_c256, 1), 7);
+ int16_t max_y =
+ (int16_t)_mm_extract_epi16(_mm256_castsi256_si128(base_y_c256), 0);
+ int16_t offset_diff = max_y - min_y;
+
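+ // When the 16 left-edge indices needed here span fewer than 16 entries,
+ // load one contiguous 16-byte window starting at left[min_y] and gather
+ // the samples with a byte shuffle; otherwise fall back to 16 scalar
+ // loads through base_y_c[].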
+ if (offset_diff < 16) {
+ __m256i min_y256 = _mm256_set1_epi16(min_y);
+
+ __m256i base_y_offset = _mm256_sub_epi16(base_y_c256, min_y256);
+ __m128i base_y_offset128 =
+ _mm_packs_epi16(_mm256_extracti128_si256(base_y_offset, 0),
+ _mm256_extracti128_si256(base_y_offset, 1));
+
+ __m128i a0_y128 = _mm_maskload_epi32(
+ (int *)(left + min_y), *(__m128i *)LoadMaskz2[offset_diff / 4]);
+ __m128i a1_y128 =
+ _mm_maskload_epi32((int *)(left + min_y + 1),
+ *(__m128i *)LoadMaskz2[offset_diff / 4]);
+ a0_y128 = _mm_shuffle_epi8(a0_y128, base_y_offset128);
+ a1_y128 = _mm_shuffle_epi8(a1_y128, base_y_offset128);
+ a0_y = _mm256_cvtepu8_epi16(a0_y128);
+ a1_y = _mm256_cvtepu8_epi16(a1_y128);
+ } else {
+ base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
+ _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+
+ a0_y = _mm256_setr_epi16(
+ left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+ left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
+ left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
+ left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
+ left[base_y_c[15]]);
+ base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
+ _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
+
+ a1_y = _mm256_setr_epi16(
+ left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
+ left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
+ left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
+ left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
+ left[base_y_c[15]]);
+ }
+ shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
+
+ diff = _mm256_sub_epi16(a1_y, a0_y); // a[x+1] - a[x]
+ a32 = _mm256_slli_epi16(a0_y, 5); // a[x] * 32
+ a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm256_mullo_epi16(diff, shifty);
+ res = _mm256_add_epi16(a32, b);
+ res = _mm256_srli_epi16(res, 5); // 16 16-bit values
+ resy = _mm256_castsi256_si128(_mm256_packus_epi16(
+ res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
+ } else {
+ resy = _mm_setzero_si128();
+ }
+ resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
+ _mm_storeu_si128((__m128i *)(dst + j), resxy);
+ } // for j
+ dst += stride;
+ }
+}
+
+// Directional prediction, zone 2: 90 < angle < 180
+void av1_dr_prediction_z2_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int upsample_left, int dx,
+ int dy) {
+ assert(dx > 0);
+ assert(dy > 0);
+ switch (bw) {
+ case 4:
+ dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, upsample_above,
+ upsample_left, dx, dy);
+ break;
+ case 8:
+ dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, upsample_above,
+ upsample_left, dx, dy);
+ break;
+ default:
+ dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
+ upsample_above, upsample_left, dx, dy);
+ break;
+ }
+ return;
+}
+
+// Directional prediction, zone 3 functions
+static INLINE void transpose16x32_avx2(__m256i *x, __m256i *d) {
+ __m256i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
+ __m256i w10, w11, w12, w13, w14, w15;
+
+ w0 = _mm256_unpacklo_epi8(x[0], x[1]);
+ w1 = _mm256_unpacklo_epi8(x[2], x[3]);
+ w2 = _mm256_unpacklo_epi8(x[4], x[5]);
+ w3 = _mm256_unpacklo_epi8(x[6], x[7]);
+
+ w8 = _mm256_unpacklo_epi8(x[8], x[9]);
+ w9 = _mm256_unpacklo_epi8(x[10], x[11]);
+ w10 = _mm256_unpacklo_epi8(x[12], x[13]);
+ w11 = _mm256_unpacklo_epi8(x[14], x[15]);
+
+ w4 = _mm256_unpacklo_epi16(w0, w1);
+ w5 = _mm256_unpacklo_epi16(w2, w3);
+ w12 = _mm256_unpacklo_epi16(w8, w9);
+ w13 = _mm256_unpacklo_epi16(w10, w11);
+
+ w6 = _mm256_unpacklo_epi32(w4, w5);
+ w7 = _mm256_unpackhi_epi32(w4, w5);
+ w14 = _mm256_unpacklo_epi32(w12, w13);
+ w15 = _mm256_unpackhi_epi32(w12, w13);
+
+ // Store first 4-line result
+ d[0] = _mm256_unpacklo_epi64(w6, w14);
+ d[1] = _mm256_unpackhi_epi64(w6, w14);
+ d[2] = _mm256_unpacklo_epi64(w7, w15);
+ d[3] = _mm256_unpackhi_epi64(w7, w15);
+
+ w4 = _mm256_unpackhi_epi16(w0, w1);
+ w5 = _mm256_unpackhi_epi16(w2, w3);
+ w12 = _mm256_unpackhi_epi16(w8, w9);
+ w13 = _mm256_unpackhi_epi16(w10, w11);
+
+ w6 = _mm256_unpacklo_epi32(w4, w5);
+ w7 = _mm256_unpackhi_epi32(w4, w5);
+ w14 = _mm256_unpacklo_epi32(w12, w13);
+ w15 = _mm256_unpackhi_epi32(w12, w13);
+
+ // Store second 4-line result
+ d[4] = _mm256_unpacklo_epi64(w6, w14);
+ d[5] = _mm256_unpackhi_epi64(w6, w14);
+ d[6] = _mm256_unpacklo_epi64(w7, w15);
+ d[7] = _mm256_unpackhi_epi64(w7, w15);
+
+ // upper half
+ w0 = _mm256_unpackhi_epi8(x[0], x[1]);
+ w1 = _mm256_unpackhi_epi8(x[2], x[3]);
+ w2 = _mm256_unpackhi_epi8(x[4], x[5]);
+ w3 = _mm256_unpackhi_epi8(x[6], x[7]);
+
+ w8 = _mm256_unpackhi_epi8(x[8], x[9]);
+ w9 = _mm256_unpackhi_epi8(x[10], x[11]);
+ w10 = _mm256_unpackhi_epi8(x[12], x[13]);
+ w11 = _mm256_unpackhi_epi8(x[14], x[15]);
+
+ w4 = _mm256_unpacklo_epi16(w0, w1);
+ w5 = _mm256_unpacklo_epi16(w2, w3);
+ w12 = _mm256_unpacklo_epi16(w8, w9);
+ w13 = _mm256_unpacklo_epi16(w10, w11);
+
+ w6 = _mm256_unpacklo_epi32(w4, w5);
+ w7 = _mm256_unpackhi_epi32(w4, w5);
+ w14 = _mm256_unpacklo_epi32(w12, w13);
+ w15 = _mm256_unpackhi_epi32(w12, w13);
+
+ // Store first 4-line result
+ d[8] = _mm256_unpacklo_epi64(w6, w14);
+ d[9] = _mm256_unpackhi_epi64(w6, w14);
+ d[10] = _mm256_unpacklo_epi64(w7, w15);
+ d[11] = _mm256_unpackhi_epi64(w7, w15);
+
+ w4 = _mm256_unpackhi_epi16(w0, w1);
+ w5 = _mm256_unpackhi_epi16(w2, w3);
+ w12 = _mm256_unpackhi_epi16(w8, w9);
+ w13 = _mm256_unpackhi_epi16(w10, w11);
+
+ w6 = _mm256_unpacklo_epi32(w4, w5);
+ w7 = _mm256_unpackhi_epi32(w4, w5);
+ w14 = _mm256_unpacklo_epi32(w12, w13);
+ w15 = _mm256_unpackhi_epi32(w12, w13);
+
+ // Store second 4-line result
+ d[12] = _mm256_unpacklo_epi64(w6, w14);
+ d[13] = _mm256_unpackhi_epi64(w6, w14);
+ d[14] = _mm256_unpacklo_epi64(w7, w15);
+ d[15] = _mm256_unpackhi_epi64(w7, w15);
+}
+
+static void dr_prediction_z3_4x4_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[4], d[4];
+
+ dr_prediction_z1_HxW_internal_avx2(4, 4, dstvec, left, upsample_left, dy);
+ transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
+ &d[0], &d[1], &d[2], &d[3]);
+
+ *(int *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]);
+ *(int *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]);
+ *(int *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]);
+ *(int *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]);
+ return;
+}
+
+static void dr_prediction_z3_8x8_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[8], d[8];
+
+ dr_prediction_z1_HxW_internal_avx2(8, 8, dstvec, left, upsample_left, dy);
+ transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4],
+ &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2],
+ &d[3]);
+
+ _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
+ _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8));
+ _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]);
+ _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8));
+ _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]);
+ _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8));
+ _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]);
+ _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8));
+}
+
+static void dr_prediction_z3_4x8_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[4], d[8];
+
+ dr_prediction_z1_HxW_internal_avx2(8, 4, dstvec, left, upsample_left, dy);
+ transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0],
+ &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
+ for (int i = 0; i < 8; i++) {
+ *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
+ }
+}
+
+static void dr_prediction_z3_8x4_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[8], d[4];
+
+ dr_prediction_z1_HxW_internal_avx2(4, 8, dstvec, left, upsample_left, dy);
+ transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
+ &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0],
+ &d[1], &d[2], &d[3]);
+ _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
+ _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
+ _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
+ _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
+}
+
+static void dr_prediction_z3_8x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[8], d[8];
+
+ dr_prediction_z1_HxW_internal_avx2(16, 8, dstvec, left, upsample_left, dy);
+ transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3,
+ dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d,
+ d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7);
+ for (int i = 0; i < 8; i++) {
+ _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
+ _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
+ _mm_srli_si128(d[i], 8));
+ }
+}
+
+static void dr_prediction_z3_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[16], d[16];
+
+ dr_prediction_z1_HxW_internal_avx2(8, 16, dstvec, left, upsample_left, dy);
+ transpose16x8_8x16_sse2(
+ &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
+ &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
+ &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
+ &d[3], &d[4], &d[5], &d[6], &d[7]);
+
+ for (int i = 0; i < 8; i++) {
+ _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+ }
+}
+
+static void dr_prediction_z3_4x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[4], d[16];
+
+ dr_prediction_z1_HxW_internal_avx2(16, 4, dstvec, left, upsample_left, dy);
+ transpose4x16_sse2(dstvec, d);
+ for (int i = 0; i < 16; i++) {
+ *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
+ }
+}
+
+static void dr_prediction_z3_16x4_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[16], d[8];
+
+ dr_prediction_z1_HxW_internal_avx2(4, 16, dstvec, left, upsample_left, dy);
+ for (int i = 4; i < 8; i++) {
+ d[i] = _mm_setzero_si128();
+ }
+ transpose16x8_8x16_sse2(
+ &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
+ &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
+ &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
+ &d[3], &d[4], &d[5], &d[6], &d[7]);
+
+ for (int i = 0; i < 4; i++) {
+ _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+ }
+}
+
+static void dr_prediction_z3_8x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m256i dstvec[16], d[16];
+
+ dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, dy);
+ for (int i = 8; i < 16; i++) {
+ dstvec[i] = _mm256_setzero_si256();
+ }
+ transpose16x32_avx2(dstvec, d);
+
+ for (int i = 0; i < 16; i++) {
+ _mm_storel_epi64((__m128i *)(dst + i * stride),
+ _mm256_castsi256_si128(d[i]));
+ }
+ for (int i = 0; i < 16; i++) {
+ _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride),
+ _mm256_extracti128_si256(d[i], 1));
+ }
+}
+
+static void dr_prediction_z3_32x8_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[32], d[16];
+
+ dr_prediction_z1_HxW_internal_avx2(8, 32, dstvec, left, upsample_left, dy);
+
+ transpose16x8_8x16_sse2(
+ &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
+ &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
+ &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
+ &d[3], &d[4], &d[5], &d[6], &d[7]);
+ transpose16x8_8x16_sse2(
+ &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16],
+ &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16],
+ &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16],
+ &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16],
+ &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8],
+ &d[6 + 8], &d[7 + 8]);
+
+ for (int i = 0; i < 8; i++) {
+ _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+ _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]);
+ }
+}
+
+static void dr_prediction_z3_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[16], d[16];
+
+ dr_prediction_z1_HxW_internal_avx2(16, 16, dstvec, left, upsample_left, dy);
+ transpose16x16_sse2(dstvec, d);
+
+ for (int i = 0; i < 16; i++) {
+ _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+ }
+}
+
+static void dr_prediction_z3_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m256i dstvec[32], d[32];
+
+ dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, dy);
+ transpose16x32_avx2(dstvec, d);
+ transpose16x32_avx2(dstvec + 16, d + 16);
+ for (int j = 0; j < 16; j++) {
+ _mm_storeu_si128((__m128i *)(dst + j * stride),
+ _mm256_castsi256_si128(d[j]));
+ _mm_storeu_si128((__m128i *)(dst + j * stride + 16),
+ _mm256_castsi256_si128(d[j + 16]));
+ }
+ for (int j = 0; j < 16; j++) {
+ _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
+ _mm256_extracti128_si256(d[j], 1));
+ _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16),
+ _mm256_extracti128_si256(d[j + 16], 1));
+ }
+}
+
+static void dr_prediction_z3_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);
+ dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
+ transpose(dstT, 64, dst, stride, 64, 64);
+}
+
+static void dr_prediction_z3_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m256i dstvec[16], d[16];
+
+ dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, dy);
+ transpose16x32_avx2(dstvec, d);
+ // store
+ for (int j = 0; j < 16; j++) {
+ _mm_storeu_si128((__m128i *)(dst + j * stride),
+ _mm256_castsi256_si128(d[j]));
+ _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
+ _mm256_extracti128_si256(d[j], 1));
+ }
+}
+
+static void dr_prediction_z3_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[32], d[16];
+
+ dr_prediction_z1_HxW_internal_avx2(16, 32, dstvec, left, upsample_left, dy);
+ for (int i = 0; i < 32; i += 16) {
+ transpose16x16_sse2((dstvec + i), d);
+ for (int j = 0; j < 16; j++) {
+ _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
+ }
+ }
+}
+
+static void dr_prediction_z3_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8_t dstT[64 * 32];
+ dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
+ transpose(dstT, 64, dst, stride, 32, 64);
+}
+
+static void dr_prediction_z3_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8_t dstT[32 * 64];
+ dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy);
+ transpose(dstT, 32, dst, stride, 64, 32);
+ return;
+}
+
+static void dr_prediction_z3_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ uint8_t dstT[64 * 16];
+ dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
+ transpose(dstT, 64, dst, stride, 16, 64);
+}
+
+static void dr_prediction_z3_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[64], d[16];
+
+ dr_prediction_z1_HxW_internal_avx2(16, 64, dstvec, left, upsample_left, dy);
+ for (int i = 0; i < 64; i += 16) {
+ transpose16x16_sse2((dstvec + i), d);
+ for (int j = 0; j < 16; j++) {
+ _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
+ }
+ }
+}
+
+void av1_dr_prediction_z3_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_left, int dx, int dy) {
+ (void)above;
+ (void)dx;
+ assert(dx == 1);
+ assert(dy > 0);
+
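+  // Dispatch on block shape: square blocks first, then 1:2 / 2:1 blocks
+  // (bw + bw == bh or bh + bh == bw), and 1:4 / 4:1 blocks otherwise.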
+ if (bw == bh) {
+ switch (bw) {
+ case 4:
+ dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 8:
+ dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 16:
+ dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 32:
+ dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 64:
+ dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ }
+ } else {
+ if (bw < bh) {
+ if (bw + bw == bh) {
+ switch (bw) {
+ case 4:
+ dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 8:
+ dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 16:
+ dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 32:
+ dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ }
+ } else {
+ switch (bw) {
+ case 4:
+ dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 8:
+ dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 16:
+ dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ }
+ }
+ } else {
+ if (bh + bh == bw) {
+ switch (bh) {
+ case 4:
+ dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 8:
+ dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 16:
+ dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 32:
+ dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ }
+ } else {
+ switch (bh) {
+ case 4:
+ dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 8:
+ dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ case 16:
+ dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, dy);
+ break;
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse2.c b/third_party/aom/aom_dsp/x86/intrapred_sse2.c
new file mode 100644
index 0000000000..61e29731c4
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/intrapred_sse2.c
@@ -0,0 +1,1411 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+#include "aom_dsp/x86/intrapred_x86.h"
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void dc_store_4xh(uint32_t dc, int height, uint8_t *dst,
+ ptrdiff_t stride) {
+ for (int i = 0; i < height; i += 2) {
+ *(uint32_t *)dst = dc;
+ dst += stride;
+ *(uint32_t *)dst = dc;
+ dst += stride;
+ }
+}
+
+static INLINE void dc_store_8xh(const __m128i *row, int height, uint8_t *dst,
+ ptrdiff_t stride) {
+ int i;
+ for (i = 0; i < height; ++i) {
+ _mm_storel_epi64((__m128i *)dst, *row);
+ dst += stride;
+ }
+}
+
+static INLINE void dc_store_16xh(const __m128i *row, int height, uint8_t *dst,
+ ptrdiff_t stride) {
+ int i;
+ for (i = 0; i < height; ++i) {
+ _mm_store_si128((__m128i *)dst, *row);
+ dst += stride;
+ }
+}
+
+static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst,
+ ptrdiff_t stride) {
+ int i;
+ for (i = 0; i < height; ++i) {
+ _mm_store_si128((__m128i *)dst, *row);
+ _mm_store_si128((__m128i *)(dst + 16), *row);
+ dst += stride;
+ }
+}
+
+static INLINE void dc_store_64xh(const __m128i *row, int height, uint8_t *dst,
+ ptrdiff_t stride) {
+ for (int i = 0; i < height; ++i) {
+ _mm_store_si128((__m128i *)dst, *row);
+ _mm_store_si128((__m128i *)(dst + 16), *row);
+ _mm_store_si128((__m128i *)(dst + 32), *row);
+ _mm_store_si128((__m128i *)(dst + 48), *row);
+ dst += stride;
+ }
+}
+
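+// The dc_sum_* helpers compute horizontal byte sums with _mm_sad_epu8 against
+// zero: the SAD of a vector with an all-zero vector is the sum of its
+// unsigned bytes, accumulated per 64-bit lane.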
+static INLINE __m128i dc_sum_4(const uint8_t *ref) {
+ __m128i x = _mm_loadl_epi64((__m128i const *)ref);
+ const __m128i zero = _mm_setzero_si128();
+ x = _mm_unpacklo_epi8(x, zero);
+ return _mm_sad_epu8(x, zero);
+}
+
+static INLINE __m128i dc_sum_8(const uint8_t *ref) {
+ __m128i x = _mm_loadl_epi64((__m128i const *)ref);
+ const __m128i zero = _mm_setzero_si128();
+ return _mm_sad_epu8(x, zero);
+}
+
+static INLINE __m128i dc_sum_64(const uint8_t *ref) {
+ __m128i x0 = _mm_load_si128((__m128i const *)ref);
+ __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
+ __m128i x2 = _mm_load_si128((__m128i const *)(ref + 32));
+ __m128i x3 = _mm_load_si128((__m128i const *)(ref + 48));
+ const __m128i zero = _mm_setzero_si128();
+ x0 = _mm_sad_epu8(x0, zero);
+ x1 = _mm_sad_epu8(x1, zero);
+ x2 = _mm_sad_epu8(x2, zero);
+ x3 = _mm_sad_epu8(x3, zero);
+ x0 = _mm_add_epi16(x0, x1);
+ x2 = _mm_add_epi16(x2, x3);
+ x0 = _mm_add_epi16(x0, x2);
+ const __m128i high = _mm_unpackhi_epi64(x0, x0);
+ return _mm_add_epi16(x0, high);
+}
+
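+// Fixed-point reciprocals used for the non-power-of-two pixel counts of
+// rectangular blocks: 0x5556 ~= 2^16 / 3 (1:2 and 2:1 blocks, count = 3 * 2^n)
+// and 0x3334 ~= 2^16 / 5 (1:4 and 4:1 blocks, count = 5 * 2^n). For example,
+// the 4x8 predictor averages 12 reference pixels, so the rounded average
+// (sum + 6) / 12 is computed as (((sum + 6) >> 2) * 0x5556) >> 16, i.e. an
+// exact divide by 4 followed by an approximate divide by 3.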
+#define DC_MULTIPLIER_1X2 0x5556
+#define DC_MULTIPLIER_1X4 0x3334
+
+#define DC_SHIFT2 16
+
+static INLINE int divide_using_multiply_shift(int num, int shift1,
+ int multiplier) {
+ const int interm = num >> shift1;
+ return interm * multiplier >> DC_SHIFT2;
+}
+
+// -----------------------------------------------------------------------------
+// DC_PRED
+
+void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i sum_left = dc_sum_8(left);
+ __m128i sum_above = dc_sum_4(above);
+ sum_above = _mm_add_epi16(sum_left, sum_above);
+
+ uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ sum += 6;
+ sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
+
+ const __m128i row = _mm_set1_epi8((int8_t)sum);
+ const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row);
+ dc_store_4xh(pred, 8, dst, stride);
+}
+
+void aom_dc_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i sum_left = dc_sum_16_sse2(left);
+ __m128i sum_above = dc_sum_4(above);
+ sum_above = _mm_add_epi16(sum_left, sum_above);
+
+ uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ sum += 10;
+ sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
+
+ const __m128i row = _mm_set1_epi8((int8_t)sum);
+ const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(row);
+ dc_store_4xh(pred, 16, dst, stride);
+}
+
+void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i sum_left = dc_sum_4(left);
+ __m128i sum_above = dc_sum_8(above);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ sum += 6;
+ sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X2);
+
+ const __m128i row = _mm_set1_epi8((int8_t)sum);
+ dc_store_8xh(&row, 4, dst, stride);
+}
+
+void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i sum_left = dc_sum_16_sse2(left);
+ __m128i sum_above = dc_sum_8(above);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ sum += 12;
+ sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
+ const __m128i row = _mm_set1_epi8((int8_t)sum);
+ dc_store_8xh(&row, 16, dst, stride);
+}
+
+void aom_dc_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i sum_left = dc_sum_32_sse2(left);
+ __m128i sum_above = dc_sum_8(above);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ sum += 20;
+ sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
+ const __m128i row = _mm_set1_epi8((int8_t)sum);
+ dc_store_8xh(&row, 32, dst, stride);
+}
+
+void aom_dc_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i sum_left = dc_sum_4(left);
+ __m128i sum_above = dc_sum_16_sse2(above);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ sum += 10;
+ sum = divide_using_multiply_shift(sum, 2, DC_MULTIPLIER_1X4);
+ const __m128i row = _mm_set1_epi8((int8_t)sum);
+ dc_store_16xh(&row, 4, dst, stride);
+}
+
+void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i sum_left = dc_sum_8(left);
+ __m128i sum_above = dc_sum_16_sse2(above);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ sum += 12;
+ sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X2);
+ const __m128i row = _mm_set1_epi8((int8_t)sum);
+ dc_store_16xh(&row, 8, dst, stride);
+}
+
+void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i sum_left = dc_sum_32_sse2(left);
+ __m128i sum_above = dc_sum_16_sse2(above);
+ sum_above = _mm_add_epi16(sum_left, sum_above);
+
+ uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ sum += 24;
+ sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
+ const __m128i row = _mm_set1_epi8((int8_t)sum);
+ dc_store_16xh(&row, 32, dst, stride);
+}
+
+void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i sum_left = dc_sum_64(left);
+ __m128i sum_above = dc_sum_16_sse2(above);
+ sum_above = _mm_add_epi16(sum_left, sum_above);
+
+ uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ sum += 40;
+ sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
+ const __m128i row = _mm_set1_epi8((int8_t)sum);
+ dc_store_16xh(&row, 64, dst, stride);
+}
+
+void aom_dc_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i sum_above = dc_sum_32_sse2(above);
+ const __m128i sum_left = dc_sum_8(left);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ sum += 20;
+ sum = divide_using_multiply_shift(sum, 3, DC_MULTIPLIER_1X4);
+ const __m128i row = _mm_set1_epi8((int8_t)sum);
+ dc_store_32xh(&row, 8, dst, stride);
+}
+
+void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i sum_above = dc_sum_32_sse2(above);
+ const __m128i sum_left = dc_sum_16_sse2(left);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ sum += 24;
+ sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X2);
+ const __m128i row = _mm_set1_epi8((int8_t)sum);
+ dc_store_32xh(&row, 16, dst, stride);
+}
+
+void aom_dc_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i sum_above = dc_sum_32_sse2(above);
+ const __m128i sum_left = dc_sum_64(left);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ sum += 48;
+ sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
+ const __m128i row = _mm_set1_epi8((int8_t)sum);
+ dc_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i sum_above = dc_sum_64(above);
+ const __m128i sum_left = dc_sum_64(left);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ sum += 64;
+ sum /= 128;
+ const __m128i row = _mm_set1_epi8((int8_t)sum);
+ dc_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i sum_above = dc_sum_64(above);
+ const __m128i sum_left = dc_sum_32_sse2(left);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ sum += 48;
+ sum = divide_using_multiply_shift(sum, 5, DC_MULTIPLIER_1X2);
+ const __m128i row = _mm_set1_epi8((int8_t)sum);
+ dc_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i sum_above = dc_sum_64(above);
+ const __m128i sum_left = dc_sum_16_sse2(left);
+ sum_above = _mm_add_epi16(sum_above, sum_left);
+
+ uint32_t sum = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ sum += 40;
+ sum = divide_using_multiply_shift(sum, 4, DC_MULTIPLIER_1X4);
+ const __m128i row = _mm_set1_epi8((int8_t)sum);
+ dc_store_64xh(&row, 16, dst, stride);
+}
+
+// -----------------------------------------------------------------------------
+// DC_TOP
+
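+// The DC_TOP variants average only the above row, (sum_above + w / 2) >>
+// log2(w), and broadcast the resulting byte across the whole block.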
+void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_4(above);
+ const __m128i two = _mm_set1_epi16(2);
+ sum_above = _mm_add_epi16(sum_above, two);
+ sum_above = _mm_srai_epi16(sum_above, 2);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ sum_above = _mm_packus_epi16(sum_above, sum_above);
+
+ const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ dc_store_4xh(pred, 8, dst, stride);
+}
+
+void aom_dc_top_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_4(above);
+ const __m128i two = _mm_set1_epi16(2);
+ sum_above = _mm_add_epi16(sum_above, two);
+ sum_above = _mm_srai_epi16(sum_above, 2);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ sum_above = _mm_packus_epi16(sum_above, sum_above);
+
+ const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_above);
+ dc_store_4xh(pred, 16, dst, stride);
+}
+
+void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_8(above);
+ const __m128i four = _mm_set1_epi16(4);
+ sum_above = _mm_add_epi16(sum_above, four);
+ sum_above = _mm_srai_epi16(sum_above, 3);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
+ dc_store_8xh(&row, 4, dst, stride);
+}
+
+void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_8(above);
+ const __m128i four = _mm_set1_epi16(4);
+ sum_above = _mm_add_epi16(sum_above, four);
+ sum_above = _mm_srai_epi16(sum_above, 3);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
+ dc_store_8xh(&row, 16, dst, stride);
+}
+
+void aom_dc_top_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_8(above);
+ const __m128i four = _mm_set1_epi16(4);
+ sum_above = _mm_add_epi16(sum_above, four);
+ sum_above = _mm_srai_epi16(sum_above, 3);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ const __m128i row = _mm_shufflelo_epi16(sum_above, 0);
+ dc_store_8xh(&row, 32, dst, stride);
+}
+
+void aom_dc_top_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_16_sse2(above);
+ const __m128i eight = _mm_set1_epi16(8);
+ sum_above = _mm_add_epi16(sum_above, eight);
+ sum_above = _mm_srai_epi16(sum_above, 4);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+ dc_store_16xh(&row, 4, dst, stride);
+}
+
+void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_16_sse2(above);
+ const __m128i eight = _mm_set1_epi16(8);
+ sum_above = _mm_add_epi16(sum_above, eight);
+ sum_above = _mm_srai_epi16(sum_above, 4);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+ dc_store_16xh(&row, 8, dst, stride);
+}
+
+void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_16_sse2(above);
+ const __m128i eight = _mm_set1_epi16(8);
+ sum_above = _mm_add_epi16(sum_above, eight);
+ sum_above = _mm_srai_epi16(sum_above, 4);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+ dc_store_16xh(&row, 32, dst, stride);
+}
+
+void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_16_sse2(above);
+ const __m128i eight = _mm_set1_epi16(8);
+ sum_above = _mm_add_epi16(sum_above, eight);
+ sum_above = _mm_srai_epi16(sum_above, 4);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+ dc_store_16xh(&row, 64, dst, stride);
+}
+
+void aom_dc_top_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_32_sse2(above);
+ const __m128i sixteen = _mm_set1_epi16(16);
+ sum_above = _mm_add_epi16(sum_above, sixteen);
+ sum_above = _mm_srai_epi16(sum_above, 5);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+ dc_store_32xh(&row, 8, dst, stride);
+}
+
+void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_32_sse2(above);
+ const __m128i sixteen = _mm_set1_epi16(16);
+ sum_above = _mm_add_epi16(sum_above, sixteen);
+ sum_above = _mm_srai_epi16(sum_above, 5);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+ dc_store_32xh(&row, 16, dst, stride);
+}
+
+void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_32_sse2(above);
+ const __m128i sixteen = _mm_set1_epi16(16);
+ sum_above = _mm_add_epi16(sum_above, sixteen);
+ sum_above = _mm_srai_epi16(sum_above, 5);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+ dc_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_64(above);
+ const __m128i thirtytwo = _mm_set1_epi16(32);
+ sum_above = _mm_add_epi16(sum_above, thirtytwo);
+ sum_above = _mm_srai_epi16(sum_above, 6);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+ dc_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_64(above);
+ const __m128i thirtytwo = _mm_set1_epi16(32);
+ sum_above = _mm_add_epi16(sum_above, thirtytwo);
+ sum_above = _mm_srai_epi16(sum_above, 6);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+ dc_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)left;
+ __m128i sum_above = dc_sum_64(above);
+ const __m128i thirtytwo = _mm_set1_epi16(32);
+ sum_above = _mm_add_epi16(sum_above, thirtytwo);
+ sum_above = _mm_srai_epi16(sum_above, 6);
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);
+ dc_store_64xh(&row, 16, dst, stride);
+}
+
+// -----------------------------------------------------------------------------
+// DC_LEFT
+
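+// The DC_LEFT variants mirror DC_TOP: only the left column is averaged and
+// broadcast.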
+void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_8(left);
+ const __m128i four = _mm_set1_epi16(4);
+ sum_left = _mm_add_epi16(sum_left, four);
+ sum_left = _mm_srai_epi16(sum_left, 3);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ sum_left = _mm_packus_epi16(sum_left, sum_left);
+
+ const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left);
+ dc_store_4xh(pred, 8, dst, stride);
+}
+
+void aom_dc_left_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_16_sse2(left);
+ const __m128i eight = _mm_set1_epi16(8);
+ sum_left = _mm_add_epi16(sum_left, eight);
+ sum_left = _mm_srai_epi16(sum_left, 4);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ sum_left = _mm_packus_epi16(sum_left, sum_left);
+
+ const uint32_t pred = (uint32_t)_mm_cvtsi128_si32(sum_left);
+ dc_store_4xh(pred, 16, dst, stride);
+}
+
+void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_4(left);
+ const __m128i two = _mm_set1_epi16(2);
+ sum_left = _mm_add_epi16(sum_left, two);
+ sum_left = _mm_srai_epi16(sum_left, 2);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
+ dc_store_8xh(&row, 4, dst, stride);
+}
+
+void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_16_sse2(left);
+ const __m128i eight = _mm_set1_epi16(8);
+ sum_left = _mm_add_epi16(sum_left, eight);
+ sum_left = _mm_srai_epi16(sum_left, 4);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
+ dc_store_8xh(&row, 16, dst, stride);
+}
+
+void aom_dc_left_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_32_sse2(left);
+ const __m128i sixteen = _mm_set1_epi16(16);
+ sum_left = _mm_add_epi16(sum_left, sixteen);
+ sum_left = _mm_srai_epi16(sum_left, 5);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ const __m128i row = _mm_shufflelo_epi16(sum_left, 0);
+ dc_store_8xh(&row, 32, dst, stride);
+}
+
+void aom_dc_left_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_4(left);
+ const __m128i two = _mm_set1_epi16(2);
+ sum_left = _mm_add_epi16(sum_left, two);
+ sum_left = _mm_srai_epi16(sum_left, 2);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+ dc_store_16xh(&row, 4, dst, stride);
+}
+
+void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_8(left);
+ const __m128i four = _mm_set1_epi16(4);
+ sum_left = _mm_add_epi16(sum_left, four);
+ sum_left = _mm_srai_epi16(sum_left, 3);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+ dc_store_16xh(&row, 8, dst, stride);
+}
+
+void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_32_sse2(left);
+ const __m128i sixteen = _mm_set1_epi16(16);
+ sum_left = _mm_add_epi16(sum_left, sixteen);
+ sum_left = _mm_srai_epi16(sum_left, 5);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+ dc_store_16xh(&row, 32, dst, stride);
+}
+
+void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_64(left);
+ const __m128i thirtytwo = _mm_set1_epi16(32);
+ sum_left = _mm_add_epi16(sum_left, thirtytwo);
+ sum_left = _mm_srai_epi16(sum_left, 6);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+ dc_store_16xh(&row, 64, dst, stride);
+}
+
+void aom_dc_left_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_8(left);
+ const __m128i four = _mm_set1_epi16(4);
+ sum_left = _mm_add_epi16(sum_left, four);
+ sum_left = _mm_srai_epi16(sum_left, 3);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+ dc_store_32xh(&row, 8, dst, stride);
+}
+
+void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_16_sse2(left);
+ const __m128i eight = _mm_set1_epi16(8);
+ sum_left = _mm_add_epi16(sum_left, eight);
+ sum_left = _mm_srai_epi16(sum_left, 4);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+ dc_store_32xh(&row, 16, dst, stride);
+}
+
+void aom_dc_left_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_64(left);
+ const __m128i thirtytwo = _mm_set1_epi16(32);
+ sum_left = _mm_add_epi16(sum_left, thirtytwo);
+ sum_left = _mm_srai_epi16(sum_left, 6);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+ dc_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_left_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_64(left);
+ const __m128i thirtytwo = _mm_set1_epi16(32);
+ sum_left = _mm_add_epi16(sum_left, thirtytwo);
+ sum_left = _mm_srai_epi16(sum_left, 6);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+ dc_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_left_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_32_sse2(left);
+ const __m128i sixteen = _mm_set1_epi16(16);
+ sum_left = _mm_add_epi16(sum_left, sixteen);
+ sum_left = _mm_srai_epi16(sum_left, 5);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+ dc_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_left_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ __m128i sum_left = dc_sum_16_sse2(left);
+ const __m128i eight = _mm_set1_epi16(8);
+ sum_left = _mm_add_epi16(sum_left, eight);
+ sum_left = _mm_srai_epi16(sum_left, 4);
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);
+ const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);
+ dc_store_64xh(&row, 16, dst, stride);
+}
+
+// -----------------------------------------------------------------------------
+// DC_128
+
+void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const uint32_t pred = 0x80808080;
+ dc_store_4xh(pred, 8, dst, stride);
+}
+
+void aom_dc_128_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const uint32_t pred = 0x80808080;
+ dc_store_4xh(pred, 16, dst, stride);
+}
+
+void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((int8_t)128);
+ dc_store_8xh(&row, 4, dst, stride);
+}
+
+void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((int8_t)128);
+ dc_store_8xh(&row, 16, dst, stride);
+}
+
+void aom_dc_128_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((int8_t)128);
+ dc_store_8xh(&row, 32, dst, stride);
+}
+
+void aom_dc_128_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((int8_t)128);
+ dc_store_16xh(&row, 4, dst, stride);
+}
+
+void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((int8_t)128);
+ dc_store_16xh(&row, 8, dst, stride);
+}
+
+void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((int8_t)128);
+ dc_store_16xh(&row, 32, dst, stride);
+}
+
+void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((int8_t)128);
+ dc_store_16xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((int8_t)128);
+ dc_store_32xh(&row, 8, dst, stride);
+}
+
+void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((int8_t)128);
+ dc_store_32xh(&row, 16, dst, stride);
+}
+
+void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((int8_t)128);
+ dc_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((int8_t)128);
+ dc_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((int8_t)128);
+ dc_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)above;
+ (void)left;
+ const __m128i row = _mm_set1_epi8((int8_t)128);
+ dc_store_64xh(&row, 16, dst, stride);
+}
+
+// -----------------------------------------------------------------------------
+// V_PRED
+
+void aom_v_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint32_t pred = *(uint32_t *)above;
+ (void)left;
+ dc_store_4xh(pred, 8, dst, stride);
+}
+
+void aom_v_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const uint32_t pred = *(uint32_t *)above;
+ (void)left;
+ dc_store_4xh(pred, 16, dst, stride);
+}
+
+void aom_v_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i row = _mm_loadl_epi64((__m128i const *)above);
+ (void)left;
+ dc_store_8xh(&row, 4, dst, stride);
+}
+
+void aom_v_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i row = _mm_loadl_epi64((__m128i const *)above);
+ (void)left;
+ dc_store_8xh(&row, 16, dst, stride);
+}
+
+void aom_v_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i row = _mm_loadl_epi64((__m128i const *)above);
+ (void)left;
+ dc_store_8xh(&row, 32, dst, stride);
+}
+
+void aom_v_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i row = _mm_load_si128((__m128i const *)above);
+ (void)left;
+ dc_store_16xh(&row, 4, dst, stride);
+}
+
+void aom_v_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i row = _mm_load_si128((__m128i const *)above);
+ (void)left;
+ dc_store_16xh(&row, 8, dst, stride);
+}
+
+void aom_v_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i row = _mm_load_si128((__m128i const *)above);
+ (void)left;
+ dc_store_16xh(&row, 32, dst, stride);
+}
+
+void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i row = _mm_load_si128((__m128i const *)above);
+ (void)left;
+ dc_store_16xh(&row, 64, dst, stride);
+}
+
+static INLINE void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int height) {
+ const __m128i row0 = _mm_load_si128((__m128i const *)above);
+ const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
+ for (int i = 0; i < height; ++i) {
+ _mm_store_si128((__m128i *)dst, row0);
+ _mm_store_si128((__m128i *)(dst + 16), row1);
+ dst += stride;
+ }
+}
+
+void aom_v_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_predictor_32xh(dst, stride, above, 8);
+}
+
+void aom_v_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_predictor_32xh(dst, stride, above, 16);
+}
+
+void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_predictor_32xh(dst, stride, above, 64);
+}
+
+static INLINE void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, int height) {
+ const __m128i row0 = _mm_load_si128((__m128i const *)above);
+ const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
+ const __m128i row2 = _mm_load_si128((__m128i const *)(above + 32));
+ const __m128i row3 = _mm_load_si128((__m128i const *)(above + 48));
+ for (int i = 0; i < height; ++i) {
+ _mm_store_si128((__m128i *)dst, row0);
+ _mm_store_si128((__m128i *)(dst + 16), row1);
+ _mm_store_si128((__m128i *)(dst + 32), row2);
+ _mm_store_si128((__m128i *)(dst + 48), row3);
+ dst += stride;
+ }
+}
+
+void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_predictor_64xh(dst, stride, above, 64);
+}
+
+void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_predictor_64xh(dst, stride, above, 32);
+}
+
+void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)left;
+ v_predictor_64xh(dst, stride, above, 16);
+}
+
+// -----------------------------------------------------------------------------
+// H_PRED
+
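+// H_PRED fills each row with the corresponding left pixel. The helpers below
+// replicate one pixel across a register with unpack/shuffle operations and
+// store it as a full row.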
+void aom_h_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
+ left_col = _mm_unpacklo_epi8(left_col, left_col);
+ __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
+ __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
+ __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
+ __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
+ *(int *)dst = _mm_cvtsi128_si32(row0);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row1);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row2);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row3);
+ dst += stride;
+ left_col = _mm_unpackhi_epi64(left_col, left_col);
+ row0 = _mm_shufflelo_epi16(left_col, 0);
+ row1 = _mm_shufflelo_epi16(left_col, 0x55);
+ row2 = _mm_shufflelo_epi16(left_col, 0xaa);
+ row3 = _mm_shufflelo_epi16(left_col, 0xff);
+ *(int *)dst = _mm_cvtsi128_si32(row0);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row1);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row2);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row3);
+}
+
+void aom_h_predictor_4x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ const __m128i left_col = _mm_load_si128((__m128i const *)left);
+ __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
+ __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);
+
+ __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
+ __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
+ __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
+ __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
+ *(int *)dst = _mm_cvtsi128_si32(row0);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row1);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row2);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row3);
+ dst += stride;
+
+ left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
+ row0 = _mm_shufflelo_epi16(left_col_low, 0);
+ row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
+ row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
+ row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
+ *(int *)dst = _mm_cvtsi128_si32(row0);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row1);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row2);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row3);
+ dst += stride;
+
+ row0 = _mm_shufflelo_epi16(left_col_high, 0);
+ row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
+ row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
+ row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
+ *(int *)dst = _mm_cvtsi128_si32(row0);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row1);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row2);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row3);
+ dst += stride;
+
+ left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
+ row0 = _mm_shufflelo_epi16(left_col_high, 0);
+ row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
+ row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
+ row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
+ *(int *)dst = _mm_cvtsi128_si32(row0);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row1);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row2);
+ dst += stride;
+ *(int *)dst = _mm_cvtsi128_si32(row3);
+}
+
+void aom_h_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ __m128i left_col = _mm_loadl_epi64((__m128i const *)left);
+ left_col = _mm_unpacklo_epi8(left_col, left_col);
+ __m128i row0 = _mm_shufflelo_epi16(left_col, 0);
+ __m128i row1 = _mm_shufflelo_epi16(left_col, 0x55);
+ __m128i row2 = _mm_shufflelo_epi16(left_col, 0xaa);
+ __m128i row3 = _mm_shufflelo_epi16(left_col, 0xff);
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+}
+
+static INLINE void h_predictor_8x16xc(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left,
+ int count) {
+ (void)above;
+ for (int i = 0; i < count; ++i) {
+ const __m128i left_col = _mm_load_si128((__m128i const *)left);
+ __m128i left_col_low = _mm_unpacklo_epi8(left_col, left_col);
+ __m128i left_col_high = _mm_unpackhi_epi8(left_col, left_col);
+
+ __m128i row0 = _mm_shufflelo_epi16(left_col_low, 0);
+ __m128i row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
+ __m128i row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
+ __m128i row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+ dst += stride;
+
+ left_col_low = _mm_unpackhi_epi64(left_col_low, left_col_low);
+ row0 = _mm_shufflelo_epi16(left_col_low, 0);
+ row1 = _mm_shufflelo_epi16(left_col_low, 0x55);
+ row2 = _mm_shufflelo_epi16(left_col_low, 0xaa);
+ row3 = _mm_shufflelo_epi16(left_col_low, 0xff);
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+ dst += stride;
+
+ row0 = _mm_shufflelo_epi16(left_col_high, 0);
+ row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
+ row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
+ row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+ dst += stride;
+
+ left_col_high = _mm_unpackhi_epi64(left_col_high, left_col_high);
+ row0 = _mm_shufflelo_epi16(left_col_high, 0);
+ row1 = _mm_shufflelo_epi16(left_col_high, 0x55);
+ row2 = _mm_shufflelo_epi16(left_col_high, 0xaa);
+ row3 = _mm_shufflelo_epi16(left_col_high, 0xff);
+ _mm_storel_epi64((__m128i *)dst, row0);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row1);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row2);
+ dst += stride;
+ _mm_storel_epi64((__m128i *)dst, row3);
+ dst += stride;
+ left += 16;
+ }
+}
+
+void aom_h_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ h_predictor_8x16xc(dst, stride, above, left, 1);
+}
+
+void aom_h_predictor_8x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ h_predictor_8x16xc(dst, stride, above, left, 2);
+}
+
+static INLINE void h_pred_store_16xh(const __m128i *row, int h, uint8_t *dst,
+ ptrdiff_t stride) {
+ int i;
+ for (i = 0; i < h; ++i) {
+ _mm_store_si128((__m128i *)dst, row[i]);
+ dst += stride;
+ }
+}
+
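+// The input register holds left pixels duplicated into 16-bit pairs (see the
+// callers). Each helper broadcasts one such pair per output row, producing
+// four 16-byte rows of a single pixel value.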
+static INLINE void repeat_low_4pixels(const __m128i *x, __m128i *row) {
+ const __m128i u0 = _mm_shufflelo_epi16(*x, 0);
+ const __m128i u1 = _mm_shufflelo_epi16(*x, 0x55);
+ const __m128i u2 = _mm_shufflelo_epi16(*x, 0xaa);
+ const __m128i u3 = _mm_shufflelo_epi16(*x, 0xff);
+
+ row[0] = _mm_unpacklo_epi64(u0, u0);
+ row[1] = _mm_unpacklo_epi64(u1, u1);
+ row[2] = _mm_unpacklo_epi64(u2, u2);
+ row[3] = _mm_unpacklo_epi64(u3, u3);
+}
+
+static INLINE void repeat_high_4pixels(const __m128i *x, __m128i *row) {
+ const __m128i u0 = _mm_shufflehi_epi16(*x, 0);
+ const __m128i u1 = _mm_shufflehi_epi16(*x, 0x55);
+ const __m128i u2 = _mm_shufflehi_epi16(*x, 0xaa);
+ const __m128i u3 = _mm_shufflehi_epi16(*x, 0xff);
+
+ row[0] = _mm_unpackhi_epi64(u0, u0);
+ row[1] = _mm_unpackhi_epi64(u1, u1);
+ row[2] = _mm_unpackhi_epi64(u2, u2);
+ row[3] = _mm_unpackhi_epi64(u3, u3);
+}
+
+// Process 16x8, first 4 rows
+// Use first 8 bytes of left register: xxxxxxxx33221100
+static INLINE void h_prediction_16x8_1(const __m128i *left, uint8_t *dst,
+ ptrdiff_t stride) {
+ __m128i row[4];
+ repeat_low_4pixels(left, row);
+ h_pred_store_16xh(row, 4, dst, stride);
+}
+
+// Process 16x8, second 4 rows
+// Use second 8 bytes of left register: 77665544xxxxxxxx
+static INLINE void h_prediction_16x8_2(const __m128i *left, uint8_t *dst,
+ ptrdiff_t stride) {
+ __m128i row[4];
+ repeat_high_4pixels(left, row);
+ h_pred_store_16xh(row, 4, dst, stride);
+}
+
+void aom_h_predictor_16x4_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
+ const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
+ h_prediction_16x8_1(&left_col_8p, dst, stride);
+}
+
+void aom_h_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ const __m128i left_col = _mm_loadl_epi64((const __m128i *)left);
+ const __m128i left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
+ h_prediction_16x8_1(&left_col_8p, dst, stride);
+ dst += stride << 2;
+ h_prediction_16x8_2(&left_col_8p, dst, stride);
+}
+
+static INLINE void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int count) {
+ int i = 0;
+ do {
+ const __m128i left_col = _mm_load_si128((const __m128i *)left);
+ const __m128i left_col_8p_lo = _mm_unpacklo_epi8(left_col, left_col);
+ h_prediction_16x8_1(&left_col_8p_lo, dst, stride);
+ dst += stride << 2;
+ h_prediction_16x8_2(&left_col_8p_lo, dst, stride);
+ dst += stride << 2;
+
+ const __m128i left_col_8p_hi = _mm_unpackhi_epi8(left_col, left_col);
+ h_prediction_16x8_1(&left_col_8p_hi, dst, stride);
+ dst += stride << 2;
+ h_prediction_16x8_2(&left_col_8p_hi, dst, stride);
+ dst += stride << 2;
+
+ left += 16;
+ i++;
+ } while (i < count);
+}
+
+void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ h_predictor_16xh(dst, stride, left, 2);
+}
+
+void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ h_predictor_16xh(dst, stride, left, 4);
+}
+
+static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst,
+ ptrdiff_t stride) {
+ int i;
+ for (i = 0; i < h; ++i) {
+ _mm_store_si128((__m128i *)dst, row[i]);
+ _mm_store_si128((__m128i *)(dst + 16), row[i]);
+ dst += stride;
+ }
+}
+
+// Process 32x8, first 4 rows
+// Use first 8 bytes of left register: xxxxxxxx33221100
+static INLINE void h_prediction_32x8_1(const __m128i *left, uint8_t *dst,
+ ptrdiff_t stride) {
+ __m128i row[4];
+ repeat_low_4pixels(left, row);
+ h_pred_store_32xh(row, 4, dst, stride);
+}
+
+// Process 32x8, second 4 rows
+// Use second 8 bytes of left register: 77665544xxxxxxxx
+static INLINE void h_prediction_32x8_2(const __m128i *left, uint8_t *dst,
+ ptrdiff_t stride) {
+ __m128i row[4];
+ repeat_high_4pixels(left, row);
+ h_pred_store_32xh(row, 4, dst, stride);
+}
+
+void aom_h_predictor_32x8_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i left_col, left_col_8p;
+ (void)above;
+
+ left_col = _mm_load_si128((const __m128i *)left);
+
+ left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
+ h_prediction_32x8_1(&left_col_8p, dst, stride);
+ dst += stride << 2;
+ h_prediction_32x8_2(&left_col_8p, dst, stride);
+}
+
+void aom_h_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i left_col, left_col_8p;
+ (void)above;
+
+ left_col = _mm_load_si128((const __m128i *)left);
+
+ left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
+ h_prediction_32x8_1(&left_col_8p, dst, stride);
+ dst += stride << 2;
+ h_prediction_32x8_2(&left_col_8p, dst, stride);
+ dst += stride << 2;
+
+ left_col_8p = _mm_unpackhi_epi8(left_col, left_col);
+ h_prediction_32x8_1(&left_col_8p, dst, stride);
+ dst += stride << 2;
+ h_prediction_32x8_2(&left_col_8p, dst, stride);
+}
+
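+// Broadcast four left pixels per iteration: two unpacklo_epi8 steps replicate
+// each loaded byte four times, and _mm_shuffle_epi32 then selects the 4-byte
+// group that fills one row.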
+static INLINE void h_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int height) {
+ int i = height >> 2;
+ do {
+ __m128i left4 = _mm_cvtsi32_si128(((int *)left)[0]);
+ left4 = _mm_unpacklo_epi8(left4, left4);
+ left4 = _mm_unpacklo_epi8(left4, left4);
+ const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
+ const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r0);
+ _mm_store_si128((__m128i *)(dst + stride), r1);
+ _mm_store_si128((__m128i *)(dst + stride + 16), r1);
+ const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
+ const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
+ _mm_store_si128((__m128i *)(dst + stride * 2), r2);
+ _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
+ _mm_store_si128((__m128i *)(dst + stride * 3), r3);
+ _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
+ left += 4;
+ dst += stride * 4;
+ } while (--i);
+}
+
+void aom_h_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ h_predictor_32xh(dst, stride, left, 64);
+}
+
+static INLINE void h_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int height) {
+ int i = height >> 2;
+ do {
+ __m128i left4 = _mm_cvtsi32_si128(((int *)left)[0]);
+ left4 = _mm_unpacklo_epi8(left4, left4);
+ left4 = _mm_unpacklo_epi8(left4, left4);
+ const __m128i r0 = _mm_shuffle_epi32(left4, 0x0);
+ const __m128i r1 = _mm_shuffle_epi32(left4, 0x55);
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r0);
+ _mm_store_si128((__m128i *)(dst + 32), r0);
+ _mm_store_si128((__m128i *)(dst + 48), r0);
+ _mm_store_si128((__m128i *)(dst + stride), r1);
+ _mm_store_si128((__m128i *)(dst + stride + 16), r1);
+ _mm_store_si128((__m128i *)(dst + stride + 32), r1);
+ _mm_store_si128((__m128i *)(dst + stride + 48), r1);
+ const __m128i r2 = _mm_shuffle_epi32(left4, 0xaa);
+ const __m128i r3 = _mm_shuffle_epi32(left4, 0xff);
+ _mm_store_si128((__m128i *)(dst + stride * 2), r2);
+ _mm_store_si128((__m128i *)(dst + stride * 2 + 16), r2);
+ _mm_store_si128((__m128i *)(dst + stride * 2 + 32), r2);
+ _mm_store_si128((__m128i *)(dst + stride * 2 + 48), r2);
+ _mm_store_si128((__m128i *)(dst + stride * 3), r3);
+ _mm_store_si128((__m128i *)(dst + stride * 3 + 16), r3);
+ _mm_store_si128((__m128i *)(dst + stride * 3 + 32), r3);
+ _mm_store_si128((__m128i *)(dst + stride * 3 + 48), r3);
+ left += 4;
+ dst += stride * 4;
+ } while (--i);
+}
+
+void aom_h_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ h_predictor_64xh(dst, stride, left, 64);
+}
+
+void aom_h_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ h_predictor_64xh(dst, stride, left, 32);
+}
+
+void aom_h_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ (void)above;
+ h_predictor_64xh(dst, stride, left, 16);
+}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_sse4.c b/third_party/aom/aom_dsp/x86/intrapred_sse4.c
new file mode 100644
index 0000000000..9de8bf3c0f
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/intrapred_sse4.c
@@ -0,0 +1,1307 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "config/av1_rtcd.h"
+#include "aom_dsp/x86/intrapred_x86.h"
+#include "aom_dsp/x86/intrapred_utils.h"
+#include "aom_dsp/x86/lpf_common_sse2.h"
+
+// Low bit depth functions
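+// Mask[0][n] has its first min(n, 16) bytes set to 0xff; Mask[1][n] covers
+// the next 16 lanes, with its first max(n - 16, 0) bytes set to 0xff. They
+// are used with _mm_blendv_epi8 to keep n computed pixels and pad the
+// remainder with the right-most reference pixel.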
+static DECLARE_ALIGNED(16, uint8_t, Mask[2][33][16]) = {
+ { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0,
+ 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0,
+ 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0,
+ 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
+ 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff } },
+ {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
+ 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0,
+ 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0,
+ 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0,
+ 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0 },
+ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
+ 0xff, 0xff, 0xff, 0xff },
+ },
+};
+
+/* clang-format on */
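+// A scalar sketch of what the zone-1 kernels below compute for row r and
+// column c (upsampled-edge handling omitted; names are illustrative only):
+//   x     = dx * (r + 1);
+//   base  = x >> 6;           // integer position on the above edge
+//   shift = (x & 0x3f) >> 1;  // 5-bit fractional weight
+//   pred[r][c] = (above[base + c] * 32 + 16 +
+//                 (above[base + c + 1] - above[base + c]) * shift) >> 5;
+// Columns whose source index would reach max_base_x are replaced with
+// above[max_base_x] via the Mask tables above.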
+static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_sse4_1(
+ int H, int W, __m128i *dst, const uint8_t *above, int upsample_above,
+ int dx) {
+ const int frac_bits = 6 - upsample_above;
+ const int max_base_x = ((W + H) - 1) << upsample_above;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m128i a0, a1, a32, a16;
+ __m128i diff, c3f;
+ __m128i a_mbase_x;
+
+ a16 = _mm_set1_epi16(16);
+ a_mbase_x = _mm_set1_epi8((char)above[max_base_x]);
+ c3f = _mm_set1_epi16(0x3f);
+
+ int x = dx;
+ for (int r = 0; r < W; r++) {
+ __m128i b, res, res1, shift;
+ __m128i a0_above, a1_above;
+
+ int base = x >> frac_bits;
+ int base_max_diff = (max_base_x - base) >> upsample_above;
+ if (base_max_diff <= 0) {
+ for (int i = r; i < W; ++i) {
+ dst[i] = a_mbase_x; // fill the remaining rows with the max-base pixel
+ }
+ return;
+ }
+ if (base_max_diff > H) base_max_diff = H;
+ a0_above = _mm_loadu_si128((__m128i *)(above + base));
+ a1_above = _mm_loadu_si128((__m128i *)(above + base + 1));
+
+ if (upsample_above) {
+ a0_above = _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[0]);
+ a1_above = _mm_srli_si128(a0_above, 8);
+
+ shift = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(_mm_set1_epi16(x), upsample_above), c3f),
+ 1);
+ } else {
+ shift = _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1);
+ }
+ // lower half
+ a0 = _mm_cvtepu8_epi16(a0_above);
+ a1 = _mm_cvtepu8_epi16(a1_above);
+
+ diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shift);
+ res = _mm_add_epi16(a32, b);
+ res = _mm_srli_epi16(res, 5);
+
+ // upper half
+ a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
+ a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));
+
+ diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shift);
+ res1 = _mm_add_epi16(a32, b);
+ res1 = _mm_srli_epi16(res1, 5);
+
+ res = _mm_packus_epi16(res, res1);
+
+ dst[r] =
+ _mm_blendv_epi8(a_mbase_x, res, *(__m128i *)Mask[0][base_max_diff]);
+ x += dx;
+ }
+}
+
+static void dr_prediction_z1_4xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ int upsample_above, int dx) {
+ __m128i dstvec[16];
+
+ dr_prediction_z1_HxW_internal_sse4_1(4, N, dstvec, above, upsample_above, dx);
+ for (int i = 0; i < N; i++) {
+ *(int *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]);
+ }
+}
+
+static void dr_prediction_z1_8xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ int upsample_above, int dx) {
+ __m128i dstvec[32];
+
+ dr_prediction_z1_HxW_internal_sse4_1(8, N, dstvec, above, upsample_above, dx);
+ for (int i = 0; i < N; i++) {
+ _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
+ }
+}
+
+static void dr_prediction_z1_16xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ int upsample_above, int dx) {
+ __m128i dstvec[64];
+
+ dr_prediction_z1_HxW_internal_sse4_1(16, N, dstvec, above, upsample_above,
+ dx);
+ for (int i = 0; i < N; i++) {
+ _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
+ }
+}
+
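+// 32-wide rows do not fit in one XMM register, so this internal helper
+// produces each row as two 16-byte halves (dstvec holds bytes 0..15,
+// dstvec_h holds bytes 16..31).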
+static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_sse4_1(
+ int N, __m128i *dstvec, __m128i *dstvec_h, const uint8_t *above,
+ int upsample_above, int dx) {
+ // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+ (void)upsample_above;
+ const int frac_bits = 6;
+ const int max_base_x = ((32 + N) - 1);
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m128i a0, a1, a32, a16;
+ __m128i a_mbase_x, diff, c3f;
+
+ a16 = _mm_set1_epi16(16);
+ a_mbase_x = _mm_set1_epi8((char)above[max_base_x]);
+ c3f = _mm_set1_epi16(0x3f);
+
+ int x = dx;
+ for (int r = 0; r < N; r++) {
+ __m128i b, res, res1, res16[2];
+ __m128i a0_above, a1_above;
+
+ int base = x >> frac_bits;
+ int base_max_diff = (max_base_x - base);
+ if (base_max_diff <= 0) {
+ for (int i = r; i < N; ++i) {
+ dstvec[i] = a_mbase_x; // save 32 values
+ dstvec_h[i] = a_mbase_x;
+ }
+ return;
+ }
+ if (base_max_diff > 32) base_max_diff = 32;
+ __m128i shift = _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1);
+
+ for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
+ int mdiff = base_max_diff - j;
+ if (mdiff <= 0) {
+ res16[jj] = a_mbase_x;
+ } else {
+ a0_above = _mm_loadu_si128((__m128i *)(above + base + j));
+ a1_above = _mm_loadu_si128((__m128i *)(above + base + j + 1));
+
+ // lower half
+ a0 = _mm_cvtepu8_epi16(a0_above);
+ a1 = _mm_cvtepu8_epi16(a1_above);
+
+ diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+ b = _mm_mullo_epi16(diff, shift);
+
+ res = _mm_add_epi16(a32, b);
+ res = _mm_srli_epi16(res, 5);
+
+ // upper half
+ a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
+ a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));
+
+ diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shift);
+ res1 = _mm_add_epi16(a32, b);
+ res1 = _mm_srli_epi16(res1, 5);
+
+ res16[jj] = _mm_packus_epi16(res, res1); // 16 8bit values
+ }
+ }
+
+ dstvec[r] =
+ _mm_blendv_epi8(a_mbase_x, res16[0],
+ *(__m128i *)Mask[0][base_max_diff]); // 16 8bit values
+
+ dstvec_h[r] =
+ _mm_blendv_epi8(a_mbase_x, res16[1],
+ *(__m128i *)Mask[1][base_max_diff]); // 16 8bit values
+ x += dx;
+ }
+}
+
+static void dr_prediction_z1_32xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ int upsample_above, int dx) {
+ __m128i dstvec[64], dstvec_h[64];
+ dr_prediction_z1_32xN_internal_sse4_1(N, dstvec, dstvec_h, above,
+ upsample_above, dx);
+ for (int i = 0; i < N; i++) {
+ _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
+ _mm_storeu_si128((__m128i *)(dst + stride * i + 16), dstvec_h[i]);
+ }
+}
+
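+// 64-wide rows are written straight to dst. The clamp to above[max_base_x]
+// is done per 16-byte chunk: an unsigned saturating subtract leaves a
+// nonzero byte exactly where base + j + lane < max_base_x, and that byte
+// mask selects between the interpolated result and a_mbase_x.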
+static void dr_prediction_z1_64xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ int upsample_above, int dx) {
+ // here upsample_above is 0 by design of av1_use_intra_edge_upsample
+ (void)upsample_above;
+ const int frac_bits = 6;
+ const int max_base_x = ((64 + N) - 1);
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m128i a0, a1, a32, a16;
+ __m128i a_mbase_x, diff, c3f;
+ __m128i max_base, base_inc, mask;
+
+ a16 = _mm_set1_epi16(16);
+ a_mbase_x = _mm_set1_epi8((char)above[max_base_x]);
+ max_base = _mm_set1_epi8(max_base_x);
+ c3f = _mm_set1_epi16(0x3f);
+
+ int x = dx;
+ for (int r = 0; r < N; r++, dst += stride) {
+ __m128i b, res, res1;
+ int base = x >> frac_bits;
+ if (base >= max_base_x) {
+ for (int i = r; i < N; ++i) {
+ _mm_storeu_si128((__m128i *)dst, a_mbase_x); // fill the 64-wide row
+ _mm_storeu_si128((__m128i *)(dst + 16), a_mbase_x);
+ _mm_storeu_si128((__m128i *)(dst + 32), a_mbase_x);
+ _mm_storeu_si128((__m128i *)(dst + 48), a_mbase_x);
+ dst += stride;
+ }
+ return;
+ }
+
+ __m128i shift =
+ _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1); // same shift in all 8 lanes
+
+ __m128i a0_above, a1_above, res_val;
+ for (int j = 0; j < 64; j += 16) {
+ int mdif = max_base_x - (base + j);
+ if (mdif <= 0) {
+ _mm_storeu_si128((__m128i *)(dst + j), a_mbase_x);
+ } else {
+ a0_above =
+ _mm_loadu_si128((__m128i *)(above + base + j)); // load 16 elements
+ a1_above = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
+
+ // lower half
+ a0 = _mm_cvtepu8_epi16(a0_above);
+ a1 = _mm_cvtepu8_epi16(a1_above);
+
+ diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+ b = _mm_mullo_epi16(diff, shift);
+
+ res = _mm_add_epi16(a32, b);
+ res = _mm_srli_epi16(res, 5);
+
+ // upper half
+ a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
+ a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));
+
+ diff = _mm_sub_epi16(a1, a0); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shift);
+ res1 = _mm_add_epi16(a32, b);
+ res1 = _mm_srli_epi16(res1, 5);
+
+ res = _mm_packus_epi16(res, res1); // 16 8bit values
+
+ base_inc =
+ _mm_setr_epi8((int8_t)(base + j), (int8_t)(base + j + 1),
+ (int8_t)(base + j + 2), (int8_t)(base + j + 3),
+ (int8_t)(base + j + 4), (int8_t)(base + j + 5),
+ (int8_t)(base + j + 6), (int8_t)(base + j + 7),
+ (int8_t)(base + j + 8), (int8_t)(base + j + 9),
+ (int8_t)(base + j + 10), (int8_t)(base + j + 11),
+ (int8_t)(base + j + 12), (int8_t)(base + j + 13),
+ (int8_t)(base + j + 14), (int8_t)(base + j + 15));
+
+ mask = _mm_cmpgt_epi8(_mm_subs_epu8(max_base, base_inc),
+ _mm_setzero_si128());
+ res_val = _mm_blendv_epi8(a_mbase_x, res, mask);
+ _mm_storeu_si128((__m128i *)(dst + j), res_val);
+ }
+ }
+ x += dx;
+ }
+}
+
+// Directional prediction, zone 1: 0 < angle < 90
+void av1_dr_prediction_z1_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int dx, int dy) {
+ (void)left;
+ (void)dy;
+ switch (bw) {
+ case 4:
+ dr_prediction_z1_4xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
+ break;
+ case 8:
+ dr_prediction_z1_8xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
+ break;
+ case 16:
+ dr_prediction_z1_16xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
+ break;
+ case 32:
+ dr_prediction_z1_32xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
+ break;
+ case 64:
+ dr_prediction_z1_64xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
+ break;
+ default: assert(0 && "Invalid block size");
+ }
+ return;
+}
+
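+// Zone-2 rows draw from both edges: columns whose above-edge index falls
+// left of min_base_x are predicted from the left edge instead (the
+// "y calc" blocks below), and the above/left partial results of each row
+// are merged with a byte blend keyed by base_min_diff.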
+static void dr_prediction_z2_Nx4_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left, int upsample_above,
+ int upsample_left, int dx, int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+ assert(dx > 0);
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m128i a0_x, a1_x, a32, diff;
+
+ const __m128i c3f = _mm_set1_epi16(0x3f);
+ const __m128i min_y_base = _mm_set1_epi16(min_base_y);
+ const __m128i c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0);
+ const __m128i dy_reg = _mm_set1_epi16(dy);
+ const __m128i a16 = _mm_set1_epi16(16);
+
+ for (int r = 0; r < N; r++) {
+ __m128i b, res, shift, r6, ydx;
+ __m128i resx, resy, resxy;
+ __m128i a0_above, a1_above;
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ int base_shift = 0;
+ if (base_x < (min_base_x - 1)) {
+ base_shift = (min_base_x - base_x - 1) >> upsample_above;
+ }
+ int base_min_diff =
+ (min_base_x - base_x + upsample_above) >> upsample_above;
+ if (base_min_diff > 4) {
+ base_min_diff = 4;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift > 3) {
+ a0_x = _mm_setzero_si128();
+ a1_x = _mm_setzero_si128();
+ shift = _mm_setzero_si128();
+ } else {
+ a0_above = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ ydx = _mm_set1_epi16(y * dx);
+ r6 = _mm_slli_epi16(c1234, 6);
+
+ if (upsample_above) {
+ a0_above =
+ _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[base_shift]);
+ a1_above = _mm_srli_si128(a0_above, 8);
+
+ shift = _mm_srli_epi16(
+ _mm_and_si128(
+ _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
+ 1);
+ } else {
+ a0_above =
+ _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]);
+ a1_above = _mm_srli_si128(a0_above, 1);
+
+ shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
+ }
+ a0_x = _mm_cvtepu8_epi16(a0_above);
+ a1_x = _mm_cvtepu8_epi16(a1_above);
+ }
+ // y calc
+ __m128i a0_y, a1_y, shifty;
+ if (base_x < min_base_x) {
+ DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
+ __m128i y_c, base_y_c_reg, mask, c1234_;
+ c1234_ = _mm_srli_si128(c1234, 2);
+ r6 = _mm_set1_epi16(r << 6);
+ y_c = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy_reg));
+ base_y_c_reg = _mm_srai_epi16(y_c, frac_bits_y);
+ mask = _mm_cmpgt_epi16(min_y_base, base_y_c_reg);
+ base_y_c_reg = _mm_andnot_si128(mask, base_y_c_reg);
+ _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);
+
+ a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
+ base_y_c_reg = _mm_add_epi16(base_y_c_reg, _mm_srli_epi16(a16, 4));
+ _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);
+ a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
+
+ if (upsample_left) {
+ shifty = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(y_c, upsample_left), c3f), 1);
+ } else {
+ shifty = _mm_srli_epi16(_mm_and_si128(y_c, c3f), 1);
+ }
+ a0_x = _mm_unpacklo_epi64(a0_x, a0_y);
+ a1_x = _mm_unpacklo_epi64(a1_x, a1_y);
+ shift = _mm_unpacklo_epi64(shift, shifty);
+ }
+
+ diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shift);
+ res = _mm_add_epi16(a32, b);
+ res = _mm_srli_epi16(res, 5);
+
+ resx = _mm_packus_epi16(res, res);
+ resy = _mm_srli_si128(resx, 4);
+
+ resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]);
+ *(int *)(dst) = _mm_cvtsi128_si32(resxy);
+ dst += stride;
+ }
+}
+
+static void dr_prediction_z2_Nx8_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left, int upsample_above,
+ int upsample_left, int dx, int dy) {
+ const int min_base_x = -(1 << upsample_above);
+ const int min_base_y = -(1 << upsample_left);
+ const int frac_bits_x = 6 - upsample_above;
+ const int frac_bits_y = 6 - upsample_left;
+
+ // pre-filter above pixels
+ // store in temp buffers:
+ // above[x] * 32 + 16
+ // above[x+1] - above[x]
+ // final pixels will be calculated as:
+ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
+ __m128i diff, a32;
+ __m128i a0_x, a1_x, a0_y, a1_y;
+ __m128i a0_above, a1_above;
+
+ const __m128i a16 = _mm_set1_epi16(16);
+ const __m128i c3f = _mm_set1_epi16(0x3f);
+ const __m128i min_y_base = _mm_set1_epi16(min_base_y);
+ const __m128i dy_reg = _mm_set1_epi16(dy);
+ const __m128i c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
+
+ for (int r = 0; r < N; r++) {
+ __m128i b, res, res1, shift;
+ __m128i resx, resy, resxy, r6, ydx;
+
+ int y = r + 1;
+ int base_x = (-y * dx) >> frac_bits_x;
+ int base_shift = 0;
+ if (base_x < (min_base_x - 1)) {
+ base_shift = (min_base_x - base_x - 1) >> upsample_above;
+ }
+ int base_min_diff =
+ (min_base_x - base_x + upsample_above) >> upsample_above;
+ if (base_min_diff > 8) {
+ base_min_diff = 8;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift > 7) {
+ resx = _mm_setzero_si128();
+ } else {
+ a0_above = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
+ ydx = _mm_set1_epi16(y * dx);
+ r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6);
+ if (upsample_above) {
+ a0_above =
+ _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[base_shift]);
+ a1_above = _mm_srli_si128(a0_above, 8);
+
+ shift = _mm_srli_epi16(
+ _mm_and_si128(
+ _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
+ 1);
+ } else {
+ a1_above = _mm_srli_si128(a0_above, 1);
+ a0_above =
+ _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]);
+ a1_above =
+ _mm_shuffle_epi8(a1_above, *(__m128i *)LoadMaskx[base_shift]);
+
+ shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
+ }
+ a0_x = _mm_cvtepu8_epi16(a0_above);
+ a1_x = _mm_cvtepu8_epi16(a1_above);
+
+ diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shift);
+ res = _mm_add_epi16(a32, b);
+ res = _mm_srli_epi16(res, 5);
+ resx = _mm_packus_epi16(res, res);
+ }
+
+ // y calc
+ if (base_x < min_base_x) {
+ DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
+ __m128i y_c, base_y_c_reg, mask;
+ r6 = _mm_set1_epi16(r << 6);
+ y_c = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy_reg));
+ base_y_c_reg = _mm_srai_epi16(y_c, frac_bits_y);
+ mask = _mm_cmpgt_epi16(min_y_base, base_y_c_reg);
+ base_y_c_reg = _mm_andnot_si128(mask, base_y_c_reg);
+ _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);
+
+ a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]],
+ left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]]);
+ base_y_c_reg = _mm_add_epi16(base_y_c_reg, _mm_srli_epi16(a16, 4));
+ _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);
+
+ a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]],
+ left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]]);
+
+ if (upsample_left) {
+ shift = _mm_srli_epi16(
+ _mm_and_si128(_mm_slli_epi16(y_c, upsample_left), c3f), 1);
+ } else {
+ shift = _mm_srli_epi16(_mm_and_si128(y_c, c3f), 1);
+ }
+
+ diff = _mm_sub_epi16(a1_y, a0_y); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0_y, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shift);
+ res1 = _mm_add_epi16(a32, b);
+ res1 = _mm_srli_epi16(res1, 5);
+
+ resy = _mm_packus_epi16(res1, res1);
+ resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]);
+ _mm_storel_epi64((__m128i *)dst, resxy);
+ } else {
+ _mm_storel_epi64((__m128i *)dst, resx);
+ }
+
+ dst += stride;
+ }
+}
+
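+// In the HxW zone-2 kernel the left-edge samples needed by one 16-column
+// span are usually close together: when their index range fits in 16 bytes
+// (offset_diff < 16 below) they are gathered with a single pshufb from two
+// overlapping loads instead of per-lane scalar lookups.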
+static void dr_prediction_z2_HxW_sse4_1(int H, int W, uint8_t *dst,
+ ptrdiff_t stride, const uint8_t *above,
+ const uint8_t *left, int upsample_above,
+ int upsample_left, int dx, int dy) {
+ // here upsample_above and upsample_left are 0 by design of
+ // av1_use_intra_edge_upsample
+ const int min_base_x = -1;
+ const int min_base_y = -1;
+ (void)upsample_above;
+ (void)upsample_left;
+ const int frac_bits_x = 6;
+ const int frac_bits_y = 6;
+
+ __m128i a0_x, a1_x, a0_y, a1_y, a0_y_h, a1_y_h, a32;
+ __m128i diff, shifty, shifty_h;
+ __m128i a0_above, a1_above;
+
+ DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
+ const __m128i a16 = _mm_set1_epi16(16);
+ const __m128i c1 = _mm_srli_epi16(a16, 4);
+ const __m128i min_y_base = _mm_set1_epi16(min_base_y);
+ const __m128i c3f = _mm_set1_epi16(0x3f);
+ const __m128i dy256 = _mm_set1_epi16(dy);
+ const __m128i c0123 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
+ const __m128i c0123_h = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
+ const __m128i c1234 = _mm_add_epi16(c0123, c1);
+ const __m128i c1234_h = _mm_add_epi16(c0123_h, c1);
+
+ for (int r = 0; r < H; r++) {
+ __m128i b, res, res1, shift, reg_j, r6, ydx;
+ __m128i resx, resy;
+ __m128i resxy;
+ int y = r + 1;
+ ydx = _mm_set1_epi16((int16_t)(y * dx));
+
+ int base_x = (-y * dx) >> frac_bits_x;
+ for (int j = 0; j < W; j += 16) {
+ reg_j = _mm_set1_epi16(j);
+ int base_shift = 0;
+ if ((base_x + j) < (min_base_x - 1)) {
+ base_shift = (min_base_x - (base_x + j) - 1);
+ }
+ int base_min_diff = (min_base_x - base_x - j);
+ if (base_min_diff > 16) {
+ base_min_diff = 16;
+ } else {
+ if (base_min_diff < 0) base_min_diff = 0;
+ }
+
+ if (base_shift < 16) {
+ a0_above =
+ _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
+ a1_above =
+ _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
+ a0_above =
+ _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]);
+ a1_above =
+ _mm_shuffle_epi8(a1_above, *(__m128i *)LoadMaskx[base_shift]);
+
+ a0_x = _mm_cvtepu8_epi16(a0_above);
+ a1_x = _mm_cvtepu8_epi16(a1_above);
+
+ r6 = _mm_slli_epi16(_mm_add_epi16(c0123, reg_j), 6);
+ shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
+
+ diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shift);
+ res = _mm_add_epi16(a32, b);
+ res = _mm_srli_epi16(res, 5); // 16 16-bit values
+
+ a0_x = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
+ a1_x = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));
+
+ r6 = _mm_slli_epi16(_mm_add_epi16(c0123_h, reg_j), 6);
+ shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
+
+ diff = _mm_sub_epi16(a1_x, a0_x); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0_x, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shift);
+ res1 = _mm_add_epi16(a32, b);
+ res1 = _mm_srli_epi16(res1, 5); // 16 16-bit values
+
+ resx = _mm_packus_epi16(res, res1);
+ } else {
+ resx = _mm_setzero_si128();
+ }
+
+ // y calc
+ if (base_x < min_base_x) {
+ __m128i c_reg, c_reg_h, y_reg, y_reg_h, base_y, base_y_h;
+ __m128i mask, mask_h, mul16, mul16_h;
+ r6 = _mm_set1_epi16(r << 6);
+ c_reg = _mm_add_epi16(reg_j, c1234);
+ c_reg_h = _mm_add_epi16(reg_j, c1234_h);
+ mul16 = _mm_min_epu16(_mm_mullo_epi16(c_reg, dy256),
+ _mm_srli_epi16(min_y_base, 1));
+ mul16_h = _mm_min_epu16(_mm_mullo_epi16(c_reg_h, dy256),
+ _mm_srli_epi16(min_y_base, 1));
+ y_reg = _mm_sub_epi16(r6, mul16);
+ y_reg_h = _mm_sub_epi16(r6, mul16_h);
+
+ base_y = _mm_srai_epi16(y_reg, frac_bits_y);
+ base_y_h = _mm_srai_epi16(y_reg_h, frac_bits_y);
+ mask = _mm_cmpgt_epi16(min_y_base, base_y);
+ mask_h = _mm_cmpgt_epi16(min_y_base, base_y_h);
+
+ base_y = _mm_blendv_epi8(base_y, min_y_base, mask);
+ base_y_h = _mm_blendv_epi8(base_y_h, min_y_base, mask_h);
+ int16_t min_y = (int16_t)_mm_extract_epi16(base_y_h, 7);
+ int16_t max_y = (int16_t)_mm_extract_epi16(base_y, 0);
+ int16_t offset_diff = max_y - min_y;
+
+ if (offset_diff < 16) {
+ __m128i min_y_reg = _mm_set1_epi16(min_y);
+
+ __m128i base_y_offset = _mm_sub_epi16(base_y, min_y_reg);
+ __m128i base_y_offset_h = _mm_sub_epi16(base_y_h, min_y_reg);
+ __m128i y_offset = _mm_packs_epi16(base_y_offset, base_y_offset_h);
+
+ __m128i a0_mask = _mm_loadu_si128((__m128i *)(left + min_y));
+ __m128i a1_mask = _mm_loadu_si128((__m128i *)(left + min_y + 1));
+ __m128i LoadMask =
+ _mm_loadu_si128((__m128i *)(LoadMaskz2[offset_diff / 4]));
+
+ a0_mask = _mm_and_si128(a0_mask, LoadMask);
+ a1_mask = _mm_and_si128(a1_mask, LoadMask);
+
+ a0_mask = _mm_shuffle_epi8(a0_mask, y_offset);
+ a1_mask = _mm_shuffle_epi8(a1_mask, y_offset);
+ a0_y = _mm_cvtepu8_epi16(a0_mask);
+ a1_y = _mm_cvtepu8_epi16(a1_mask);
+ a0_y_h = _mm_cvtepu8_epi16(_mm_srli_si128(a0_mask, 8));
+ a1_y_h = _mm_cvtepu8_epi16(_mm_srli_si128(a1_mask, 8));
+ } else {
+ base_y = _mm_andnot_si128(mask, base_y);
+ base_y_h = _mm_andnot_si128(mask_h, base_y_h);
+ _mm_store_si128((__m128i *)base_y_c, base_y);
+ _mm_store_si128((__m128i *)&base_y_c[8], base_y_h);
+
+ a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]],
+ left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]]);
+ a0_y_h = _mm_setr_epi16(left[base_y_c[8]], left[base_y_c[9]],
+ left[base_y_c[10]], left[base_y_c[11]],
+ left[base_y_c[12]], left[base_y_c[13]],
+ left[base_y_c[14]], left[base_y_c[15]]);
+ base_y = _mm_add_epi16(base_y, c1);
+ base_y_h = _mm_add_epi16(base_y_h, c1);
+ _mm_store_si128((__m128i *)base_y_c, base_y);
+ _mm_store_si128((__m128i *)&base_y_c[8], base_y_h);
+
+ a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
+ left[base_y_c[2]], left[base_y_c[3]],
+ left[base_y_c[4]], left[base_y_c[5]],
+ left[base_y_c[6]], left[base_y_c[7]]);
+ a1_y_h = _mm_setr_epi16(left[base_y_c[8]], left[base_y_c[9]],
+ left[base_y_c[10]], left[base_y_c[11]],
+ left[base_y_c[12]], left[base_y_c[13]],
+ left[base_y_c[14]], left[base_y_c[15]]);
+ }
+ shifty = _mm_srli_epi16(_mm_and_si128(y_reg, c3f), 1);
+ shifty_h = _mm_srli_epi16(_mm_and_si128(y_reg_h, c3f), 1);
+
+ diff = _mm_sub_epi16(a1_y, a0_y); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0_y, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shifty);
+ res = _mm_add_epi16(a32, b);
+ res = _mm_srli_epi16(res, 5); // 16 16-bit values
+
+ diff = _mm_sub_epi16(a1_y_h, a0_y_h); // a[x+1] - a[x]
+ a32 = _mm_slli_epi16(a0_y_h, 5); // a[x] * 32
+ a32 = _mm_add_epi16(a32, a16); // a[x] * 32 + 16
+
+ b = _mm_mullo_epi16(diff, shifty_h);
+ res1 = _mm_add_epi16(a32, b);
+ res1 = _mm_srli_epi16(res1, 5); // 16 16-bit values
+ resy = _mm_packus_epi16(res, res1);
+ } else {
+ resy = _mm_setzero_si128();
+ }
+ resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]);
+ _mm_storeu_si128((__m128i *)(dst + j), resxy);
+ } // for j
+ dst += stride;
+ }
+}
+
+// Directional prediction, zone 2: 90 < angle < 180
+void av1_dr_prediction_z2_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_above, int upsample_left, int dx,
+ int dy) {
+ assert(dx > 0);
+ assert(dy > 0);
+ switch (bw) {
+ case 4:
+ dr_prediction_z2_Nx4_sse4_1(bh, dst, stride, above, left, upsample_above,
+ upsample_left, dx, dy);
+ break;
+ case 8:
+ dr_prediction_z2_Nx8_sse4_1(bh, dst, stride, above, left, upsample_above,
+ upsample_left, dx, dy);
+ break;
+ default:
+ dr_prediction_z2_HxW_sse4_1(bh, bw, dst, stride, above, left,
+ upsample_above, upsample_left, dx, dy);
+ }
+ return;
+}
+
+// z3 functions
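+// Zone 3 (180 < angle < 270) reuses the zone-1 kernels: the prediction is
+// run along the left edge as if it were the above edge, and the result is
+// then transposed into the destination block.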
+static void dr_prediction_z3_4x4_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[4], d[4];
+
+ dr_prediction_z1_HxW_internal_sse4_1(4, 4, dstvec, left, upsample_left, dy);
+ transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
+ &d[0], &d[1], &d[2], &d[3]);
+
+ *(int *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]);
+ *(int *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]);
+ *(int *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]);
+ *(int *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]);
+ return;
+}
+
+static void dr_prediction_z3_8x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[8], d[8];
+
+ dr_prediction_z1_HxW_internal_sse4_1(8, 8, dstvec, left, upsample_left, dy);
+ transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4],
+ &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2],
+ &d[3]);
+
+ _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
+ _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8));
+ _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]);
+ _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8));
+ _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]);
+ _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8));
+ _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]);
+ _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8));
+}
+
+static void dr_prediction_z3_4x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[4], d[8];
+
+ dr_prediction_z1_HxW_internal_sse4_1(8, 4, dstvec, left, upsample_left, dy);
+ transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0],
+ &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
+ for (int i = 0; i < 8; i++) {
+ *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
+ }
+}
+
+static void dr_prediction_z3_8x4_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[8], d[4];
+
+ dr_prediction_z1_HxW_internal_sse4_1(4, 8, dstvec, left, upsample_left, dy);
+ transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
+ &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0],
+ &d[1], &d[2], &d[3]);
+ _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
+ _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
+ _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
+ _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
+}
+
+static void dr_prediction_z3_8x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[8], d[8];
+
+ dr_prediction_z1_HxW_internal_sse4_1(16, 8, dstvec, left, upsample_left, dy);
+ transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3,
+ dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d,
+ d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7);
+ for (int i = 0; i < 8; i++) {
+ _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
+ _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
+ _mm_srli_si128(d[i], 8));
+ }
+}
+
+static void dr_prediction_z3_16x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[16], d[16];
+
+ dr_prediction_z1_HxW_internal_sse4_1(8, 16, dstvec, left, upsample_left, dy);
+ transpose16x8_8x16_sse2(
+ &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
+ &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
+ &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
+ &d[3], &d[4], &d[5], &d[6], &d[7]);
+
+ for (int i = 0; i < 8; i++) {
+ _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+ }
+}
+
+static void dr_prediction_z3_4x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[4], d[16];
+
+ dr_prediction_z1_HxW_internal_sse4_1(16, 4, dstvec, left, upsample_left, dy);
+ transpose4x16_sse2(dstvec, d);
+ for (int i = 0; i < 16; i++) {
+ *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
+ }
+}
+
+static void dr_prediction_z3_16x4_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[16], d[8];
+
+ dr_prediction_z1_HxW_internal_sse4_1(4, 16, dstvec, left, upsample_left, dy);
+ for (int i = 4; i < 8; i++) {
+ d[i] = _mm_setzero_si128();
+ }
+ transpose16x8_8x16_sse2(
+ &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
+ &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
+ &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
+ &d[3], &d[4], &d[5], &d[6], &d[7]);
+
+ for (int i = 0; i < 4; i++) {
+ _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+ }
+}
+
+static void dr_prediction_z3_8x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[16], d[16], dstvec_h[16], d_h[16];
+
+ dr_prediction_z1_32xN_internal_sse4_1(8, dstvec, dstvec_h, left,
+ upsample_left, dy);
+ for (int i = 8; i < 16; i++) {
+ dstvec[i] = _mm_setzero_si128();
+ dstvec_h[i] = _mm_setzero_si128();
+ }
+ transpose16x16_sse2(dstvec, d);
+ transpose16x16_sse2(dstvec_h, d_h);
+
+ for (int i = 0; i < 16; i++) {
+ _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
+ }
+ for (int i = 0; i < 16; i++) {
+ _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride), d_h[i]);
+ }
+}
+
+static void dr_prediction_z3_32x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int upsample_left,
+ int dy) {
+ __m128i dstvec[32], d[16];
+
+ dr_prediction_z1_HxW_internal_sse4_1(8, 32, dstvec, left, upsample_left, dy);
+
+ transpose16x8_8x16_sse2(
+ &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
+ &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
+ &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
+ &d[3], &d[4], &d[5], &d[6], &d[7]);
+ transpose16x8_8x16_sse2(
+ &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16],
+ &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16],
+ &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16],
+ &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16],
+ &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8],
+ &d[6 + 8], &d[7 + 8]);
+
+ for (int i = 0; i < 8; i++) {
+ _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+ _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]);
+ }
+}
+
+static void dr_prediction_z3_16x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left,
+ int upsample_left, int dy) {
+ __m128i dstvec[16], d[16];
+
+ dr_prediction_z1_HxW_internal_sse4_1(16, 16, dstvec, left, upsample_left, dy);
+ transpose16x16_sse2(dstvec, d);
+
+ for (int i = 0; i < 16; i++) {
+ _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
+ }
+}
+
+static void dr_prediction_z3_32x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left,
+ int upsample_left, int dy) {
+ __m128i dstvec[32], d[32], dstvec_h[32], d_h[32];
+
+ dr_prediction_z1_32xN_internal_sse4_1(32, dstvec, dstvec_h, left,
+ upsample_left, dy);
+ transpose16x16_sse2(dstvec, d);
+ transpose16x16_sse2(dstvec_h, d_h);
+ transpose16x16_sse2(dstvec + 16, d + 16);
+ transpose16x16_sse2(dstvec_h + 16, d_h + 16);
+ for (int j = 0; j < 16; j++) {
+ _mm_storeu_si128((__m128i *)(dst + j * stride), d[j]);
+ _mm_storeu_si128((__m128i *)(dst + j * stride + 16), d[j + 16]);
+ }
+ for (int j = 0; j < 16; j++) {
+ _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), d_h[j]);
+ _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16), d_h[j + 16]);
+ }
+}
+
+static void dr_prediction_z3_64x64_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left,
+ int upsample_left, int dy) {
+ uint8_t dstT[64 * 64];
+ dr_prediction_z1_64xN_sse4_1(64, dstT, 64, left, upsample_left, dy);
+ transpose(dstT, 64, dst, stride, 64, 64);
+}
+
+static void dr_prediction_z3_16x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left,
+ int upsample_left, int dy) {
+ __m128i dstvec[16], d[16], dstvec_h[16], d_h[16];
+
+ dr_prediction_z1_32xN_internal_sse4_1(16, dstvec, dstvec_h, left,
+ upsample_left, dy);
+ transpose16x16_sse2(dstvec, d);
+ transpose16x16_sse2(dstvec_h, d_h);
+ // store
+ for (int j = 0; j < 16; j++) {
+ _mm_storeu_si128((__m128i *)(dst + j * stride), d[j]);
+ _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), d_h[j]);
+ }
+}
+
+static void dr_prediction_z3_32x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left,
+ int upsample_left, int dy) {
+ __m128i dstvec[32], d[16];
+
+ dr_prediction_z1_HxW_internal_sse4_1(16, 32, dstvec, left, upsample_left, dy);
+ for (int i = 0; i < 32; i += 16) {
+ transpose16x16_sse2((dstvec + i), d);
+ for (int j = 0; j < 16; j++) {
+ _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
+ }
+ }
+}
+
+static void dr_prediction_z3_32x64_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left,
+ int upsample_left, int dy) {
+ uint8_t dstT[64 * 32];
+ dr_prediction_z1_64xN_sse4_1(32, dstT, 64, left, upsample_left, dy);
+ transpose(dstT, 64, dst, stride, 32, 64);
+}
+
+static void dr_prediction_z3_64x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left,
+ int upsample_left, int dy) {
+ uint8_t dstT[32 * 64];
+ dr_prediction_z1_32xN_sse4_1(64, dstT, 32, left, upsample_left, dy);
+ transpose(dstT, 32, dst, stride, 64, 32);
+ return;
+}
+
+static void dr_prediction_z3_16x64_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left,
+ int upsample_left, int dy) {
+ uint8_t dstT[64 * 16];
+ dr_prediction_z1_64xN_sse4_1(16, dstT, 64, left, upsample_left, dy);
+ transpose(dstT, 64, dst, stride, 16, 64);
+}
+
+static void dr_prediction_z3_64x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left,
+ int upsample_left, int dy) {
+ __m128i dstvec[64], d[16];
+
+ dr_prediction_z1_HxW_internal_sse4_1(16, 64, dstvec, left, upsample_left, dy);
+ for (int i = 0; i < 64; i += 16) {
+ transpose16x16_sse2(dstvec + i, d);
+ for (int j = 0; j < 16; j++) {
+ _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
+ }
+ }
+}
+
+void av1_dr_prediction_z3_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
+ const uint8_t *above, const uint8_t *left,
+ int upsample_left, int dx, int dy) {
+ (void)above;
+ (void)dx;
+ assert(dx == 1);
+ assert(dy > 0);
+
+ if (bw == bh) {
+ switch (bw) {
+ case 4:
+ dr_prediction_z3_4x4_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ case 8:
+ dr_prediction_z3_8x8_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ case 16:
+ dr_prediction_z3_16x16_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ case 32:
+ dr_prediction_z3_32x32_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ case 64:
+ dr_prediction_z3_64x64_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ default: assert(0 && "Invalid block size");
+ }
+ } else {
+ if (bw < bh) {
+ if (bw + bw == bh) {
+ switch (bw) {
+ case 4:
+ dr_prediction_z3_4x8_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ case 8:
+ dr_prediction_z3_8x16_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ case 16:
+ dr_prediction_z3_16x32_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ case 32:
+ dr_prediction_z3_32x64_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ default: assert(0 && "Invalid block size");
+ }
+ } else {
+ switch (bw) {
+ case 4:
+ dr_prediction_z3_4x16_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ case 8:
+ dr_prediction_z3_8x32_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ case 16:
+ dr_prediction_z3_16x64_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ default: assert(0 && "Invalid block size");
+ }
+ }
+ } else {
+ if (bh + bh == bw) {
+ switch (bh) {
+ case 4:
+ dr_prediction_z3_8x4_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ case 8:
+ dr_prediction_z3_16x8_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ case 16:
+ dr_prediction_z3_32x16_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ case 32:
+ dr_prediction_z3_64x32_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ default: assert(0 && "Invalid block size");
+ }
+ } else {
+ switch (bh) {
+ case 4:
+ dr_prediction_z3_16x4_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ case 8:
+ dr_prediction_z3_32x8_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ case 16:
+ dr_prediction_z3_64x16_sse4_1(dst, stride, left, upsample_left, dy);
+ break;
+ default: assert(0 && "Invalid block size");
+ }
+ }
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_ssse3.c b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c
new file mode 100644
index 0000000000..fd48260c6f
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/intrapred_ssse3.c
@@ -0,0 +1,2997 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/intrapred_common.h"
+
+// -----------------------------------------------------------------------------
+// PAETH_PRED
+
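+// The Paeth predictor picks, per pixel, whichever of left, top, or top-left
+// is closest to base = left + top - topleft (preferring left, then top, then
+// top-left on ties). The helpers below implement that rule branchlessly with
+// 16-bit compares and masks.
+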
+// Return 8 16-bit pixels in one row
+static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top,
+ const __m128i *topleft) {
+ const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft);
+
+ __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left));
+ __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top));
+ __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft));
+
+ __m128i mask1 = _mm_cmpgt_epi16(pl, pt);
+ mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl));
+ __m128i mask2 = _mm_cmpgt_epi16(pt, ptl);
+
+ pl = _mm_andnot_si128(mask1, *left);
+
+ ptl = _mm_and_si128(mask2, *topleft);
+ pt = _mm_andnot_si128(mask2, *top);
+ pt = _mm_or_si128(pt, ptl);
+ pt = _mm_and_si128(mask1, pt);
+
+ return _mm_or_si128(pl, pt);
+}
+
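+// Each row broadcasts one byte of `left` into every 16-bit lane: `rep`
+// holds the byte pair {i, 0x80}, so pshufb copies left[i] into the low byte
+// of each lane and zeroes the high byte; adding 1 per row advances i.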
+void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i l = _mm_loadl_epi64((const __m128i *)left);
+ const __m128i t = _mm_loadl_epi64((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i t16 = _mm_unpacklo_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 4; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+ *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i l = _mm_loadl_epi64((const __m128i *)left);
+ const __m128i t = _mm_loadl_epi64((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i t16 = _mm_unpacklo_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 8; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+ *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i l = _mm_load_si128((const __m128i *)left);
+ const __m128i t = _mm_cvtsi32_si128(((const int *)above)[0]);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i t16 = _mm_unpacklo_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (int i = 0; i < 16; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+ *(int *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i l = _mm_loadl_epi64((const __m128i *)left);
+ const __m128i t = _mm_loadl_epi64((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i t16 = _mm_unpacklo_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 4; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+ _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i l = _mm_loadl_epi64((const __m128i *)left);
+ const __m128i t = _mm_loadl_epi64((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i t16 = _mm_unpacklo_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 8; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+ _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i l = _mm_load_si128((const __m128i *)left);
+ const __m128i t = _mm_loadl_epi64((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i t16 = _mm_unpacklo_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+ _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i t = _mm_loadl_epi64((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i t16 = _mm_unpacklo_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (int j = 0; j < 2; ++j) {
+ const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ for (int i = 0; i < 16; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);
+
+ _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+ }
+}
+
+// Return 16 8-bit pixels in one row
+static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
+ const __m128i *top1,
+ const __m128i *topleft) {
+ const __m128i p0 = paeth_8x1_pred(left, top0, topleft);
+ const __m128i p1 = paeth_8x1_pred(left, top1, topleft);
+ return _mm_packus_epi16(p0, p1);
+}
+
+void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i l = _mm_cvtsi32_si128(((const int *)left)[0]);
+ const __m128i t = _mm_load_si128((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top0 = _mm_unpacklo_epi8(t, zero);
+ const __m128i top1 = _mm_unpackhi_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (int i = 0; i < 4; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i l = _mm_loadl_epi64((const __m128i *)left);
+ const __m128i t = _mm_load_si128((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top0 = _mm_unpacklo_epi8(t, zero);
+ const __m128i top1 = _mm_unpackhi_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 8; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i l = _mm_load_si128((const __m128i *)left);
+ const __m128i t = _mm_load_si128((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top0 = _mm_unpacklo_epi8(t, zero);
+ const __m128i top1 = _mm_unpackhi_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i l = _mm_load_si128((const __m128i *)left);
+ const __m128i t = _mm_load_si128((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top0 = _mm_unpacklo_epi8(t, zero);
+ const __m128i top1 = _mm_unpackhi_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i l16;
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+
+ l = _mm_load_si128((const __m128i *)(left + 16));
+ rep = _mm_set1_epi16((short)0x8000);
+ for (i = 0; i < 16; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
+
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const __m128i t = _mm_load_si128((const __m128i *)above);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top0 = _mm_unpacklo_epi8(t, zero);
+ const __m128i top1 = _mm_unpackhi_epi8(t, zero);
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (int j = 0; j < 4; ++j) {
+ const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ for (int i = 0; i < 16; ++i) {
+ const __m128i l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
+ _mm_store_si128((__m128i *)dst, row);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+ }
+}
+
+void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i a = _mm_load_si128((const __m128i *)above);
+ const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i al = _mm_unpacklo_epi8(a, zero);
+ const __m128i ah = _mm_unpackhi_epi8(a, zero);
+ const __m128i bl = _mm_unpacklo_epi8(b, zero);
+ const __m128i bh = _mm_unpackhi_epi8(b, zero);
+
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i l = _mm_loadl_epi64((const __m128i *)left);
+ __m128i l16;
+
+ for (int i = 0; i < 8; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+ const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+
+ _mm_store_si128((__m128i *)dst, r32l);
+ _mm_store_si128((__m128i *)(dst + 16), r32h);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const __m128i a = _mm_load_si128((const __m128i *)above);
+ const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i al = _mm_unpacklo_epi8(a, zero);
+ const __m128i ah = _mm_unpackhi_epi8(a, zero);
+ const __m128i bl = _mm_unpacklo_epi8(b, zero);
+ const __m128i bh = _mm_unpackhi_epi8(b, zero);
+
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i l = _mm_load_si128((const __m128i *)left);
+ __m128i l16;
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+ const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+
+ _mm_store_si128((__m128i *)dst, r32l);
+ _mm_store_si128((__m128i *)(dst + 16), r32h);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const __m128i a = _mm_load_si128((const __m128i *)above);
+ const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i al = _mm_unpacklo_epi8(a, zero);
+ const __m128i ah = _mm_unpackhi_epi8(a, zero);
+ const __m128i bl = _mm_unpacklo_epi8(b, zero);
+ const __m128i bh = _mm_unpackhi_epi8(b, zero);
+
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i l = _mm_load_si128((const __m128i *)left);
+ __m128i l16;
+
+ int i;
+ for (i = 0; i < 16; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+ const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+
+ _mm_store_si128((__m128i *)dst, r32l);
+ _mm_store_si128((__m128i *)(dst + 16), r32h);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+
+ rep = _mm_set1_epi16((short)0x8000);
+ l = _mm_load_si128((const __m128i *)(left + 16));
+ for (i = 0; i < 16; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+ const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+
+ _mm_store_si128((__m128i *)dst, r32l);
+ _mm_store_si128((__m128i *)(dst + 16), r32h);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const __m128i a = _mm_load_si128((const __m128i *)above);
+ const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i al = _mm_unpacklo_epi8(a, zero);
+ const __m128i ah = _mm_unpackhi_epi8(a, zero);
+ const __m128i bl = _mm_unpacklo_epi8(b, zero);
+ const __m128i bh = _mm_unpackhi_epi8(b, zero);
+
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i l16;
+
+ int i, j;
+ for (j = 0; j < 4; ++j) {
+ const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ for (i = 0; i < 16; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+ const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+
+ _mm_store_si128((__m128i *)dst, r32l);
+ _mm_store_si128((__m128i *)(dst + 16), r32h);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+ }
+}
+
+void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const __m128i a = _mm_load_si128((const __m128i *)above);
+ const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
+ const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i al = _mm_unpacklo_epi8(a, zero);
+ const __m128i ah = _mm_unpackhi_epi8(a, zero);
+ const __m128i bl = _mm_unpacklo_epi8(b, zero);
+ const __m128i bh = _mm_unpackhi_epi8(b, zero);
+ const __m128i cl = _mm_unpacklo_epi8(c, zero);
+ const __m128i ch = _mm_unpackhi_epi8(c, zero);
+ const __m128i dl = _mm_unpacklo_epi8(d, zero);
+ const __m128i dh = _mm_unpackhi_epi8(d, zero);
+
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i l16;
+
+ int i, j;
+ for (j = 0; j < 2; ++j) {
+ const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ for (i = 0; i < 16; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+ const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+ const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
+ const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
+
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r1);
+ _mm_store_si128((__m128i *)(dst + 32), r2);
+ _mm_store_si128((__m128i *)(dst + 48), r3);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+ }
+}
+
+void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const __m128i a = _mm_load_si128((const __m128i *)above);
+ const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
+ const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i al = _mm_unpacklo_epi8(a, zero);
+ const __m128i ah = _mm_unpackhi_epi8(a, zero);
+ const __m128i bl = _mm_unpacklo_epi8(b, zero);
+ const __m128i bh = _mm_unpackhi_epi8(b, zero);
+ const __m128i cl = _mm_unpacklo_epi8(c, zero);
+ const __m128i ch = _mm_unpackhi_epi8(c, zero);
+ const __m128i dl = _mm_unpacklo_epi8(d, zero);
+ const __m128i dh = _mm_unpackhi_epi8(d, zero);
+
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i l16;
+
+ int i, j;
+ for (j = 0; j < 4; ++j) {
+ const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ for (i = 0; i < 16; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+ const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+ const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
+ const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
+
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r1);
+ _mm_store_si128((__m128i *)(dst + 32), r2);
+ _mm_store_si128((__m128i *)(dst + 48), r3);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+ }
+}
+
+void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ const __m128i a = _mm_load_si128((const __m128i *)above);
+ const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
+ const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
+ const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i al = _mm_unpacklo_epi8(a, zero);
+ const __m128i ah = _mm_unpackhi_epi8(a, zero);
+ const __m128i bl = _mm_unpacklo_epi8(b, zero);
+ const __m128i bh = _mm_unpackhi_epi8(b, zero);
+ const __m128i cl = _mm_unpacklo_epi8(c, zero);
+ const __m128i ch = _mm_unpackhi_epi8(c, zero);
+ const __m128i dl = _mm_unpacklo_epi8(d, zero);
+ const __m128i dh = _mm_unpackhi_epi8(d, zero);
+
+ const __m128i tl16 = _mm_set1_epi16((int16_t)above[-1]);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i l16;
+
+ int i;
+ const __m128i l = _mm_load_si128((const __m128i *)left);
+ __m128i rep = _mm_set1_epi16((short)0x8000);
+ for (i = 0; i < 16; ++i) {
+ l16 = _mm_shuffle_epi8(l, rep);
+ const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
+ const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
+ const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
+ const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);
+
+ _mm_store_si128((__m128i *)dst, r0);
+ _mm_store_si128((__m128i *)(dst + 16), r1);
+ _mm_store_si128((__m128i *)(dst + 32), r2);
+ _mm_store_si128((__m128i *)(dst + 48), r3);
+ dst += stride;
+ rep = _mm_add_epi16(rep, one);
+ }
+}
+
+// -----------------------------------------------------------------------------
+// SMOOTH_PRED
+
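+// Scalar reference (aom_dsp/intrapred.c, smooth_predictor):
+//   pred[r][c] = (w_h[r] * above[c] + (256 - w_h[r]) * left[bh - 1] +
+//                 w_w[c] * left[r] + (256 - w_w[c]) * above[bw - 1] + 256) >> 9
+// where (1 << SMOOTH_WEIGHT_LOG2_SCALE) == 256. The helpers below interleave
+// the pixel/weight pairs so each output row reduces to a few pmaddwd ops.
+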
+// pixels[0]: above and below_pred interleave vector
+// pixels[1]: left vector
+// pixels[2]: right_pred vector
+static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
+ int height, __m128i *pixels) {
+ __m128i d = _mm_cvtsi32_si128(((const int *)above)[0]);
+ if (height == 4)
+ pixels[1] = _mm_cvtsi32_si128(((const int *)left)[0]);
+ else if (height == 8)
+ pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
+ else
+ pixels[1] = _mm_loadu_si128(((const __m128i *)left));
+
+ pixels[2] = _mm_set1_epi16((int16_t)above[3]);
+
+ const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]);
+ const __m128i zero = _mm_setzero_si128();
+ d = _mm_unpacklo_epi8(d, zero);
+ pixels[0] = _mm_unpacklo_epi16(d, bp);
+}
+
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], second half for height = 16 only
+// weight_h[3]: same as [1], second half for height = 16 only
+// weight_w[0]: weights_w and scale - weights_w interleave vector
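+//
+// smooth_weights packs the weight rows for block sizes 4, 8, 16, 32 and 64
+// back to back, so the weights for size n start at byte offset n - 4
+// (0, 4, 12, 28, 60).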
+static INLINE void load_weight_w4(int height, __m128i *weight_h,
+ __m128i *weight_w) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+ const __m128i t = _mm_cvtsi32_si128(((const int *)smooth_weights)[0]);
+ weight_h[0] = _mm_unpacklo_epi8(t, zero);
+ weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+ weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
+
+ if (height == 8) {
+ const __m128i weight = _mm_loadl_epi64((const __m128i *)&smooth_weights[4]);
+ weight_h[0] = _mm_unpacklo_epi8(weight, zero);
+ weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+ } else if (height == 16) {
+ const __m128i weight =
+ _mm_loadu_si128((const __m128i *)&smooth_weights[12]);
+ weight_h[0] = _mm_unpacklo_epi8(weight, zero);
+ weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+ weight_h[2] = _mm_unpackhi_epi8(weight, zero);
+ weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+ }
+}
+
+static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh,
+ const __m128i *ww, int h, uint8_t *dst,
+ ptrdiff_t stride, int second_half) {
+ const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE));
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i inc = _mm_set1_epi16(0x202);
+ const __m128i gat = _mm_set1_epi32(0xc080400);
+ __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
+ : _mm_set1_epi16((short)0x8000);
+ __m128i d = _mm_set1_epi16(0x100);
+
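+ // |d| is a pshufb selector for the current row's 16-bit height weight in
+ // wh[0]/wh[1] (element 0 first, advanced by |inc| each row); |rep| selects
+ // the current left pixel from pixel[1]; |gat| gathers byte 0 of each 32-bit
+ // sum so the four output pixels land in the low dword of |sum|.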
+ for (int i = 0; i < h; ++i) {
+ const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
+ const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
+ const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
+ __m128i s = _mm_madd_epi16(pixel[0], wh_sc);
+
+ __m128i b = _mm_shuffle_epi8(pixel[1], rep);
+ b = _mm_unpacklo_epi16(b, pixel[2]);
+ __m128i sum = _mm_madd_epi16(b, ww[0]);
+
+ sum = _mm_add_epi32(s, sum);
+ sum = _mm_add_epi32(sum, round);
+ sum = _mm_srai_epi32(sum, 1 + SMOOTH_WEIGHT_LOG2_SCALE);
+
+ sum = _mm_shuffle_epi8(sum, gat);
+ *(int *)dst = _mm_cvtsi128_si32(sum);
+ dst += stride;
+
+ rep = _mm_add_epi16(rep, one);
+ d = _mm_add_epi16(d, inc);
+ }
+}
+
+void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i pixels[3];
+ load_pixel_w4(above, left, 4, pixels);
+
+ __m128i wh[4], ww[2];
+ load_weight_w4(4, wh, ww);
+
+ smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
+}
+
+void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i pixels[3];
+ load_pixel_w4(above, left, 8, pixels);
+
+ __m128i wh[4], ww[2];
+ load_weight_w4(8, wh, ww);
+
+ smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
+}
+
+void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i pixels[3];
+ load_pixel_w4(above, left, 16, pixels);
+
+ __m128i wh[4], ww[2];
+ load_weight_w4(16, wh, ww);
+
+ smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
+ dst += stride << 3;
+ smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
+}
+
+// pixels[0]: above and below_pred interleave vector, first half
+// pixels[1]: above and below_pred interleave vector, second half
+// pixels[2]: left vector
+// pixels[3]: right_pred vector
+// pixels[4]: above and below_pred interleave vector, first half (height 32)
+// pixels[5]: above and below_pred interleave vector, second half (height 32)
+// pixels[6]: left vector + 16 (height 32)
+// pixels[7]: right_pred vector (height 32)
+static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
+ int height, __m128i *pixels) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bp = _mm_set1_epi16((int16_t)left[height - 1]);
+ __m128i d = _mm_loadl_epi64((const __m128i *)above);
+ d = _mm_unpacklo_epi8(d, zero);
+ pixels[0] = _mm_unpacklo_epi16(d, bp);
+ pixels[1] = _mm_unpackhi_epi16(d, bp);
+
+ pixels[3] = _mm_set1_epi16((int16_t)above[7]);
+
+ if (height == 4) {
+ pixels[2] = _mm_cvtsi32_si128(((const int *)left)[0]);
+ } else if (height == 8) {
+ pixels[2] = _mm_loadl_epi64((const __m128i *)left);
+ } else if (height == 16) {
+ pixels[2] = _mm_load_si128((const __m128i *)left);
+ } else {
+ pixels[2] = _mm_load_si128((const __m128i *)left);
+ pixels[4] = pixels[0];
+ pixels[5] = pixels[1];
+ pixels[6] = _mm_load_si128((const __m128i *)(left + 16));
+ pixels[7] = pixels[3];
+ }
+}
+
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], offset 8
+// weight_h[3]: same as [1], offset 8
+// weight_h[4]: same as [0], offset 16
+// weight_h[5]: same as [1], offset 16
+// weight_h[6]: same as [0], offset 24
+// weight_h[7]: same as [1], offset 24
+// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
+// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
+static INLINE void load_weight_w8(int height, __m128i *weight_h,
+ __m128i *weight_w) {
+ const __m128i zero = _mm_setzero_si128();
+ const int we_offset = height < 8 ? 0 : 4;
+ __m128i we = _mm_loadu_si128((const __m128i *)&smooth_weights[we_offset]);
+ weight_h[0] = _mm_unpacklo_epi8(we, zero);
+ const __m128i d = _mm_set1_epi16((int16_t)(1 << SMOOTH_WEIGHT_LOG2_SCALE));
+ weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+
+ if (height == 4) {
+ we = _mm_srli_si128(we, 4);
+ __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
+ __m128i tmp2 = _mm_sub_epi16(d, tmp1);
+ weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
+ weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
+ } else {
+ weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
+ weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
+ }
+
+ if (height == 16) {
+ we = _mm_loadu_si128((const __m128i *)&smooth_weights[12]);
+ weight_h[0] = _mm_unpacklo_epi8(we, zero);
+ weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+ weight_h[2] = _mm_unpackhi_epi8(we, zero);
+ weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+ } else if (height == 32) {
+ const __m128i weight_lo =
+ _mm_loadu_si128((const __m128i *)&smooth_weights[28]);
+ weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
+ weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
+ weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
+ weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
+ const __m128i weight_hi =
+ _mm_loadu_si128((const __m128i *)&smooth_weights[28 + 16]);
+ weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
+ weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
+ weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
+ weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
+ }
+}
+
+static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh,
+ const __m128i *ww, int h, uint8_t *dst,
+ ptrdiff_t stride, int second_half) {
+ const __m128i round = _mm_set1_epi32((1 << SMOOTH_WEIGHT_LOG2_SCALE));
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i inc = _mm_set1_epi16(0x202);
+ const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
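+ // After packus, one 8-bit result sits in the low byte of each 16-bit lane;
+ // |gat| gathers those eight bytes into the low half for the 8-wide store.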
+
+ __m128i rep = second_half ? _mm_set1_epi16((short)0x8008)
+ : _mm_set1_epi16((short)0x8000);
+ __m128i d = _mm_set1_epi16(0x100);
+
+ int i;
+ for (i = 0; i < h; ++i) {
+ const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
+ const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
+ const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
+ __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
+ __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);
+
+ __m128i b = _mm_shuffle_epi8(pixels[2], rep);
+ b = _mm_unpacklo_epi16(b, pixels[3]);
+ __m128i sum0 = _mm_madd_epi16(b, ww[0]);
+ __m128i sum1 = _mm_madd_epi16(b, ww[1]);
+
+ s0 = _mm_add_epi32(s0, sum0);
+ s0 = _mm_add_epi32(s0, round);
+ s0 = _mm_srai_epi32(s0, 1 + SMOOTH_WEIGHT_LOG2_SCALE);
+
+ s1 = _mm_add_epi32(s1, sum1);
+ s1 = _mm_add_epi32(s1, round);
+ s1 = _mm_srai_epi32(s1, 1 + SMOOTH_WEIGHT_LOG2_SCALE);
+
+ sum0 = _mm_packus_epi16(s0, s1);
+ sum0 = _mm_shuffle_epi8(sum0, gat);
+ _mm_storel_epi64((__m128i *)dst, sum0);
+ dst += stride;
+
+ rep = _mm_add_epi16(rep, one);
+ d = _mm_add_epi16(d, inc);
+ }
+}
+
+void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i pixels[4];
+ load_pixel_w8(above, left, 4, pixels);
+
+ __m128i wh[4], ww[2];
+ load_weight_w8(4, wh, ww);
+
+ smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
+}
+
+void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ __m128i pixels[4];
+ load_pixel_w8(above, left, 8, pixels);
+
+ __m128i wh[4], ww[2];
+ load_weight_w8(8, wh, ww);
+
+ smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
+}
+
+void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i pixels[4];
+ load_pixel_w8(above, left, 16, pixels);
+
+ __m128i wh[4], ww[2];
+ load_weight_w8(16, wh, ww);
+
+ smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
+ dst += stride << 3;
+ smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
+}
+
+void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m128i pixels[8];
+ load_pixel_w8(above, left, 32, pixels);
+
+ __m128i wh[8], ww[2];
+ load_weight_w8(32, wh, ww);
+
+ smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
+ dst += stride << 3;
+ smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
+ dst += stride << 3;
+ smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
+ dst += stride << 3;
+ smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
+}
+
+// TODO(slavarnway): Visual Studio only supports restrict when /std:c11
+// (available in 2019+) or greater is specified; __restrict can be used in that
+// case. This should be moved to rtcd and used consistently between the
+// function declarations and definitions to avoid warnings in Visual Studio
+// when defining LIBAOM_RESTRICT to restrict or __restrict.
+#if defined(_MSC_VER)
+#define LIBAOM_RESTRICT
+#else
+#define LIBAOM_RESTRICT restrict
+#endif
+
+static AOM_FORCE_INLINE __m128i Load4(const void *src) {
+ // With new compilers such as clang 8.0.0 we can use the new _mm_loadu_si32
+ // intrinsic. Both _mm_loadu_si32(src) and the code here are compiled into a
+ // movss instruction.
+ //
+ // Until compiler support of _mm_loadu_si32 is widespread, use of
+ // _mm_loadu_si32 is banned.
+ int val;
+ memcpy(&val, src, sizeof(val));
+ return _mm_cvtsi32_si128(val);
+}
+
+static AOM_FORCE_INLINE __m128i LoadLo8(const void *a) {
+ return _mm_loadl_epi64((const __m128i *)(a));
+}
+
+static AOM_FORCE_INLINE __m128i LoadUnaligned16(const void *a) {
+ return _mm_loadu_si128((const __m128i *)(a));
+}
+
+static AOM_FORCE_INLINE void Store4(void *dst, const __m128i x) {
+ const int val = _mm_cvtsi128_si32(x);
+ memcpy(dst, &val, sizeof(val));
+}
+
+static AOM_FORCE_INLINE void StoreLo8(void *a, const __m128i v) {
+ _mm_storel_epi64((__m128i *)(a), v);
+}
+
+static AOM_FORCE_INLINE void StoreUnaligned16(void *a, const __m128i v) {
+ _mm_storeu_si128((__m128i *)(a), v);
+}
+
+static AOM_FORCE_INLINE __m128i cvtepu8_epi16(__m128i x) {
+ return _mm_unpacklo_epi8((x), _mm_setzero_si128());
+}
+
+static AOM_FORCE_INLINE __m128i cvtepu8_epi32(__m128i x) {
+ const __m128i tmp = _mm_unpacklo_epi8((x), _mm_setzero_si128());
+ return _mm_unpacklo_epi16(tmp, _mm_setzero_si128());
+}
+
+static AOM_FORCE_INLINE __m128i cvtepu16_epi32(__m128i x) {
+ return _mm_unpacklo_epi16((x), _mm_setzero_si128());
+}
+
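+// Generic SMOOTH_PRED used by all of the 16-wide and wider predictors below.
+// Each pmaddwd lane computes top[x] * w_h[y] + w_w[x] * left[y]; the other
+// two terms, (256 - w_h[y]) * bottom_left (plus the rounding constant) and
+// (256 - w_w[x]) * top_right, are added as 32-bit values before the final
+// shift by 9.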
+static AOM_FORCE_INLINE void smooth_predictor_wxh(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column, int width,
+ int height) {
+ const uint8_t *const sm_weights_h = smooth_weights + height - 4;
+ const uint8_t *const sm_weights_w = smooth_weights + width - 4;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i scale_value = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i bottom_left = _mm_cvtsi32_si128(left_column[height - 1]);
+ const __m128i top_right = _mm_set1_epi16(top_row[width - 1]);
+ const __m128i round = _mm_set1_epi32(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ for (int y = 0; y < height; ++y) {
+ const __m128i weights_y = _mm_cvtsi32_si128(sm_weights_h[y]);
+ const __m128i left_y = _mm_cvtsi32_si128(left_column[y]);
+ const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
+ __m128i scaled_bottom_left =
+ _mm_mullo_epi16(scale_m_weights_y, bottom_left);
+ const __m128i weight_left_y =
+ _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
+ scaled_bottom_left = _mm_add_epi32(scaled_bottom_left, round);
+ scaled_bottom_left = _mm_shuffle_epi32(scaled_bottom_left, 0);
+ for (int x = 0; x < width; x += 8) {
+ const __m128i top_x = LoadLo8(top_row + x);
+ const __m128i weights_x = LoadLo8(sm_weights_w + x);
+ const __m128i top_weights_x = _mm_unpacklo_epi8(top_x, weights_x);
+ const __m128i top_weights_x_lo = cvtepu8_epi16(top_weights_x);
+ const __m128i top_weights_x_hi = _mm_unpackhi_epi8(top_weights_x, zero);
+
+ // Here opposite weights and pixels are multiplied, where the order of
+ // interleaving is indicated in the names.
+ __m128i pred_lo = _mm_madd_epi16(top_weights_x_lo, weight_left_y);
+ __m128i pred_hi = _mm_madd_epi16(top_weights_x_hi, weight_left_y);
+
+ // |scaled_bottom_left| is always scaled by the same weight each row, so
+ // we only derive |scaled_top_right| values here.
+ const __m128i inverted_weights_x =
+ _mm_sub_epi16(scale_value, cvtepu8_epi16(weights_x));
+ const __m128i scaled_top_right =
+ _mm_mullo_epi16(inverted_weights_x, top_right);
+ const __m128i scaled_top_right_lo = cvtepu16_epi32(scaled_top_right);
+ const __m128i scaled_top_right_hi =
+ _mm_unpackhi_epi16(scaled_top_right, zero);
+ pred_lo = _mm_add_epi32(pred_lo, scaled_bottom_left);
+ pred_hi = _mm_add_epi32(pred_hi, scaled_bottom_left);
+ pred_lo = _mm_add_epi32(pred_lo, scaled_top_right_lo);
+ pred_hi = _mm_add_epi32(pred_hi, scaled_top_right_hi);
+
+ // The round value for RightShiftWithRounding was added with
+ // |scaled_bottom_left|.
+ pred_lo = _mm_srli_epi32(pred_lo, (1 + SMOOTH_WEIGHT_LOG2_SCALE));
+ pred_hi = _mm_srli_epi32(pred_hi, (1 + SMOOTH_WEIGHT_LOG2_SCALE));
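+ // Each 32-bit lane now holds a value in [0, 255]; two packus steps collapse
+ // the eight results into eight contiguous bytes.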
+ const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
+ StoreLo8(dst + x, _mm_packus_epi16(pred, pred));
+ }
+ dst += stride;
+ }
+}
+
+void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 16, 4);
+}
+
+void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 16, 8);
+}
+
+void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 16, 16);
+}
+
+void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 16, 32);
+}
+
+void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 16, 64);
+}
+
+void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 32, 8);
+}
+
+void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 32, 16);
+}
+
+void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 32, 32);
+}
+
+void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 32, 64);
+}
+
+void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 64, 16);
+}
+
+void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 64, 32);
+}
+
+void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 64, 64);
+}
+
+// -----------------------------------------------------------------------------
+// Smooth horizontal/vertical helper functions.
+
+// For Horizontal, pixels1 and pixels2 are the same repeated value. For
+// Vertical, weights1 and weights2 are the same, and scaled_corner1 and
+// scaled_corner2 are the same.
+static AOM_FORCE_INLINE void write_smooth_directional_sum16(
+ uint8_t *LIBAOM_RESTRICT dst, const __m128i pixels1, const __m128i pixels2,
+ const __m128i weights1, const __m128i weights2,
+ const __m128i scaled_corner1, const __m128i scaled_corner2,
+ const __m128i round) {
+ const __m128i weighted_px1 = _mm_mullo_epi16(pixels1, weights1);
+ const __m128i weighted_px2 = _mm_mullo_epi16(pixels2, weights2);
+ const __m128i pred_sum1 = _mm_add_epi16(scaled_corner1, weighted_px1);
+ const __m128i pred_sum2 = _mm_add_epi16(scaled_corner2, weighted_px2);
+ // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+ const __m128i pred1 = _mm_srli_epi16(_mm_add_epi16(pred_sum1, round), 8);
+ const __m128i pred2 = _mm_srli_epi16(_mm_add_epi16(pred_sum2, round), 8);
+ StoreUnaligned16(dst, _mm_packus_epi16(pred1, pred2));
+}
+
+static AOM_FORCE_INLINE __m128i smooth_directional_sum8(
+ const __m128i pixels, const __m128i weights, const __m128i scaled_corner) {
+ const __m128i weighted_px = _mm_mullo_epi16(pixels, weights);
+ return _mm_add_epi16(scaled_corner, weighted_px);
+}
+
+static AOM_FORCE_INLINE void write_smooth_directional_sum8(
+ uint8_t *LIBAOM_RESTRICT dst, const __m128i *pixels, const __m128i *weights,
+ const __m128i *scaled_corner, const __m128i *round) {
+ const __m128i pred_sum =
+ smooth_directional_sum8(*pixels, *weights, *scaled_corner);
+ // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+ const __m128i pred = _mm_srli_epi16(_mm_add_epi16(pred_sum, *round), 8);
+ StoreLo8(dst, _mm_packus_epi16(pred, pred));
+}
+
+// -----------------------------------------------------------------------------
+// SMOOTH_V_PRED
+
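+// Scalar reference (aom_dsp/intrapred.c, smooth_v_predictor):
+//   pred[r][c] = (w[r] * above[c] + (256 - w[r]) * left[bh - 1] + 128) >> 8
+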
+static AOM_FORCE_INLINE void load_smooth_vertical_pixels4(
+ const uint8_t *LIBAOM_RESTRICT above, const uint8_t *LIBAOM_RESTRICT left,
+ const int height, __m128i *pixels) {
+ __m128i top = Load4(above);
+ const __m128i bottom_left = _mm_set1_epi16(left[height - 1]);
+ top = cvtepu8_epi16(top);
+ pixels[0] = _mm_unpacklo_epi16(top, bottom_left);
+}
+
+// |weight_array| alternates weight vectors from the table with their inverted
+// (256-w) counterparts. This is precomputed by the compiler when the weights
+// table is visible to this module. Removing this visibility can cut speed by up
+// to half in both 4xH and 8xH transforms.
+static AOM_FORCE_INLINE void load_smooth_vertical_weights4(
+ const uint8_t *LIBAOM_RESTRICT weight_array, const int height,
+ __m128i *weights) {
+ const __m128i inverter = _mm_set1_epi16(256);
+
+ if (height == 4) {
+ const __m128i weight = Load4(weight_array);
+ weights[0] = cvtepu8_epi16(weight);
+ weights[1] = _mm_sub_epi16(inverter, weights[0]);
+ } else if (height == 8) {
+ const __m128i weight = LoadLo8(weight_array + 4);
+ weights[0] = cvtepu8_epi16(weight);
+ weights[1] = _mm_sub_epi16(inverter, weights[0]);
+ } else {
+ const __m128i weight = LoadUnaligned16(weight_array + 12);
+ const __m128i zero = _mm_setzero_si128();
+ weights[0] = cvtepu8_epi16(weight);
+ weights[1] = _mm_sub_epi16(inverter, weights[0]);
+ weights[2] = _mm_unpackhi_epi8(weight, zero);
+ weights[3] = _mm_sub_epi16(inverter, weights[2]);
+ }
+}
+
+static AOM_FORCE_INLINE void write_smooth_vertical4xh(
+ const __m128i *pixel, const __m128i *weight, const int height,
+ uint8_t *LIBAOM_RESTRICT dst, const ptrdiff_t stride) {
+ const __m128i pred_round = _mm_set1_epi32(128);
+ const __m128i mask_increment = _mm_set1_epi16(0x0202);
+ const __m128i cvtepu8_epi32 = _mm_set1_epi32(0xC080400);
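+ // Byte-gather mask selecting byte 0 of each 32-bit lane; the name shadows
+ // the cvtepu8_epi32() helper above.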
+ __m128i y_select = _mm_set1_epi16(0x0100);
+
+ for (int y = 0; y < height; ++y) {
+ const __m128i weight_y = _mm_shuffle_epi8(weight[0], y_select);
+ const __m128i inverted_weight_y = _mm_shuffle_epi8(weight[1], y_select);
+ const __m128i alternate_weights =
+ _mm_unpacklo_epi16(weight_y, inverted_weight_y);
+ // Here the pixel vector is top_row[0], corner, top_row[1], corner, ...
+ // The madd instruction yields four results of the form:
+ // (top_row[x] * weight[y] + corner * inverted_weight[y])
+ __m128i sum = _mm_madd_epi16(pixel[0], alternate_weights);
+ sum = _mm_add_epi32(sum, pred_round);
+ sum = _mm_srai_epi32(sum, 8);
+ sum = _mm_shuffle_epi8(sum, cvtepu8_epi32);
+ Store4(dst, sum);
+ dst += stride;
+ y_select = _mm_add_epi16(y_select, mask_increment);
+ }
+}
+
+void aom_smooth_v_predictor_4x4_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ __m128i pixels;
+ load_smooth_vertical_pixels4(top_row, left_column, 4, &pixels);
+
+ __m128i weights[2];
+ load_smooth_vertical_weights4(smooth_weights, 4, weights);
+
+ write_smooth_vertical4xh(&pixels, weights, 4, dst, stride);
+}
+
+void aom_smooth_v_predictor_4x8_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ __m128i pixels;
+ load_smooth_vertical_pixels4(top_row, left_column, 8, &pixels);
+
+ __m128i weights[2];
+ load_smooth_vertical_weights4(smooth_weights, 8, weights);
+
+ write_smooth_vertical4xh(&pixels, weights, 8, dst, stride);
+}
+
+void aom_smooth_v_predictor_4x16_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ __m128i pixels;
+ load_smooth_vertical_pixels4(top_row, left_column, 16, &pixels);
+
+ __m128i weights[4];
+ load_smooth_vertical_weights4(smooth_weights, 16, weights);
+
+ write_smooth_vertical4xh(&pixels, weights, 8, dst, stride);
+ dst += stride << 3;
+ write_smooth_vertical4xh(&pixels, &weights[2], 8, dst, stride);
+}
+
+void aom_smooth_v_predictor_8x4_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i bottom_left = _mm_set1_epi16(left_column[3]);
+ const __m128i weights = cvtepu8_epi16(Load4(smooth_weights));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i y_select = _mm_set1_epi32(0x01000100);
+ const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
+ __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x03020302);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x05040504);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x07060706);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
+}
+
+void aom_smooth_v_predictor_8x8_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
+ const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
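+ // |y_mask| steps through the pshufb selectors 0x0100, 0x0302, ..., 0x0F0E,
+ // broadcasting 16-bit weight element y to every lane; the 0x0F0E0F0F bound
+ // gives exactly eight rows per loop.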
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_v_predictor_8x16_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
+ const __m128i weights = LoadUnaligned16(smooth_weights + 12);
+
+ const __m128i weights1 = cvtepu8_epi16(weights);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_v_predictor_8x32_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
+ const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
+ const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
+ const __m128i weights1 = cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+ const __m128i weights3 = cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i scaled_bottom_left3 =
+ _mm_mullo_epi16(inverted_weights3, bottom_left);
+ const __m128i scaled_bottom_left4 =
+ _mm_mullo_epi16(inverted_weights4, bottom_left);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ const __m128i top = cvtepu8_epi16(LoadLo8(top_row));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+ write_smooth_directional_sum8(dst, &top, &weights_y, &scaled_bottom_left_y,
+ &round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_v_predictor_16x4_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i bottom_left = _mm_set1_epi16(left_column[3]);
+ const __m128i weights = cvtepu8_epi16(Load4(smooth_weights));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ const __m128i round = _mm_set1_epi16(128);
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = cvtepu8_epi16(top);
+ const __m128i top_hi = cvtepu8_epi16(_mm_srli_si128(top, 8));
+
+ __m128i y_select = _mm_set1_epi32(0x01000100);
+ __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ __m128i scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x03020302);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x05040504);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x07060706);
+ weights_y = _mm_shuffle_epi8(weights, y_select);
+ scaled_bottom_left_y = _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+}
+
+void aom_smooth_v_predictor_16x8_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
+ const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ const __m128i round = _mm_set1_epi16(128);
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = cvtepu8_epi16(top);
+ const __m128i top_hi = cvtepu8_epi16(_mm_srli_si128(top, 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_v_predictor_16x16_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights = LoadUnaligned16(smooth_weights + 12);
+ const __m128i weights_lo = cvtepu8_epi16(weights);
+ const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+ const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+ const __m128i scaled_bottom_left_lo =
+ _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+ const __m128i scaled_bottom_left_hi =
+ _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+ const __m128i round = _mm_set1_epi16(128);
+
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_v_predictor_16x32_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
+ const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
+ const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i weights1 = cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+ const __m128i weights3 = cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i scaled_bottom_left3 =
+ _mm_mullo_epi16(inverted_weights3, bottom_left);
+ const __m128i scaled_bottom_left4 =
+ _mm_mullo_epi16(inverted_weights4, bottom_left);
+ const __m128i round = _mm_set1_epi16(128);
+
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_v_predictor_16x64_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i round = _mm_set1_epi16(128);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top = LoadUnaligned16(top_row);
+ const __m128i top_lo = cvtepu8_epi16(top);
+ const __m128i top_hi = _mm_unpackhi_epi8(top, zero);
+ const uint8_t *weights_base_ptr = smooth_weights + 60;
+ for (int left_offset = 0; left_offset < 64; left_offset += 16) {
+ const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
+ const __m128i weights_lo = cvtepu8_epi16(weights);
+ const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+ const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+ const __m128i scaled_bottom_left_lo =
+ _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+ const __m128i scaled_bottom_left_hi =
+ _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+ write_smooth_directional_sum16(dst, top_lo, top_hi, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ }
+}
+
+void aom_smooth_v_predictor_32x8_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_column[7]);
+ const __m128i top_lo = LoadUnaligned16(top_row);
+ const __m128i top_hi = LoadUnaligned16(top_row + 16);
+ const __m128i top1 = cvtepu8_epi16(top_lo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+ const __m128i top3 = cvtepu8_epi16(top_hi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_bottom_left =
+ _mm_mullo_epi16(inverted_weights, bottom_left);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_v_predictor_32x16_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
+ const __m128i top_lo = LoadUnaligned16(top_row);
+ const __m128i top_hi = LoadUnaligned16(top_row + 16);
+ const __m128i top1 = cvtepu8_epi16(top_lo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+ const __m128i top3 = cvtepu8_epi16(top_hi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+ const __m128i weights = LoadUnaligned16(smooth_weights + 12);
+ const __m128i weights1 = cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_v_predictor_32x32_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
+ const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
+ const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i top_lo = LoadUnaligned16(top_row);
+ const __m128i top_hi = LoadUnaligned16(top_row + 16);
+ const __m128i top1 = cvtepu8_epi16(top_lo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+ const __m128i top3 = cvtepu8_epi16(top_hi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+ const __m128i weights1 = cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+ const __m128i weights3 = cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i scaled_bottom_left3 =
+ _mm_mullo_epi16(inverted_weights3, bottom_left);
+ const __m128i scaled_bottom_left4 =
+ _mm_mullo_epi16(inverted_weights4, bottom_left);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_v_predictor_32x64_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
+ const __m128i top_lo = LoadUnaligned16(top_row);
+ const __m128i top_hi = LoadUnaligned16(top_row + 16);
+ const __m128i top1 = cvtepu8_epi16(top_lo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lo, zero);
+ const __m128i top3 = cvtepu8_epi16(top_hi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_hi, zero);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ const uint8_t *weights_base_ptr = smooth_weights + 60;
+ for (int left_offset = 0; left_offset < 64; left_offset += 16) {
+ const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
+ const __m128i weights_lo = cvtepu8_epi16(weights);
+ const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+ const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+ const __m128i scaled_bottom_left_lo =
+ _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+ const __m128i scaled_bottom_left_hi =
+ _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ }
+}
+
+void aom_smooth_v_predictor_64x16_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i bottom_left = _mm_set1_epi16(left_column[15]);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top_lolo = LoadUnaligned16(top_row);
+ const __m128i top_lohi = LoadUnaligned16(top_row + 16);
+ const __m128i top1 = cvtepu8_epi16(top_lolo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
+ const __m128i top3 = cvtepu8_epi16(top_lohi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
+
+ const __m128i weights = LoadUnaligned16(smooth_weights + 12);
+ const __m128i weights1 = cvtepu8_epi16(weights);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i top_hilo = LoadUnaligned16(top_row + 32);
+ const __m128i top_hihi = LoadUnaligned16(top_row + 48);
+ const __m128i top5 = cvtepu8_epi16(top_hilo);
+ const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
+ const __m128i top7 = cvtepu8_epi16(top_hihi);
+ const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_v_predictor_64x32_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_column[31]);
+ const __m128i top_lolo = LoadUnaligned16(top_row);
+ const __m128i top_lohi = LoadUnaligned16(top_row + 16);
+ const __m128i top1 = cvtepu8_epi16(top_lolo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
+ const __m128i top3 = cvtepu8_epi16(top_lohi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
+ const __m128i top_hilo = LoadUnaligned16(top_row + 32);
+ const __m128i top_hihi = LoadUnaligned16(top_row + 48);
+ const __m128i top5 = cvtepu8_epi16(top_hilo);
+ const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
+ const __m128i top7 = cvtepu8_epi16(top_hihi);
+ const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
+ const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
+ const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
+ const __m128i weights1 = cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = _mm_unpackhi_epi8(weights_lo, zero);
+ const __m128i weights3 = cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = _mm_unpackhi_epi8(weights_hi, zero);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_bottom_left1 =
+ _mm_mullo_epi16(inverted_weights1, bottom_left);
+ const __m128i scaled_bottom_left2 =
+ _mm_mullo_epi16(inverted_weights2, bottom_left);
+ const __m128i scaled_bottom_left3 =
+ _mm_mullo_epi16(inverted_weights3, bottom_left);
+ const __m128i scaled_bottom_left4 =
+ _mm_mullo_epi16(inverted_weights4, bottom_left);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights1, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left1, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights2, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left2, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights3, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left3, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights4, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left4, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_v_predictor_64x64_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i bottom_left = _mm_set1_epi16(left_column[63]);
+ const __m128i top_lolo = LoadUnaligned16(top_row);
+ const __m128i top_lohi = LoadUnaligned16(top_row + 16);
+ const __m128i top1 = cvtepu8_epi16(top_lolo);
+ const __m128i top2 = _mm_unpackhi_epi8(top_lolo, zero);
+ const __m128i top3 = cvtepu8_epi16(top_lohi);
+ const __m128i top4 = _mm_unpackhi_epi8(top_lohi, zero);
+ const __m128i top_hilo = LoadUnaligned16(top_row + 32);
+ const __m128i top_hihi = LoadUnaligned16(top_row + 48);
+ const __m128i top5 = cvtepu8_epi16(top_hilo);
+ const __m128i top6 = _mm_unpackhi_epi8(top_hilo, zero);
+ const __m128i top7 = cvtepu8_epi16(top_hihi);
+ const __m128i top8 = _mm_unpackhi_epi8(top_hihi, zero);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+  const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ const uint8_t *weights_base_ptr = smooth_weights + 60;
+ for (int left_offset = 0; left_offset < 64; left_offset += 16) {
+ const __m128i weights = LoadUnaligned16(weights_base_ptr + left_offset);
+ const __m128i weights_lo = cvtepu8_epi16(weights);
+ const __m128i weights_hi = _mm_unpackhi_epi8(weights, zero);
+ const __m128i inverted_weights_lo = _mm_sub_epi16(scale, weights_lo);
+ const __m128i inverted_weights_hi = _mm_sub_epi16(scale, weights_hi);
+ const __m128i scaled_bottom_left_lo =
+ _mm_mullo_epi16(inverted_weights_lo, bottom_left);
+ const __m128i scaled_bottom_left_hi =
+ _mm_mullo_epi16(inverted_weights_hi, bottom_left);
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_lo, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_lo, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i weights_y = _mm_shuffle_epi8(weights_hi, y_select);
+ const __m128i scaled_bottom_left_y =
+ _mm_shuffle_epi8(scaled_bottom_left_hi, y_select);
+ write_smooth_directional_sum16(dst, top1, top2, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 16, top3, top4, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 32, top5, top6, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ write_smooth_directional_sum16(dst + 48, top7, top8, weights_y, weights_y,
+ scaled_bottom_left_y, scaled_bottom_left_y,
+ round);
+ dst += stride;
+ }
+ }
+}
+
+// -----------------------------------------------------------------------------
+// SMOOTH_H_PRED
+static AOM_FORCE_INLINE void write_smooth_horizontal_sum4(
+ uint8_t *LIBAOM_RESTRICT dst, const __m128i *left_y, const __m128i *weights,
+ const __m128i *scaled_top_right, const __m128i *round) {
+ const __m128i weighted_left_y = _mm_mullo_epi16(*left_y, *weights);
+ const __m128i pred_sum = _mm_add_epi32(*scaled_top_right, weighted_left_y);
+ // Equivalent to RightShiftWithRounding(pred[x][y], 8).
+ const __m128i pred = _mm_srli_epi32(_mm_add_epi32(pred_sum, *round), 8);
+ const __m128i cvtepi32_epi8 = _mm_set1_epi32(0x0C080400);
+ Store4(dst, _mm_shuffle_epi8(pred, cvtepi32_epi8));
+}
+
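+// Scalar sketch of the SMOOTH_H math implemented below (reference only, not a
+// drop-in implementation): with scale = 1 << SMOOTH_WEIGHT_LOG2_SCALE and w[x]
+// the width-dependent entry of smooth_weights,
+//   pred[y][x] = (left[y] * w[x] + top_right * (scale - w[x]) + scale / 2)
+//                >> SMOOTH_WEIGHT_LOG2_SCALE
+// write_smooth_horizontal_sum4() evaluates this for four x positions at a
+// time, with |scaled_top_right| holding the precomputed
+// top_right * (scale - w[x]) terms.
+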
+void aom_smooth_h_predictor_4x4_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi32(top_row[3]);
+ const __m128i left = cvtepu8_epi32(Load4(left_column));
+ const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i left_y = _mm_shuffle_epi32(left, 0);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0x55);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xaa);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xff);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+}
+
+void aom_smooth_h_predictor_4x8_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi32(top_row[3]);
+ const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i left = cvtepu8_epi32(Load4(left_column));
+ __m128i left_y = _mm_shuffle_epi32(left, 0);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0x55);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xaa);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xff);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+
+ left = cvtepu8_epi32(Load4(left_column + 4));
+ left_y = _mm_shuffle_epi32(left, 0);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0x55);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xaa);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xff);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+}
+
+void aom_smooth_h_predictor_4x16_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi32(top_row[3]);
+ const __m128i weights = cvtepu8_epi32(Load4(smooth_weights));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi32(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i left = cvtepu8_epi32(Load4(left_column));
+ __m128i left_y = _mm_shuffle_epi32(left, 0);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0x55);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xaa);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xff);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+
+ left = cvtepu8_epi32(Load4(left_column + 4));
+ left_y = _mm_shuffle_epi32(left, 0);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0x55);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xaa);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xff);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+
+ left = cvtepu8_epi32(Load4(left_column + 8));
+ left_y = _mm_shuffle_epi32(left, 0);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0x55);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xaa);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xff);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+
+ left = cvtepu8_epi32(Load4(left_column + 12));
+ left_y = _mm_shuffle_epi32(left, 0);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0x55);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xaa);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ left_y = _mm_shuffle_epi32(left, 0xff);
+ write_smooth_horizontal_sum4(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+}
+
+// In the shared write_smooth_directional_sum* helpers used below, |pixels| is,
+// for SMOOTH_H, the left-column value repeated across the row. For SMOOTH_V,
+// |pixels| is a segment of the top row or the whole top row, and it is
+// |weights| that is repeated instead.
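+// As with write_smooth_horizontal_sum4() above, the 8- and 16-lane helpers
+// (defined earlier in this file) effectively compute, per 16-bit lane,
+//   (pixels * weights + scaled_corner + round) >> SMOOTH_WEIGHT_LOG2_SCALE
+// where scaled_corner is the precomputed scaled_top_right (SMOOTH_H) or
+// scaled_bottom_left (SMOOTH_V) term, before packing the results to bytes.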
+void aom_smooth_h_predictor_8x4_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[7]);
+ const __m128i left = cvtepu8_epi16(Load4(left_column));
+ const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i y_select = _mm_set1_epi32(0x01000100);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x03020302);
+ left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x05040504);
+ left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ y_select = _mm_set1_epi32(0x07060706);
+ left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+}
+
+void aom_smooth_h_predictor_8x8_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[7]);
+ const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_h_predictor_8x16_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[7]);
+ const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i left = cvtepu8_epi16(LoadLo8(left_column));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_h_predictor_8x32_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[7]);
+ const __m128i weights = cvtepu8_epi16(LoadLo8(smooth_weights + 4));
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i inverted_weights = _mm_sub_epi16(scale, weights);
+ const __m128i scaled_top_right = _mm_mullo_epi16(inverted_weights, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i left = cvtepu8_epi16(LoadLo8(left_column));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 16));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 24));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum8(dst, &left_y, &weights, &scaled_top_right,
+ &round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_h_predictor_16x4_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[15]);
+ const __m128i left = cvtepu8_epi16(Load4(left_column));
+ const __m128i weights = LoadUnaligned16(smooth_weights + 12);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i y_mask = _mm_set1_epi32(0x01000100);
+ __m128i left_y = _mm_shuffle_epi8(left, y_mask);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ dst += stride;
+ y_mask = _mm_set1_epi32(0x03020302);
+ left_y = _mm_shuffle_epi8(left, y_mask);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ dst += stride;
+ y_mask = _mm_set1_epi32(0x05040504);
+ left_y = _mm_shuffle_epi8(left, y_mask);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ dst += stride;
+ y_mask = _mm_set1_epi32(0x07060706);
+ left_y = _mm_shuffle_epi8(left, y_mask);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+}
+
+void aom_smooth_h_predictor_16x8_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[15]);
+ const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights = LoadUnaligned16(smooth_weights + 12);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_h_predictor_16x16_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[15]);
+ const __m128i weights = LoadUnaligned16(smooth_weights + 12);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i left = cvtepu8_epi16(LoadLo8(left_column));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_h_predictor_16x32_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[15]);
+ const __m128i weights = LoadUnaligned16(smooth_weights + 12);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i left = cvtepu8_epi16(LoadLo8(left_column));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 16));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 24));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_h_predictor_16x64_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[15]);
+ const __m128i weights = LoadUnaligned16(smooth_weights + 12);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int left_offset = 0; left_offset < 64; left_offset += 8) {
+ const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2,
+ round);
+ dst += stride;
+ }
+ }
+}
+
+void aom_smooth_h_predictor_32x8_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[31]);
+ const __m128i left = cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
+ const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+ const __m128i weights3 = cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_h_predictor_32x16_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[31]);
+ const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
+ const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+ const __m128i weights3 = cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left1, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ dst += stride;
+ }
+  const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left2, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_h_predictor_32x32_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[31]);
+ const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
+ const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+ const __m128i weights3 = cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ __m128i left = cvtepu8_epi16(LoadLo8(left_column));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 16));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ dst += stride;
+ }
+ left = cvtepu8_epi16(LoadLo8(left_column + 24));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_h_predictor_32x64_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[31]);
+ const __m128i weights_lo = LoadUnaligned16(smooth_weights + 28);
+ const __m128i weights_hi = LoadUnaligned16(smooth_weights + 44);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights_lo);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lo, 8));
+ const __m128i weights3 = cvtepu8_epi16(weights_hi);
+ const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_hi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int left_offset = 0; left_offset < 64; left_offset += 8) {
+ const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2,
+ round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3,
+ weights4, scaled_top_right3,
+ scaled_top_right4, round);
+ dst += stride;
+ }
+ }
+}
+
+void aom_smooth_h_predictor_64x16_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[63]);
+ const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
+ const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights_lolo);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
+ const __m128i weights3 = cvtepu8_epi16(weights_lohi);
+ const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
+ const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
+ const __m128i weights5 = cvtepu8_epi16(weights_hilo);
+ const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
+ const __m128i weights7 = cvtepu8_epi16(weights_hihi);
+ const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
+ const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
+ const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
+ const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
+ const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
+ const __m128i scaled_top_right5 =
+ _mm_mullo_epi16(inverted_weights5, top_right);
+ const __m128i scaled_top_right6 =
+ _mm_mullo_epi16(inverted_weights6, top_right);
+ const __m128i scaled_top_right7 =
+ _mm_mullo_epi16(inverted_weights7, top_right);
+ const __m128i scaled_top_right8 =
+ _mm_mullo_epi16(inverted_weights8, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left1, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, round);
+ write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, round);
+ dst += stride;
+ }
+ const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ __m128i y_select = _mm_set1_epi32(y_mask);
+ __m128i left_y = _mm_shuffle_epi8(left2, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, round);
+ write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_h_predictor_64x32_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[63]);
+ const __m128i left1 = cvtepu8_epi16(LoadLo8(left_column));
+ const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
+ const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights_lolo);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
+ const __m128i weights3 = cvtepu8_epi16(weights_lohi);
+ const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
+ const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
+ const __m128i weights5 = cvtepu8_epi16(weights_hilo);
+ const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
+ const __m128i weights7 = cvtepu8_epi16(weights_hihi);
+ const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
+ const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
+ const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
+ const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
+ const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
+ const __m128i scaled_top_right5 =
+ _mm_mullo_epi16(inverted_weights5, top_right);
+ const __m128i scaled_top_right6 =
+ _mm_mullo_epi16(inverted_weights6, top_right);
+ const __m128i scaled_top_right7 =
+ _mm_mullo_epi16(inverted_weights7, top_right);
+ const __m128i scaled_top_right8 =
+ _mm_mullo_epi16(inverted_weights8, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left1, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, round);
+ write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, round);
+ dst += stride;
+ }
+ const __m128i left2 = cvtepu8_epi16(LoadLo8(left_column + 8));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left2, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, round);
+ write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, round);
+ dst += stride;
+ }
+ const __m128i left3 = cvtepu8_epi16(LoadLo8(left_column + 16));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left3, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, round);
+ write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, round);
+ dst += stride;
+ }
+ const __m128i left4 = cvtepu8_epi16(LoadLo8(left_column + 24));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left4, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2, round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3, weights4,
+ scaled_top_right3, scaled_top_right4, round);
+ write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5, weights6,
+ scaled_top_right5, scaled_top_right6, round);
+ write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7, weights8,
+ scaled_top_right7, scaled_top_right8, round);
+ dst += stride;
+ }
+}
+
+void aom_smooth_h_predictor_64x64_ssse3(
+ uint8_t *LIBAOM_RESTRICT dst, ptrdiff_t stride,
+ const uint8_t *LIBAOM_RESTRICT top_row,
+ const uint8_t *LIBAOM_RESTRICT left_column) {
+ const __m128i top_right = _mm_set1_epi16(top_row[63]);
+ const __m128i weights_lolo = LoadUnaligned16(smooth_weights + 60);
+ const __m128i weights_lohi = LoadUnaligned16(smooth_weights + 76);
+ const __m128i scale = _mm_set1_epi16(1 << SMOOTH_WEIGHT_LOG2_SCALE);
+ const __m128i weights1 = cvtepu8_epi16(weights_lolo);
+ const __m128i weights2 = cvtepu8_epi16(_mm_srli_si128(weights_lolo, 8));
+ const __m128i weights3 = cvtepu8_epi16(weights_lohi);
+ const __m128i weights4 = cvtepu8_epi16(_mm_srli_si128(weights_lohi, 8));
+ const __m128i inverted_weights1 = _mm_sub_epi16(scale, weights1);
+ const __m128i inverted_weights2 = _mm_sub_epi16(scale, weights2);
+ const __m128i inverted_weights3 = _mm_sub_epi16(scale, weights3);
+ const __m128i inverted_weights4 = _mm_sub_epi16(scale, weights4);
+ const __m128i scaled_top_right1 =
+ _mm_mullo_epi16(inverted_weights1, top_right);
+ const __m128i scaled_top_right2 =
+ _mm_mullo_epi16(inverted_weights2, top_right);
+ const __m128i scaled_top_right3 =
+ _mm_mullo_epi16(inverted_weights3, top_right);
+ const __m128i scaled_top_right4 =
+ _mm_mullo_epi16(inverted_weights4, top_right);
+ const __m128i weights_hilo = LoadUnaligned16(smooth_weights + 92);
+ const __m128i weights_hihi = LoadUnaligned16(smooth_weights + 108);
+ const __m128i weights5 = cvtepu8_epi16(weights_hilo);
+ const __m128i weights6 = cvtepu8_epi16(_mm_srli_si128(weights_hilo, 8));
+ const __m128i weights7 = cvtepu8_epi16(weights_hihi);
+ const __m128i weights8 = cvtepu8_epi16(_mm_srli_si128(weights_hihi, 8));
+ const __m128i inverted_weights5 = _mm_sub_epi16(scale, weights5);
+ const __m128i inverted_weights6 = _mm_sub_epi16(scale, weights6);
+ const __m128i inverted_weights7 = _mm_sub_epi16(scale, weights7);
+ const __m128i inverted_weights8 = _mm_sub_epi16(scale, weights8);
+ const __m128i scaled_top_right5 =
+ _mm_mullo_epi16(inverted_weights5, top_right);
+ const __m128i scaled_top_right6 =
+ _mm_mullo_epi16(inverted_weights6, top_right);
+ const __m128i scaled_top_right7 =
+ _mm_mullo_epi16(inverted_weights7, top_right);
+ const __m128i scaled_top_right8 =
+ _mm_mullo_epi16(inverted_weights8, top_right);
+ const __m128i round = _mm_set1_epi16(1 << (SMOOTH_WEIGHT_LOG2_SCALE - 1));
+ for (int left_offset = 0; left_offset < 64; left_offset += 8) {
+ const __m128i left = cvtepu8_epi16(LoadLo8(left_column + left_offset));
+ for (int y_mask = 0x01000100; y_mask < 0x0F0E0F0F; y_mask += 0x02020202) {
+ const __m128i y_select = _mm_set1_epi32(y_mask);
+ const __m128i left_y = _mm_shuffle_epi8(left, y_select);
+ write_smooth_directional_sum16(dst, left_y, left_y, weights1, weights2,
+ scaled_top_right1, scaled_top_right2,
+ round);
+ write_smooth_directional_sum16(dst + 16, left_y, left_y, weights3,
+ weights4, scaled_top_right3,
+ scaled_top_right4, round);
+ write_smooth_directional_sum16(dst + 32, left_y, left_y, weights5,
+ weights6, scaled_top_right5,
+ scaled_top_right6, round);
+ write_smooth_directional_sum16(dst + 48, left_y, left_y, weights7,
+ weights8, scaled_top_right7,
+ scaled_top_right8, round);
+ dst += stride;
+ }
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/intrapred_utils.h b/third_party/aom/aom_dsp/x86/intrapred_utils.h
new file mode 100644
index 0000000000..502574673e
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/intrapred_utils.h
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2021, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_
+#define AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_
+
+#include <emmintrin.h> // SSE2
+#include "aom/aom_integer.h"
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
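+// _mm_shuffle_epi8 masks: row i splits the 16 input bytes starting at offset
+// i into even and odd relative positions. The low half gathers bytes
+// i, i + 2, ..., the high half gathers bytes i + 1, i + 3, ..., and the first
+// i lanes of each half are padding.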
+static DECLARE_ALIGNED(16, uint8_t, EvenOddMaskx[8][16]) = {
+ { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
+ { 0, 1, 3, 5, 7, 9, 11, 13, 0, 2, 4, 6, 8, 10, 12, 14 },
+ { 0, 0, 2, 4, 6, 8, 10, 12, 0, 0, 3, 5, 7, 9, 11, 13 },
+ { 0, 0, 0, 3, 5, 7, 9, 11, 0, 0, 0, 4, 6, 8, 10, 12 },
+ { 0, 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 0, 5, 7, 9, 11 },
+ { 0, 0, 0, 0, 0, 5, 7, 9, 0, 0, 0, 0, 0, 6, 8, 10 },
+ { 0, 0, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 0, 7, 9 },
+ { 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8 }
+};
+
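+// _mm_shuffle_epi8 masks: row i moves each byte to a lane i positions higher
+// (output lane j reads input byte j - i), replicating input byte 0 into the
+// low i lanes.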
+static DECLARE_ALIGNED(16, uint8_t, LoadMaskx[16][16]) = {
+ { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
+ { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
+ { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
+ { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 },
+ { 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
+ { 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 },
+ { 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
+};
+
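+// Per-row 32-bit lane masks: row i has its first i + 1 lanes set to all ones
+// and the remaining lanes cleared, suitable for masked loads or blends
+// (presumably in the z2 directional predictor, per the name).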
+static DECLARE_ALIGNED(32, int, LoadMaskz2[8][8]) = {
+ { -1, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, 0, 0, 0, 0, 0, 0 },
+ { -1, -1, -1, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, 0, 0, 0, 0 },
+ { -1, -1, -1, -1, -1, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, 0, 0 },
+ { -1, -1, -1, -1, -1, -1, -1, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1 },
+};
+
+static INLINE void transpose4x16_sse2(__m128i *x, __m128i *d) {
+ __m128i w0, w1, w2, w3, ww0, ww1, ww2, ww3;
+ w0 = _mm_unpacklo_epi8(x[0], x[1]);
+ w1 = _mm_unpacklo_epi8(x[2], x[3]);
+ w2 = _mm_unpackhi_epi8(x[0], x[1]);
+ w3 = _mm_unpackhi_epi8(x[2], x[3]);
+
+ ww0 = _mm_unpacklo_epi16(w0, w1);
+ ww1 = _mm_unpacklo_epi16(w2, w3);
+ ww2 = _mm_unpackhi_epi16(w0, w1);
+ ww3 = _mm_unpackhi_epi16(w2, w3);
+
+ w0 = _mm_unpacklo_epi32(ww0, ww1);
+ w2 = _mm_unpacklo_epi32(ww2, ww3);
+ w1 = _mm_unpackhi_epi32(ww0, ww1);
+ w3 = _mm_unpackhi_epi32(ww2, ww3);
+
+ d[0] = _mm_unpacklo_epi64(w0, w2);
+ d[1] = _mm_unpackhi_epi64(w0, w2);
+ d[2] = _mm_unpacklo_epi64(w1, w3);
+ d[3] = _mm_unpackhi_epi64(w1, w3);
+
+ d[4] = _mm_srli_si128(d[0], 8);
+ d[5] = _mm_srli_si128(d[1], 8);
+ d[6] = _mm_srli_si128(d[2], 8);
+ d[7] = _mm_srli_si128(d[3], 8);
+
+ d[8] = _mm_srli_si128(d[0], 4);
+ d[9] = _mm_srli_si128(d[1], 4);
+ d[10] = _mm_srli_si128(d[2], 4);
+ d[11] = _mm_srli_si128(d[3], 4);
+
+ d[12] = _mm_srli_si128(d[0], 12);
+ d[13] = _mm_srli_si128(d[1], 12);
+ d[14] = _mm_srli_si128(d[2], 12);
+ d[15] = _mm_srli_si128(d[3], 12);
+}
+
+static INLINE void transpose16x16_sse2(__m128i *x, __m128i *d) {
+ __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
+ __m128i w10, w11, w12, w13, w14, w15;
+
+ w0 = _mm_unpacklo_epi8(x[0], x[1]);
+ w1 = _mm_unpacklo_epi8(x[2], x[3]);
+ w2 = _mm_unpacklo_epi8(x[4], x[5]);
+ w3 = _mm_unpacklo_epi8(x[6], x[7]);
+
+ w8 = _mm_unpacklo_epi8(x[8], x[9]);
+ w9 = _mm_unpacklo_epi8(x[10], x[11]);
+ w10 = _mm_unpacklo_epi8(x[12], x[13]);
+ w11 = _mm_unpacklo_epi8(x[14], x[15]);
+
+ w4 = _mm_unpacklo_epi16(w0, w1);
+ w5 = _mm_unpacklo_epi16(w2, w3);
+ w12 = _mm_unpacklo_epi16(w8, w9);
+ w13 = _mm_unpacklo_epi16(w10, w11);
+
+ w6 = _mm_unpacklo_epi32(w4, w5);
+ w7 = _mm_unpackhi_epi32(w4, w5);
+ w14 = _mm_unpacklo_epi32(w12, w13);
+ w15 = _mm_unpackhi_epi32(w12, w13);
+
+ // Store first 4-line result
+ d[0] = _mm_unpacklo_epi64(w6, w14);
+ d[1] = _mm_unpackhi_epi64(w6, w14);
+ d[2] = _mm_unpacklo_epi64(w7, w15);
+ d[3] = _mm_unpackhi_epi64(w7, w15);
+
+ w4 = _mm_unpackhi_epi16(w0, w1);
+ w5 = _mm_unpackhi_epi16(w2, w3);
+ w12 = _mm_unpackhi_epi16(w8, w9);
+ w13 = _mm_unpackhi_epi16(w10, w11);
+
+ w6 = _mm_unpacklo_epi32(w4, w5);
+ w7 = _mm_unpackhi_epi32(w4, w5);
+ w14 = _mm_unpacklo_epi32(w12, w13);
+ w15 = _mm_unpackhi_epi32(w12, w13);
+
+ // Store second 4-line result
+ d[4] = _mm_unpacklo_epi64(w6, w14);
+ d[5] = _mm_unpackhi_epi64(w6, w14);
+ d[6] = _mm_unpacklo_epi64(w7, w15);
+ d[7] = _mm_unpackhi_epi64(w7, w15);
+
+ // upper half
+ w0 = _mm_unpackhi_epi8(x[0], x[1]);
+ w1 = _mm_unpackhi_epi8(x[2], x[3]);
+ w2 = _mm_unpackhi_epi8(x[4], x[5]);
+ w3 = _mm_unpackhi_epi8(x[6], x[7]);
+
+ w8 = _mm_unpackhi_epi8(x[8], x[9]);
+ w9 = _mm_unpackhi_epi8(x[10], x[11]);
+ w10 = _mm_unpackhi_epi8(x[12], x[13]);
+ w11 = _mm_unpackhi_epi8(x[14], x[15]);
+
+ w4 = _mm_unpacklo_epi16(w0, w1);
+ w5 = _mm_unpacklo_epi16(w2, w3);
+ w12 = _mm_unpacklo_epi16(w8, w9);
+ w13 = _mm_unpacklo_epi16(w10, w11);
+
+ w6 = _mm_unpacklo_epi32(w4, w5);
+ w7 = _mm_unpackhi_epi32(w4, w5);
+ w14 = _mm_unpacklo_epi32(w12, w13);
+ w15 = _mm_unpackhi_epi32(w12, w13);
+
+  // Store third 4-line result
+ d[8] = _mm_unpacklo_epi64(w6, w14);
+ d[9] = _mm_unpackhi_epi64(w6, w14);
+ d[10] = _mm_unpacklo_epi64(w7, w15);
+ d[11] = _mm_unpackhi_epi64(w7, w15);
+
+ w4 = _mm_unpackhi_epi16(w0, w1);
+ w5 = _mm_unpackhi_epi16(w2, w3);
+ w12 = _mm_unpackhi_epi16(w8, w9);
+ w13 = _mm_unpackhi_epi16(w10, w11);
+
+ w6 = _mm_unpacklo_epi32(w4, w5);
+ w7 = _mm_unpackhi_epi32(w4, w5);
+ w14 = _mm_unpacklo_epi32(w12, w13);
+ w15 = _mm_unpackhi_epi32(w12, w13);
+
+  // Store fourth 4-line result
+ d[12] = _mm_unpacklo_epi64(w6, w14);
+ d[13] = _mm_unpackhi_epi64(w6, w14);
+ d[14] = _mm_unpacklo_epi64(w7, w15);
+ d[15] = _mm_unpackhi_epi64(w7, w15);
+}
+
+static void transpose_TX_16X16(const uint8_t *src, ptrdiff_t pitchSrc,
+ uint8_t *dst, ptrdiff_t pitchDst) {
+ __m128i r[16];
+ __m128i d[16];
+ for (int j = 0; j < 16; j++) {
+ r[j] = _mm_loadu_si128((__m128i *)(src + j * pitchSrc));
+ }
+ transpose16x16_sse2(r, d);
+ for (int j = 0; j < 16; j++) {
+ _mm_storeu_si128((__m128i *)(dst + j * pitchDst), d[j]);
+ }
+}
+
+static void transpose(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst,
+ ptrdiff_t pitchDst, int width, int height) {
+ for (int j = 0; j < height; j += 16)
+ for (int i = 0; i < width; i += 16)
+ transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
+ dst + j * pitchDst + i, pitchDst);
+}
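+
+// Note: transpose() above walks the width x height region in 16x16 tiles; each
+// tile is loaded, transposed with transpose16x16_sse2() and written to the
+// mirrored position in dst, so the whole block ends up transposed. Since both
+// loops step by 16, callers are presumably expected to pass dimensions that
+// are multiples of 16.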
+
+#endif // AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_
diff --git a/third_party/aom/aom_dsp/x86/intrapred_x86.h b/third_party/aom/aom_dsp/x86/intrapred_x86.h
new file mode 100644
index 0000000000..b13f575a76
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/intrapred_x86.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_INTRAPRED_X86_H_
+#define AOM_AOM_DSP_X86_INTRAPRED_X86_H_
+
+#include <emmintrin.h> // SSE2
+#include "aom/aom_integer.h"
+#include "config/aom_config.h"
+
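+// Note on the helpers below: _mm_sad_epu8(x, zero) sums the absolute values of
+// the bytes of x (i.e. the bytes themselves), producing one partial sum in
+// each 64-bit half of the register. The unpackhi/add step then folds the upper
+// partial sum into the lower one, so the block sum is returned in the low word.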
+static INLINE __m128i dc_sum_16_sse2(const uint8_t *ref) {
+ __m128i x = _mm_load_si128((__m128i const *)ref);
+ const __m128i zero = _mm_setzero_si128();
+ x = _mm_sad_epu8(x, zero);
+ const __m128i high = _mm_unpackhi_epi64(x, x);
+ return _mm_add_epi16(x, high);
+}
+
+static INLINE __m128i dc_sum_32_sse2(const uint8_t *ref) {
+ __m128i x0 = _mm_load_si128((__m128i const *)ref);
+ __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));
+ const __m128i zero = _mm_setzero_si128();
+ x0 = _mm_sad_epu8(x0, zero);
+ x1 = _mm_sad_epu8(x1, zero);
+ x0 = _mm_add_epi16(x0, x1);
+ const __m128i high = _mm_unpackhi_epi64(x0, x0);
+ return _mm_add_epi16(x0, high);
+}
+
+#endif // AOM_AOM_DSP_X86_INTRAPRED_X86_H_
diff --git a/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm b/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm
new file mode 100644
index 0000000000..0bc841a7a4
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/inv_wht_sse2.asm
@@ -0,0 +1,107 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro REORDER_INPUTS 0
+ ; a c d b to a b c d
+ SWAP 1, 3, 2
+%endmacro
+
+%macro TRANSFORM_COLS 0
+ ; input:
+ ; m0 a
+ ; m1 b
+ ; m2 c
+ ; m3 d
+ paddw m0, m2
+ psubw m3, m1
+
+ ; wide subtract
+ punpcklwd m4, m0
+ punpcklwd m5, m3
+ psrad m4, 16
+ psrad m5, 16
+ psubd m4, m5
+ psrad m4, 1
+ packssdw m4, m4 ; e
+
+ psubw m5, m4, m1 ; b
+ psubw m4, m2 ; c
+ psubw m0, m5
+ paddw m3, m4
+ ; m0 a
+ SWAP 1, 5 ; m1 b
+ SWAP 2, 4 ; m2 c
+ ; m3 d
+%endmacro
+
+%macro TRANSPOSE_4X4 0
+ punpcklwd m0, m2
+ punpcklwd m1, m3
+ mova m2, m0
+ punpcklwd m0, m1
+ punpckhwd m2, m1
+ pshufd m1, m0, 0x0e
+ pshufd m3, m2, 0x0e
+%endmacro
+
+; transpose a 4x4 int16 matrix in xmm0 and xmm1 to the bottom half of xmm0-xmm3
+%macro TRANSPOSE_4X4_WIDE 0
+ mova m3, m0
+ punpcklwd m0, m1
+ punpckhwd m3, m1
+ mova m2, m0
+ punpcklwd m0, m3
+ punpckhwd m2, m3
+ pshufd m1, m0, 0x0e
+ pshufd m3, m2, 0x0e
+%endmacro
+
+%macro ADD_STORE_4P_2X 5 ; src1, src2, tmp1, tmp2, zero
+ movd m%3, [outputq]
+ movd m%4, [outputq + strideq]
+ punpcklbw m%3, m%5
+ punpcklbw m%4, m%5
+ paddw m%1, m%3
+ paddw m%2, m%4
+ packuswb m%1, m%5
+ packuswb m%2, m%5
+ movd [outputq], m%1
+ movd [outputq + strideq], m%2
+%endmacro
+
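+; iwht4x4_16_add: inverse 4x4 Walsh-Hadamard transform, with the result added
+; to the destination block. The 32-bit input coefficients are packed down to 16
+; bits and pre-shifted right by 2 (presumably undoing the forward WHT scaling),
+; the block is then transposed and run through TRANSFORM_COLS twice (once per
+; dimension), and the final values are added to the output rows with
+; ADD_STORE_4P_2X.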
+INIT_XMM sse2
+cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride
+ mova m0, [inputq + 0]
+ packssdw m0, [inputq + 16]
+ mova m1, [inputq + 32]
+ packssdw m1, [inputq + 48]
+ psraw m0, 2
+ psraw m1, 2
+
+ TRANSPOSE_4X4_WIDE
+ REORDER_INPUTS
+ TRANSFORM_COLS
+ TRANSPOSE_4X4
+ REORDER_INPUTS
+ TRANSFORM_COLS
+
+ pxor m4, m4
+ ADD_STORE_4P_2X 0, 1, 5, 6, 4
+ lea outputq, [outputq + 2 * strideq]
+ ADD_STORE_4P_2X 2, 3, 5, 6, 4
+
+ RET
diff --git a/third_party/aom/aom_dsp/x86/jnt_sad_sse2.c b/third_party/aom/aom_dsp/x86/jnt_sad_sse2.c
new file mode 100644
index 0000000000..16d2f4be7f
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/jnt_sad_sse2.c
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+static unsigned int sad4xh_sse2(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, int width,
+ int height) {
+ int i;
+ assert(width == 4);
+ (void)width;
+
+ __m128i sad = _mm_setzero_si128();
+ for (i = 0; i < height; i += 4) {
+ __m128i x0 = xx_loadl_32(a + 0 * a_stride);
+ __m128i x1 = xx_loadl_32(a + 1 * a_stride);
+ __m128i x2 = xx_loadl_32(a + 2 * a_stride);
+ __m128i x3 = xx_loadl_32(a + 3 * a_stride);
+ __m128i x_lo = _mm_unpacklo_epi32(x0, x1);
+ __m128i x_hi = _mm_unpacklo_epi32(x2, x3);
+
+ __m128i x = _mm_unpacklo_epi64(x_lo, x_hi);
+
+ x0 = xx_loadl_32(b + 0 * b_stride);
+ x1 = xx_loadl_32(b + 1 * b_stride);
+ x2 = xx_loadl_32(b + 2 * b_stride);
+ x3 = xx_loadl_32(b + 3 * b_stride);
+ x_lo = _mm_unpacklo_epi32(x0, x1);
+ x_hi = _mm_unpacklo_epi32(x2, x3);
+
+ __m128i y = _mm_unpacklo_epi64(x_lo, x_hi);
+
+ __m128i sad4x4 = _mm_sad_epu8(x, y);
+ sad = _mm_add_epi32(sad, sad4x4);
+
+ a += 4 * a_stride;
+ b += 4 * b_stride;
+ }
+
+ // At this point, we have two 32-bit partial SADs at bit[0:31] and [64:95].
+ const unsigned int res =
+ (unsigned int)(_mm_cvtsi128_si32(sad) +
+ _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)));
+
+ return res;
+}
+
+static unsigned int sad8xh_sse2(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, int width,
+ int height) {
+ int i;
+ assert(width == 8);
+ (void)width;
+
+ __m128i sad = _mm_setzero_si128();
+ for (i = 0; i < height; i += 2) {
+ __m128i x0 = xx_loadl_64(a + 0 * a_stride);
+ __m128i x1 = xx_loadl_64(a + 1 * a_stride);
+
+ __m128i x = _mm_unpacklo_epi64(x0, x1);
+
+ x0 = xx_loadl_64(b + 0 * b_stride);
+ x1 = xx_loadl_64(b + 1 * b_stride);
+
+ __m128i y = _mm_unpacklo_epi64(x0, x1);
+
+ __m128i sad8x2 = _mm_sad_epu8(x, y);
+ sad = _mm_add_epi32(sad, sad8x2);
+
+ a += 2 * a_stride;
+ b += 2 * b_stride;
+ }
+
+ const unsigned int res =
+ (unsigned int)(_mm_cvtsi128_si32(sad) +
+ _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)));
+
+ return res;
+}
+
+static unsigned int sad16xh_sse2(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, int width,
+ int height) {
+ int i;
+ assert(width == 16);
+ (void)width;
+
+ __m128i sad = _mm_setzero_si128();
+ for (i = 0; i < height; ++i) {
+ __m128i x = xx_loadu_128(a);
+ __m128i y = xx_loadu_128(b);
+
+ __m128i sad16x1 = _mm_sad_epu8(x, y);
+ sad = _mm_add_epi32(sad, sad16x1);
+
+ a += a_stride;
+ b += b_stride;
+ }
+
+ const unsigned int res =
+ (unsigned int)(_mm_cvtsi128_si32(sad) +
+ _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)));
+
+ return res;
+}
+
+static unsigned int sad32xh_sse2(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, int width,
+ int height) {
+ int i, j;
+ assert(width == 32);
+ (void)width;
+
+ __m128i sad = _mm_setzero_si128();
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < 2; ++j) {
+ __m128i x = xx_loadu_128(a + j * 16);
+ __m128i y = xx_loadu_128(b + j * 16);
+
+ __m128i sad32_half = _mm_sad_epu8(x, y);
+ sad = _mm_add_epi32(sad, sad32_half);
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+
+ const unsigned int res =
+ (unsigned int)(_mm_cvtsi128_si32(sad) +
+ _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)));
+
+ return res;
+}
+
+static unsigned int sad64xh_sse2(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, int width,
+ int height) {
+ int i, j;
+ assert(width == 64);
+ (void)width;
+
+ __m128i sad = _mm_setzero_si128();
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < 4; ++j) {
+ __m128i x = xx_loadu_128(a + j * 16);
+ __m128i y = xx_loadu_128(b + j * 16);
+
+ __m128i sad64_quarter = _mm_sad_epu8(x, y);
+ sad = _mm_add_epi32(sad, sad64_quarter);
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+
+ const unsigned int res =
+ (unsigned int)(_mm_cvtsi128_si32(sad) +
+ _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)));
+
+ return res;
+}
+
+static unsigned int sad128xh_sse2(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, int width,
+ int height) {
+ int i, j;
+ assert(width == 128);
+ (void)width;
+
+ __m128i sad = _mm_setzero_si128();
+ for (i = 0; i < height; ++i) {
+ for (j = 0; j < 8; ++j) {
+ __m128i x = xx_loadu_128(a + j * 16);
+ __m128i y = xx_loadu_128(b + j * 16);
+
+      __m128i sad128_eighth = _mm_sad_epu8(x, y);
+      sad = _mm_add_epi32(sad, sad128_eighth);
+ }
+
+ a += a_stride;
+ b += b_stride;
+ }
+
+ const unsigned int res =
+ (unsigned int)(_mm_cvtsi128_si32(sad) +
+ _mm_cvtsi128_si32(_mm_srli_si128(sad, 8)));
+
+ return res;
+}
+
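+// The macro below computes each dist-weighted SAD in two steps:
+// aom_dist_wtd_comp_avg_pred() first forms the weighted combination of
+// second_pred and ref into comp_pred (stored with a stride equal to the block
+// width m), and the plain SAD between src and comp_pred is then measured with
+// the matching sad##m##xh_sse2() helper above.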
+#define DIST_WTD_SADMXN_SSE2(m, n) \
+ unsigned int aom_dist_wtd_sad##m##x##n##_avg_sse2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint8_t comp_pred[m * n]; \
+ aom_dist_wtd_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride, \
+ jcp_param); \
+ return sad##m##xh_sse2(src, src_stride, comp_pred, m, m, n); \
+ }
+
+DIST_WTD_SADMXN_SSE2(128, 128)
+DIST_WTD_SADMXN_SSE2(128, 64)
+DIST_WTD_SADMXN_SSE2(64, 128)
+DIST_WTD_SADMXN_SSE2(64, 64)
+DIST_WTD_SADMXN_SSE2(64, 32)
+DIST_WTD_SADMXN_SSE2(32, 64)
+DIST_WTD_SADMXN_SSE2(32, 32)
+DIST_WTD_SADMXN_SSE2(32, 16)
+DIST_WTD_SADMXN_SSE2(16, 32)
+DIST_WTD_SADMXN_SSE2(16, 16)
+DIST_WTD_SADMXN_SSE2(16, 8)
+DIST_WTD_SADMXN_SSE2(8, 16)
+DIST_WTD_SADMXN_SSE2(8, 8)
+DIST_WTD_SADMXN_SSE2(8, 4)
+DIST_WTD_SADMXN_SSE2(4, 8)
+DIST_WTD_SADMXN_SSE2(4, 4)
+#if !CONFIG_REALTIME_ONLY
+DIST_WTD_SADMXN_SSE2(4, 16)
+DIST_WTD_SADMXN_SSE2(16, 4)
+DIST_WTD_SADMXN_SSE2(8, 32)
+DIST_WTD_SADMXN_SSE2(32, 8)
+DIST_WTD_SADMXN_SSE2(16, 64)
+DIST_WTD_SADMXN_SSE2(64, 16)
+#endif
diff --git a/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
new file mode 100644
index 0000000000..dd798ca54a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/jnt_variance_ssse3.c
@@ -0,0 +1,161 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+void aom_var_filter_block2d_bil_first_pass_ssse3(
+ const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
+ unsigned int pixel_step, unsigned int output_height,
+ unsigned int output_width, const uint8_t *filter);
+
+void aom_var_filter_block2d_bil_second_pass_ssse3(
+ const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
+ unsigned int pixel_step, unsigned int output_height,
+ unsigned int output_width, const uint8_t *filter);
+
+static INLINE void compute_dist_wtd_avg(__m128i *p0, __m128i *p1,
+ const __m128i *w, const __m128i *r,
+ void *const result) {
+ __m128i p_lo = _mm_unpacklo_epi8(*p0, *p1);
+ __m128i mult_lo = _mm_maddubs_epi16(p_lo, *w);
+ __m128i round_lo = _mm_add_epi16(mult_lo, *r);
+ __m128i shift_lo = _mm_srai_epi16(round_lo, DIST_PRECISION_BITS);
+
+ __m128i p_hi = _mm_unpackhi_epi8(*p0, *p1);
+ __m128i mult_hi = _mm_maddubs_epi16(p_hi, *w);
+ __m128i round_hi = _mm_add_epi16(mult_hi, *r);
+ __m128i shift_hi = _mm_srai_epi16(round_hi, DIST_PRECISION_BITS);
+
+ xx_storeu_128(result, _mm_packus_epi16(shift_lo, shift_hi));
+}
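+
+// Per pixel, the helper above computes (assuming no 16-bit saturation, which
+// holds for the offset ranges used here):
+//   comp = (fwd_offset * ref_px + bck_offset * pred_px +
+//           (1 << (DIST_PRECISION_BITS - 1))) >> DIST_PRECISION_BITS
+// p0 carries the ref pixels and p1 the pred pixels; the byte unpacks
+// interleave them so that _mm_maddubs_epi16 produces both products and their
+// sum in one step.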
+
+void aom_dist_wtd_comp_avg_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, const uint8_t *ref,
+ int ref_stride,
+ const DIST_WTD_COMP_PARAMS *jcp_param) {
+ int i;
+ const int8_t w0 = (int8_t)jcp_param->fwd_offset;
+ const int8_t w1 = (int8_t)jcp_param->bck_offset;
+ const __m128i w = _mm_set_epi8(w1, w0, w1, w0, w1, w0, w1, w0, w1, w0, w1, w0,
+ w1, w0, w1, w0);
+ const int16_t round = (int16_t)((1 << DIST_PRECISION_BITS) >> 1);
+ const __m128i r = _mm_set1_epi16(round);
+
+ if (width >= 16) {
+ // Read 16 pixels one row at a time
+ assert(!(width & 15));
+ for (i = 0; i < height; ++i) {
+ int j;
+ for (j = 0; j < width; j += 16) {
+ __m128i p0 = xx_loadu_128(ref);
+ __m128i p1 = xx_loadu_128(pred);
+
+ compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred);
+
+ comp_pred += 16;
+ pred += 16;
+ ref += 16;
+ }
+ ref += ref_stride - width;
+ }
+ } else if (width >= 8) {
+    // Read 8 pixels, two rows at a time
+ assert(!(width & 7));
+ assert(!(width & 1));
+ for (i = 0; i < height; i += 2) {
+ __m128i p0_0 = xx_loadl_64(ref + 0 * ref_stride);
+ __m128i p0_1 = xx_loadl_64(ref + 1 * ref_stride);
+ __m128i p0 = _mm_unpacklo_epi64(p0_0, p0_1);
+ __m128i p1 = xx_loadu_128(pred);
+
+ compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred);
+
+ comp_pred += 16;
+ pred += 16;
+ ref += 2 * ref_stride;
+ }
+ } else {
+    // Read 4 pixels, four rows at a time
+ assert(!(width & 3));
+ assert(!(height & 3));
+ for (i = 0; i < height; i += 4) {
+ const int8_t *row0 = (const int8_t *)ref + 0 * ref_stride;
+ const int8_t *row1 = (const int8_t *)ref + 1 * ref_stride;
+ const int8_t *row2 = (const int8_t *)ref + 2 * ref_stride;
+ const int8_t *row3 = (const int8_t *)ref + 3 * ref_stride;
+
+ __m128i p0 =
+ _mm_setr_epi8(row0[0], row0[1], row0[2], row0[3], row1[0], row1[1],
+ row1[2], row1[3], row2[0], row2[1], row2[2], row2[3],
+ row3[0], row3[1], row3[2], row3[3]);
+ __m128i p1 = xx_loadu_128(pred);
+
+ compute_dist_wtd_avg(&p0, &p1, &w, &r, comp_pred);
+
+ comp_pred += 16;
+ pred += 16;
+ ref += 4 * ref_stride;
+ }
+ }
+}
+
+#define DIST_WTD_SUBPIX_AVG_VAR(W, H) \
+ uint32_t aom_dist_wtd_sub_pixel_avg_variance##W##x##H##_ssse3( \
+ const uint8_t *a, int a_stride, int xoffset, int yoffset, \
+ const uint8_t *b, int b_stride, uint32_t *sse, \
+ const uint8_t *second_pred, const DIST_WTD_COMP_PARAMS *jcp_param) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
+ \
+ aom_var_filter_block2d_bil_first_pass_ssse3( \
+ a, fdata3, a_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_var_filter_block2d_bil_second_pass_ssse3( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ aom_dist_wtd_comp_avg_pred_ssse3(temp3, second_pred, W, H, temp2, W, \
+ jcp_param); \
+ \
+ return aom_variance##W##x##H(temp3, W, b, b_stride, sse); \
+ }
+
+DIST_WTD_SUBPIX_AVG_VAR(128, 128)
+DIST_WTD_SUBPIX_AVG_VAR(128, 64)
+DIST_WTD_SUBPIX_AVG_VAR(64, 128)
+DIST_WTD_SUBPIX_AVG_VAR(64, 64)
+DIST_WTD_SUBPIX_AVG_VAR(64, 32)
+DIST_WTD_SUBPIX_AVG_VAR(32, 64)
+DIST_WTD_SUBPIX_AVG_VAR(32, 32)
+DIST_WTD_SUBPIX_AVG_VAR(32, 16)
+DIST_WTD_SUBPIX_AVG_VAR(16, 32)
+DIST_WTD_SUBPIX_AVG_VAR(16, 16)
+DIST_WTD_SUBPIX_AVG_VAR(16, 8)
+DIST_WTD_SUBPIX_AVG_VAR(8, 16)
+DIST_WTD_SUBPIX_AVG_VAR(8, 8)
+DIST_WTD_SUBPIX_AVG_VAR(8, 4)
+DIST_WTD_SUBPIX_AVG_VAR(4, 8)
+DIST_WTD_SUBPIX_AVG_VAR(4, 4)
+
+#if !CONFIG_REALTIME_ONLY
+DIST_WTD_SUBPIX_AVG_VAR(4, 16)
+DIST_WTD_SUBPIX_AVG_VAR(16, 4)
+DIST_WTD_SUBPIX_AVG_VAR(8, 32)
+DIST_WTD_SUBPIX_AVG_VAR(32, 8)
+DIST_WTD_SUBPIX_AVG_VAR(16, 64)
+DIST_WTD_SUBPIX_AVG_VAR(64, 16)
+#endif
diff --git a/third_party/aom/aom_dsp/x86/loopfilter_avx2.c b/third_party/aom/aom_dsp/x86/loopfilter_avx2.c
new file mode 100644
index 0000000000..6e77742e3c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/loopfilter_avx2.c
@@ -0,0 +1,1016 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> /* AVX2 */
+
+#include "config/aom_dsp_rtcd.h"
+
+DECLARE_ALIGNED(32, static const uint8_t, filt_loopfilter_avx2[32]) = {
+ 0, 128, 1, 128, 2, 128, 3, 128, 4, 128, 5, 128, 6, 128, 7, 128,
+ 8, 128, 9, 128, 10, 128, 11, 128, 12, 128, 13, 128, 14, 128, 15, 128
+};
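+
+// When used with _mm256_shuffle_epi8, the table above zero-extends the 16
+// packed pixels of a row to sixteen 16-bit values (indices with the high bit
+// set, i.e. 128, force the destination byte to zero), so the flat-filter
+// arithmetic below can run at 16-bit precision across both 128-bit lanes.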
+
+void aom_lpf_horizontal_6_quad_avx2(unsigned char *s, int p,
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0) {
+ __m256i p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;
+ __m128i p2, p1, p0, q0, q1, q2;
+ __m128i mask, flat;
+
+ const __m128i thresh_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh0[0]));
+ const __m128i limit_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit0[0]));
+ const __m128i blimit_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit0[0]));
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+
+ p256_2 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
+ p256_1 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
+ p256_0 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
+ q256_0 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
+ q256_1 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
+ q256_2 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
+
+ p2 = _mm256_castsi256_si128(p256_2);
+ p1 = _mm256_castsi256_si128(p256_1);
+ p0 = _mm256_castsi256_si128(p256_0);
+ q0 = _mm256_castsi256_si128(q256_0);
+ q1 = _mm256_castsi256_si128(q256_1);
+ q2 = _mm256_castsi256_si128(q256_2);
+
+ {
+ __m128i work;
+ const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+ const __m128i abs_p1p0 =
+ _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
+ const __m128i abs_q1q0 =
+ _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
+ __m128i abs_p0q0 =
+ _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
+ __m128i abs_p1q1 =
+ _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(flat, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+ _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit_v);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
+
+ // loop filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i hev;
+
+ hev = _mm_subs_epu8(flat, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ __m128i ps1 = _mm_xor_si128(p1, t80);
+ __m128i ps0 = _mm_xor_si128(p0, t80);
+ __m128i qs0 = _mm_xor_si128(q0, t80);
+ __m128i qs1 = _mm_xor_si128(q1, t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+ __m128i flat_p1, flat_p0, flat_q0, flat_q1;
+
+ filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+ qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+
+ __m128i work;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
+ _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
+ flat = _mm_max_epu8(work, flat);
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ const __m256i four = _mm256_set1_epi16(4);
+ __m256i pixetFilter, add, res;
+
+ const __m256i filter =
+ _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);
+
+ p256_2 = _mm256_shuffle_epi8(p256_2, filter);
+ p256_1 = _mm256_shuffle_epi8(p256_1, filter);
+ p256_0 = _mm256_shuffle_epi8(p256_0, filter);
+ q256_0 = _mm256_shuffle_epi8(q256_0, filter);
+ q256_1 = _mm256_shuffle_epi8(q256_1, filter);
+ q256_2 = _mm256_shuffle_epi8(q256_2, filter);
+
+ pixetFilter = _mm256_slli_epi16(
+ _mm256_add_epi16(p256_2, _mm256_add_epi16(p256_1, p256_0)), 1);
+ pixetFilter =
+ _mm256_add_epi16(pixetFilter, _mm256_add_epi16(p256_2, q256_0));
+ pixetFilter = _mm256_add_epi16(four, pixetFilter);
+ res = _mm256_srli_epi16(pixetFilter, 3);
+ flat_p1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res, res), 168));
+ p1 = _mm_andnot_si128(flat, ps1);
+ flat_p1 = _mm_and_si128(flat, flat_p1);
+ p1 = _mm_or_si128(flat_p1, p1);
+
+ add = _mm256_add_epi16(_mm256_sub_epi16(q256_1, p256_2),
+ _mm256_sub_epi16(q256_0, p256_2));
+ pixetFilter = _mm256_add_epi16(pixetFilter, add);
+ res = _mm256_srli_epi16(pixetFilter, 3);
+ flat_p0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res, res), 168));
+ p0 = _mm_andnot_si128(flat, ps0);
+ flat_p0 = _mm_and_si128(flat, flat_p0);
+ p0 = _mm_or_si128(flat_p0, p0);
+
+ add = _mm256_add_epi16(_mm256_sub_epi16(q256_2, p256_2),
+ _mm256_sub_epi16(q256_1, p256_1));
+ pixetFilter = _mm256_add_epi16(pixetFilter, add);
+ res = _mm256_srli_epi16(pixetFilter, 3);
+ flat_q0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res, res), 168));
+ q0 = _mm_andnot_si128(flat, qs0);
+ flat_q0 = _mm_and_si128(flat, flat_q0);
+ q0 = _mm_or_si128(flat_q0, q0);
+
+ add = _mm256_add_epi16(_mm256_sub_epi16(q256_2, p256_1),
+ _mm256_sub_epi16(q256_2, p256_0));
+ pixetFilter = _mm256_add_epi16(pixetFilter, add);
+ res = _mm256_srli_epi16(pixetFilter, 3);
+ flat_q1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res, res), 168));
+ q1 = _mm_andnot_si128(flat, qs1);
+ flat_q1 = _mm_and_si128(flat, flat_q1);
+ q1 = _mm_or_si128(flat_q1, q1);
+
+ _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+ _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+ } else {
+ _mm_storeu_si128((__m128i *)(s - 2 * p), ps1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), ps0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), qs0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), qs1);
+ }
+ }
+}
+
+void aom_lpf_horizontal_8_quad_avx2(unsigned char *s, int p,
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0) {
+ __m256i p256_3, q256_3, p256_2, q256_2, p256_1, q256_1, p256_0, q256_0;
+ __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+ __m128i mask, flat;
+
+ const __m128i thresh_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh0[0]));
+ const __m128i limit_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit0[0]));
+ const __m128i blimit_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit0[0]));
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+
+ p256_3 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p)));
+ p256_2 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
+ p256_1 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
+ p256_0 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
+ q256_0 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
+ q256_1 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
+ q256_2 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
+ q256_3 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p)));
+
+ p3 = _mm256_castsi256_si128(p256_3);
+ p2 = _mm256_castsi256_si128(p256_2);
+ p1 = _mm256_castsi256_si128(p256_1);
+ p0 = _mm256_castsi256_si128(p256_0);
+ q0 = _mm256_castsi256_si128(q256_0);
+ q1 = _mm256_castsi256_si128(q256_1);
+ q2 = _mm256_castsi256_si128(q256_2);
+ q3 = _mm256_castsi256_si128(q256_3);
+
+ {
+ __m128i work;
+ const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+ const __m128i abs_p1p0 =
+ _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
+ const __m128i abs_q1q0 =
+ _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
+ __m128i abs_p0q0 =
+ _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
+ __m128i abs_p1q1 =
+ _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(flat, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+ _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
+ _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit_v);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
+
+ // loop filter
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i hev;
+
+ hev = _mm_subs_epu8(flat, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ __m128i ps1 = _mm_xor_si128(p1, t80);
+ __m128i ps0 = _mm_xor_si128(p0, t80);
+ __m128i qs0 = _mm_xor_si128(q0, t80);
+ __m128i qs1 = _mm_xor_si128(q1, t80);
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+ __m128i flat_p2, flat_p1, flat_p0, flat_q0, flat_q1, flat_q2;
+
+ filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_and_si128(filt, mask);
+
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+ qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+
+ __m128i work;
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)),
+ _mm_or_si128(_mm_subs_epu8(q2, q0), _mm_subs_epu8(q0, q2)));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p3, p0), _mm_subs_epu8(p0, p3)),
+ _mm_or_si128(_mm_subs_epu8(q3, q0), _mm_subs_epu8(q0, q3)));
+ flat = _mm_max_epu8(work, flat);
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ const __m256i four = _mm256_set1_epi16(4);
+ __m256i pixetFilter_p2p1p0, p2p1p0, q2q1q0, pixetFilter_q2q1q0, sum_p,
+ sum_q, res_p, res_q;
+
+ const __m256i filter =
+ _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);
+
+ p256_3 = _mm256_shuffle_epi8(p256_3, filter);
+ p256_2 = _mm256_shuffle_epi8(p256_2, filter);
+ p256_1 = _mm256_shuffle_epi8(p256_1, filter);
+ p256_0 = _mm256_shuffle_epi8(p256_0, filter);
+ q256_0 = _mm256_shuffle_epi8(q256_0, filter);
+ q256_1 = _mm256_shuffle_epi8(q256_1, filter);
+ q256_2 = _mm256_shuffle_epi8(q256_2, filter);
+ q256_3 = _mm256_shuffle_epi8(q256_3, filter);
+
+ p2p1p0 = _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1));
+ q2q1q0 = _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1));
+
+ pixetFilter_p2p1p0 =
+ _mm256_add_epi16(four, _mm256_add_epi16(p2p1p0, q2q1q0));
+ pixetFilter_q2q1q0 = pixetFilter_p2p1p0;
+
+ pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, p256_3);
+ res_p =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_0), 3);
+ flat_p0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+ p0 = _mm_andnot_si128(flat, ps0);
+ flat_p0 = _mm_and_si128(flat, flat_p0);
+ p0 = _mm_or_si128(flat_p0, p0);
+
+ pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, q256_3);
+ res_q =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_0), 3);
+ flat_q0 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+ q0 = _mm_andnot_si128(flat, qs0);
+ flat_q0 = _mm_and_si128(flat, flat_q0);
+ q0 = _mm_or_si128(flat_q0, q0);
+
+ sum_p = _mm256_sub_epi16(p256_3, q256_2);
+ pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p);
+ res_p =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_1), 3);
+ flat_p1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+ p1 = _mm_andnot_si128(flat, ps1);
+ flat_p1 = _mm_and_si128(flat, flat_p1);
+ p1 = _mm_or_si128(flat_p1, p1);
+
+ sum_q = _mm256_sub_epi16(q256_3, p256_2);
+ pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q);
+ res_q =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_1), 3);
+ flat_q1 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+ q1 = _mm_andnot_si128(flat, qs1);
+ flat_q1 = _mm_and_si128(flat, flat_q1);
+ q1 = _mm_or_si128(flat_q1, q1);
+
+ sum_p = _mm256_sub_epi16(p256_3, q256_1);
+ pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p);
+ res_p =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_2), 3);
+ flat_p2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_p), 168));
+ p2 = _mm_andnot_si128(flat, p2);
+ flat_p2 = _mm_and_si128(flat, flat_p2);
+ p2 = _mm_or_si128(flat_p2, p2);
+
+ sum_q = _mm256_sub_epi16(q256_3, p256_1);
+ pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q);
+ res_q =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_2), 3);
+ flat_q2 = _mm256_castsi256_si128(
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_q, res_q), 168));
+ q2 = _mm_andnot_si128(flat, q2);
+ flat_q2 = _mm_and_si128(flat, flat_q2);
+ q2 = _mm_or_si128(flat_q2, q2);
+
+ _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+ _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+ } else {
+ _mm_storeu_si128((__m128i *)(s - 2 * p), ps1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), ps0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), qs0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), qs1);
+ }
+ }
+}
+
+static INLINE void trans_store_16x16_lpf_vert14(unsigned char *in0, int in_p,
+ unsigned char *out, int out_p,
+ int is_store_avx2) {
+ const __m128i x0 = _mm_loadu_si128((__m128i *)in0);
+ const __m128i x1 = _mm_loadu_si128((__m128i *)(in0 + in_p * 1));
+ const __m128i x2 = _mm_loadu_si128((__m128i *)(in0 + in_p * 2));
+ const __m128i x3 = _mm_loadu_si128((__m128i *)(in0 + in_p * 3));
+ const __m128i x4 = _mm_loadu_si128((__m128i *)(in0 + in_p * 4));
+ const __m128i x5 = _mm_loadu_si128((__m128i *)(in0 + in_p * 5));
+ const __m128i x6 = _mm_loadu_si128((__m128i *)(in0 + in_p * 6));
+ const __m128i x7 = _mm_loadu_si128((__m128i *)(in0 + in_p * 7));
+
+ const __m256i y0 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(x0), _mm_loadu_si128((__m128i *)(in0 + in_p * 8)),
+ 0x1);
+ const __m256i y1 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(x1), _mm_loadu_si128((__m128i *)(in0 + in_p * 9)),
+ 0x1);
+ const __m256i y2 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(x2), _mm_loadu_si128((__m128i *)(in0 + in_p * 10)),
+ 0x1);
+ const __m256i y3 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(x3), _mm_loadu_si128((__m128i *)(in0 + in_p * 11)),
+ 0x1);
+ const __m256i y4 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(x4), _mm_loadu_si128((__m128i *)(in0 + in_p * 12)),
+ 0x1);
+ const __m256i y5 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(x5), _mm_loadu_si128((__m128i *)(in0 + in_p * 13)),
+ 0x1);
+ const __m256i y6 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(x6), _mm_loadu_si128((__m128i *)(in0 + in_p * 14)),
+ 0x1);
+ const __m256i y7 = _mm256_insertf128_si256(
+ _mm256_castsi128_si256(x7), _mm_loadu_si128((__m128i *)(in0 + in_p * 15)),
+ 0x1);
+
+ const __m256i y_s00 = _mm256_unpacklo_epi8(y0, y1);
+ const __m256i y_s01 = _mm256_unpackhi_epi8(y0, y1);
+ const __m256i y_s02 = _mm256_unpacklo_epi8(y2, y3);
+ const __m256i y_s03 = _mm256_unpackhi_epi8(y2, y3);
+ const __m256i y_s04 = _mm256_unpacklo_epi8(y4, y5);
+ const __m256i y_s05 = _mm256_unpackhi_epi8(y4, y5);
+ const __m256i y_s06 = _mm256_unpacklo_epi8(y6, y7);
+ const __m256i y_s07 = _mm256_unpackhi_epi8(y6, y7);
+
+ const __m256i y_s10 = _mm256_unpacklo_epi16(y_s00, y_s02);
+ const __m256i y_s11 = _mm256_unpackhi_epi16(y_s00, y_s02);
+ const __m256i y_s12 = _mm256_unpacklo_epi16(y_s01, y_s03);
+ const __m256i y_s13 = _mm256_unpackhi_epi16(y_s01, y_s03);
+ const __m256i y_s14 = _mm256_unpacklo_epi16(y_s04, y_s06);
+ const __m256i y_s15 = _mm256_unpackhi_epi16(y_s04, y_s06);
+ const __m256i y_s16 = _mm256_unpacklo_epi16(y_s05, y_s07);
+ const __m256i y_s17 = _mm256_unpackhi_epi16(y_s05, y_s07);
+
+ const __m256i y_s20 = _mm256_unpacklo_epi32(y_s10, y_s14);
+ const __m256i y_s21 = _mm256_unpackhi_epi32(y_s10, y_s14);
+ const __m256i y_s22 = _mm256_unpacklo_epi32(y_s11, y_s15);
+ const __m256i y_s23 = _mm256_unpackhi_epi32(y_s11, y_s15);
+ const __m256i y_s24 = _mm256_unpacklo_epi32(y_s12, y_s16);
+ const __m256i y_s25 = _mm256_unpackhi_epi32(y_s12, y_s16);
+ const __m256i y_s26 = _mm256_unpacklo_epi32(y_s13, y_s17);
+ const __m256i y_s27 = _mm256_unpackhi_epi32(y_s13, y_s17);
+
+ const __m256i row_s01 = _mm256_permute4x64_epi64(y_s20, 0xd8);
+ const __m256i row_s23 = _mm256_permute4x64_epi64(y_s21, 0xd8);
+ const __m256i row_s45 = _mm256_permute4x64_epi64(y_s22, 0xd8);
+ const __m256i row_s67 = _mm256_permute4x64_epi64(y_s23, 0xd8);
+ const __m256i row_s89 = _mm256_permute4x64_epi64(y_s24, 0xd8);
+ const __m256i row_s1011 = _mm256_permute4x64_epi64(y_s25, 0xd8);
+ const __m256i row_s1213 = _mm256_permute4x64_epi64(y_s26, 0xd8);
+ const __m256i row_s1415 = _mm256_permute4x64_epi64(y_s27, 0xd8);
+
+ if (is_store_avx2) {
+ _mm256_storeu_si256((__m256i *)(out), row_s01);
+ _mm256_storeu_si256((__m256i *)(out + (2 * out_p)), row_s23);
+ _mm256_storeu_si256((__m256i *)(out + (4 * out_p)), row_s45);
+ _mm256_storeu_si256((__m256i *)(out + (6 * out_p)), row_s67);
+ _mm256_storeu_si256((__m256i *)(out + (8 * out_p)), row_s89);
+ _mm256_storeu_si256((__m256i *)(out + (10 * out_p)), row_s1011);
+ _mm256_storeu_si256((__m256i *)(out + (12 * out_p)), row_s1213);
+ _mm256_storeu_si256((__m256i *)(out + (14 * out_p)), row_s1415);
+ } else {
+ _mm_storeu_si128((__m128i *)(out), _mm256_castsi256_si128(row_s01));
+ _mm_storeu_si128((__m128i *)(out + (2 * out_p)),
+ _mm256_castsi256_si128(row_s23));
+ _mm_storeu_si128((__m128i *)(out + (4 * out_p)),
+ _mm256_castsi256_si128(row_s45));
+ _mm_storeu_si128((__m128i *)(out + (6 * out_p)),
+ _mm256_castsi256_si128(row_s67));
+ _mm_storeu_si128((__m128i *)(out + (8 * out_p)),
+ _mm256_castsi256_si128(row_s89));
+ _mm_storeu_si128((__m128i *)(out + (10 * out_p)),
+ _mm256_castsi256_si128(row_s1011));
+ _mm_storeu_si128((__m128i *)(out + (12 * out_p)),
+ _mm256_castsi256_si128(row_s1213));
+ _mm_storeu_si128((__m128i *)(out + (14 * out_p)),
+ _mm256_castsi256_si128(row_s1415));
+ _mm_storeu_si128((__m128i *)(out + (1 * out_p)),
+ _mm256_extracti128_si256(row_s01, 1));
+ _mm_storeu_si128((__m128i *)(out + (3 * out_p)),
+ _mm256_extracti128_si256(row_s23, 1));
+ _mm_storeu_si128((__m128i *)(out + (5 * out_p)),
+ _mm256_extracti128_si256(row_s45, 1));
+ _mm_storeu_si128((__m128i *)(out + (7 * out_p)),
+ _mm256_extracti128_si256(row_s67, 1));
+ _mm_storeu_si128((__m128i *)(out + (9 * out_p)),
+ _mm256_extracti128_si256(row_s89, 1));
+ _mm_storeu_si128((__m128i *)(out + (11 * out_p)),
+ _mm256_extracti128_si256(row_s1011, 1));
+ _mm_storeu_si128((__m128i *)(out + (13 * out_p)),
+ _mm256_extracti128_si256(row_s1213, 1));
+ _mm_storeu_si128((__m128i *)(out + (15 * out_p)),
+ _mm256_extracti128_si256(row_s1415, 1));
+ }
+}
+
+void aom_lpf_horizontal_14_quad_avx2(unsigned char *s, int p,
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0) {
+ __m128i mask, flat;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+
+ __m256i p256_3 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 4 * p)));
+ __m256i p256_2 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 3 * p)));
+ __m256i p256_1 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 2 * p)));
+ __m256i p256_0 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 1 * p)));
+ __m256i q256_0 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s - 0 * p)));
+ __m256i q256_1 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 1 * p)));
+ __m256i q256_2 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 2 * p)));
+ __m256i q256_3 =
+ _mm256_castpd_si256(_mm256_broadcast_pd((__m128d const *)(s + 3 * p)));
+
+ __m128i p3 = _mm256_castsi256_si128(p256_3);
+ __m128i p2 = _mm256_castsi256_si128(p256_2);
+ __m128i p1 = _mm256_castsi256_si128(p256_1);
+ __m128i p0 = _mm256_castsi256_si128(p256_0);
+ __m128i q0 = _mm256_castsi256_si128(q256_0);
+ __m128i q1 = _mm256_castsi256_si128(q256_1);
+ __m128i q2 = _mm256_castsi256_si128(q256_2);
+ __m128i q3 = _mm256_castsi256_si128(q256_3);
+
+ {
+ const __m128i limit_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit0[0]));
+ const __m128i blimit_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit0[0]));
+ const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+ const __m128i abs_p1p0 =
+ _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1));
+ const __m128i abs_q1q0 =
+ _mm_or_si128(_mm_subs_epu8(q1, q0), _mm_subs_epu8(q0, q1));
+ __m128i abs_p0q0 =
+ _mm_or_si128(_mm_subs_epu8(p0, q0), _mm_subs_epu8(q0, p0));
+ __m128i abs_p1q1 =
+ _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1));
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(flat, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ __m128i work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(p2, p1), _mm_subs_epu8(p1, p2)),
+ _mm_or_si128(_mm_subs_epu8(p3, p2), _mm_subs_epu8(p2, p3)));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(
+ _mm_or_si128(_mm_subs_epu8(q2, q1), _mm_subs_epu8(q1, q2)),
+ _mm_or_si128(_mm_subs_epu8(q3, q2), _mm_subs_epu8(q2, q3)));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit_v);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
+
+ // loop filter
+ {
+ const __m128i thresh_v =
+ _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh0[0]));
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t4 = _mm_add_epi8(one, t3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t7f = _mm_sub_epi8(t80, one);
+
+ __m128i hev = _mm_subs_epu8(flat, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+
+ __m128i ps1 = _mm_xor_si128(p1, t80);
+ __m128i ps0 = _mm_xor_si128(p0, t80);
+ __m128i qs0 = _mm_xor_si128(q0, t80);
+ __m128i qs1 = _mm_xor_si128(q1, t80);
+
+ __m128i filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);
+ __m128i work_a = _mm_subs_epi8(qs0, ps0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_and_si128(filt, mask);
+
+ __m128i filter1 = _mm_adds_epi8(filt, t4);
+ __m128i filter2 = _mm_adds_epi8(filt, t3);
+
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
+
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+
+ filt = _mm_adds_epi8(filter1, one);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+ qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+
+ // Derive flat
+ __m256i p0q0256 = _mm256_blend_epi32(p256_0, q256_0, 0xf0);
+ __m256i p2q2256 = _mm256_blend_epi32(p256_2, q256_2, 0xf0);
+ __m256i p3q3256 = _mm256_blend_epi32(p256_3, q256_3, 0xf0);
+ const __m256i ps0qs0256 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(ps0), qs0, 0x1);
+ const __m256i ps1qs1256 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(ps1), qs1, 0x1);
+ const __m256i work01 = _mm256_or_si256(_mm256_subs_epu8(p2q2256, p0q0256),
+ _mm256_subs_epu8(p0q0256, p2q2256));
+ const __m256i work02 = _mm256_or_si256(_mm256_subs_epu8(p3q3256, p0q0256),
+ _mm256_subs_epu8(p0q0256, p3q3256));
+ const __m256i max0_256 = _mm256_max_epu8(work01, work02);
+ const __m128i max1_256 =
+ _mm_max_epu8(_mm256_castsi256_si128(max0_256),
+ _mm256_extractf128_si256(max0_256, 1));
+ flat = _mm_max_epu8(max1_256, flat);
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
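+    // flat (computed above from p2..q2) selects the short flat filtering of
+    // p0..p2 / q0..q2; flat2, derived below from the outer pixels p4..p6 and
+    // q4..q6, additionally selects the wide filtering that rewrites p0..p5 /
+    // q0..q5. Neither is evaluated unless at least one lane of flat is set.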
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ const __m256i flat256 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(flat), flat, 0x1);
+ const __m256i eight = _mm256_set1_epi16(8);
+ const __m256i four = _mm256_set1_epi16(4);
+
+ __m256i p256_4 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 5 * p)));
+ __m256i q256_4 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 4 * p)));
+ __m256i p256_5 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 6 * p)));
+ __m256i q256_5 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 5 * p)));
+ __m256i p256_6 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s - 7 * p)));
+ __m256i q256_6 = _mm256_castpd_si256(
+ _mm256_broadcast_pd((__m128d const *)(s + 6 * p)));
+
+ // Derive flat2
+ __m256i p4q4256 = _mm256_blend_epi32(p256_4, q256_4, 0xf0);
+ __m256i p5q5256 = _mm256_blend_epi32(p256_5, q256_5, 0xf0);
+ const __m256i p6q6256 = _mm256_blend_epi32(p256_6, q256_6, 0xf0);
+ const __m256i work1 = _mm256_or_si256(_mm256_subs_epu8(p4q4256, p0q0256),
+ _mm256_subs_epu8(p0q0256, p4q4256));
+ const __m256i work2 = _mm256_or_si256(_mm256_subs_epu8(p5q5256, p0q0256),
+ _mm256_subs_epu8(p0q0256, p5q5256));
+ const __m256i work3 = _mm256_or_si256(_mm256_subs_epu8(p6q6256, p0q0256),
+ _mm256_subs_epu8(p0q0256, p6q6256));
+ __m256i flat2_256 = _mm256_max_epu8(work1, work2);
+ flat2_256 = _mm256_max_epu8(flat2_256, work3);
+ __m128i flat2 = _mm_max_epu8(_mm256_castsi256_si128(flat2_256),
+ _mm256_extractf128_si256(flat2_256, 1));
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+
+ const __m256i filter =
+ _mm256_load_si256((__m256i const *)filt_loopfilter_avx2);
+
+ p256_3 = _mm256_shuffle_epi8(p256_3, filter);
+ p256_2 = _mm256_shuffle_epi8(p256_2, filter);
+ p256_1 = _mm256_shuffle_epi8(p256_1, filter);
+ p256_0 = _mm256_shuffle_epi8(p256_0, filter);
+ q256_0 = _mm256_shuffle_epi8(q256_0, filter);
+ q256_1 = _mm256_shuffle_epi8(q256_1, filter);
+ q256_2 = _mm256_shuffle_epi8(q256_2, filter);
+ q256_3 = _mm256_shuffle_epi8(q256_3, filter);
+
+ const __m256i p2p1p0 =
+ _mm256_add_epi16(p256_0, _mm256_add_epi16(p256_2, p256_1));
+ const __m256i q2q1q0 =
+ _mm256_add_epi16(q256_0, _mm256_add_epi16(q256_2, q256_1));
+
+ __m256i pixetFilter_p2p1p0 =
+ _mm256_add_epi16(four, _mm256_add_epi16(p2p1p0, q2q1q0));
+ __m256i pixetFilter_q2q1q0 = pixetFilter_p2p1p0;
+
+ // Derive p0 and q0
+ pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, p256_3);
+ __m256i res_p =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_0), 3);
+ pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, q256_3);
+ __m256i res_q =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_0), 3);
+ __m256i flat_p0q0 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ p0q0256 = _mm256_andnot_si256(flat256, ps0qs0256);
+ flat_p0q0 = _mm256_and_si256(flat256, flat_p0q0);
+ p0q0256 = _mm256_or_si256(flat_p0q0, p0q0256);
+ p0 = _mm256_castsi256_si128(p0q0256);
+ q0 = _mm256_extractf128_si256(p0q0256, 1);
+
+ // Derive p1 and q1
+ __m256i sum_p = _mm256_sub_epi16(p256_3, q256_2);
+ pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p);
+ res_p =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_1), 3);
+ __m256i sum_q = _mm256_sub_epi16(q256_3, p256_2);
+ pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q);
+ res_q =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_1), 3);
+ __m256i flat_p1q1 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ __m256i p1q1256 = _mm256_andnot_si256(flat256, ps1qs1256);
+ flat_p1q1 = _mm256_and_si256(flat256, flat_p1q1);
+ p1q1256 = _mm256_or_si256(flat_p1q1, p1q1256);
+ p1 = _mm256_castsi256_si128(p1q1256);
+ q1 = _mm256_extractf128_si256(p1q1256, 1);
+
+ // Derive p2 and q2
+ sum_p = _mm256_sub_epi16(p256_3, q256_1);
+ pixetFilter_p2p1p0 = _mm256_add_epi16(pixetFilter_p2p1p0, sum_p);
+ res_p =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_p2p1p0, p256_2), 3);
+ sum_q = _mm256_sub_epi16(q256_3, p256_1);
+ pixetFilter_q2q1q0 = _mm256_add_epi16(pixetFilter_q2q1q0, sum_q);
+ res_q =
+ _mm256_srli_epi16(_mm256_add_epi16(pixetFilter_q2q1q0, q256_2), 3);
+ __m256i flat_p2q2 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ p2q2256 = _mm256_andnot_si256(flat256, p2q2256);
+ flat_p2q2 = _mm256_and_si256(flat256, flat_p2q2);
+ p2q2256 = _mm256_or_si256(flat_p2q2, p2q2256);
+ p2 = _mm256_castsi256_si128(p2q2256);
+ q2 = _mm256_extractf128_si256(p2q2256, 1);
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
+ flat2_256 =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(flat2), flat2, 0x1);
+ p256_6 = _mm256_shuffle_epi8(p256_6, filter);
+ p256_5 = _mm256_shuffle_epi8(p256_5, filter);
+ p256_4 = _mm256_shuffle_epi8(p256_4, filter);
+ q256_4 = _mm256_shuffle_epi8(q256_4, filter);
+ q256_5 = _mm256_shuffle_epi8(q256_5, filter);
+ q256_6 = _mm256_shuffle_epi8(q256_6, filter);
+
+ __m256i pixelFilter_p =
+ _mm256_add_epi16(p256_5, _mm256_add_epi16(p256_4, p256_3));
+ __m256i pixelFilter_q =
+ _mm256_add_epi16(q256_5, _mm256_add_epi16(q256_4, q256_3));
+
+ pixelFilter_p = _mm256_add_epi16(pixelFilter_p, p2p1p0);
+ pixelFilter_q = _mm256_add_epi16(pixelFilter_q, q2q1q0);
+
+ pixelFilter_p = _mm256_add_epi16(pixelFilter_p, p256_0);
+ pixelFilter_q = _mm256_add_epi16(pixelFilter_q, q256_0);
+ pixelFilter_p = _mm256_add_epi16(
+ eight, _mm256_add_epi16(pixelFilter_p, pixelFilter_q));
+ pixelFilter_q = pixelFilter_p;
+
+ // Derive p0 and q0
+ pixelFilter_p =
+ _mm256_add_epi16(_mm256_add_epi16(p256_6, p256_1), pixelFilter_p);
+ res_p = _mm256_srli_epi16(pixelFilter_p, 4);
+ pixelFilter_q =
+ _mm256_add_epi16(_mm256_add_epi16(q256_6, q256_1), pixelFilter_q);
+ res_q = _mm256_srli_epi16(pixelFilter_q, 4);
+ __m256i flat2_p0q0 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ p0q0256 = _mm256_andnot_si256(flat2_256, p0q0256);
+ flat2_p0q0 = _mm256_and_si256(flat2_256, flat2_p0q0);
+ p0q0256 = _mm256_or_si256(flat2_p0q0, p0q0256);
+
+ p0 = _mm256_castsi256_si128(p0q0256);
+ q0 = _mm256_extractf128_si256(p0q0256, 1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
+
+ // Derive p1 and q1
+ sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_5),
+ _mm256_sub_epi16(p256_2, q256_0));
+ pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
+ res_p = _mm256_srli_epi16(pixelFilter_p, 4);
+ sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_5),
+ _mm256_sub_epi16(q256_2, p256_0));
+ pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
+ res_q = _mm256_srli_epi16(pixelFilter_q, 4);
+ __m256i flat2_p1q1 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ p1q1256 = _mm256_andnot_si256(flat2_256, p1q1256);
+ flat2_p1q1 = _mm256_and_si256(flat2_256, flat2_p1q1);
+ p1q1256 = _mm256_or_si256(flat2_p1q1, p1q1256);
+ p1 = _mm256_castsi256_si128(p1q1256);
+ q1 = _mm256_extractf128_si256(p1q1256, 1);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+
+ // Derive p2 and q2
+ sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_4),
+ _mm256_sub_epi16(p256_3, p256_0));
+ pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
+ res_p = _mm256_srli_epi16(pixelFilter_p, 4);
+ sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_4),
+ _mm256_sub_epi16(q256_3, q256_0));
+ pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
+ res_q = _mm256_srli_epi16(pixelFilter_q, 4);
+ __m256i flat2_p2q2 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ p2q2256 = _mm256_andnot_si256(flat2_256, p2q2256);
+ flat2_p2q2 = _mm256_and_si256(flat2_256, flat2_p2q2);
+ p2q2256 = _mm256_or_si256(flat2_p2q2, p2q2256);
+ p2 = _mm256_castsi256_si128(p2q2256);
+ q2 = _mm256_extractf128_si256(p2q2256, 1);
+ _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+ _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+
+ // Derive p3 and q3
+ sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_3),
+ _mm256_sub_epi16(p256_4, p256_1));
+ pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
+ res_p = _mm256_srli_epi16(pixelFilter_p, 4);
+ sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_3),
+ _mm256_sub_epi16(q256_4, q256_1));
+ pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
+ res_q = _mm256_srli_epi16(pixelFilter_q, 4);
+ __m256i flat2_p3q3 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ p3q3256 = _mm256_andnot_si256(flat2_256, p3q3256);
+ flat2_p3q3 = _mm256_and_si256(flat2_256, flat2_p3q3);
+ p3q3256 = _mm256_or_si256(flat2_p3q3, p3q3256);
+ p3 = _mm256_castsi256_si128(p3q3256);
+ q3 = _mm256_extractf128_si256(p3q3256, 1);
+ _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
+ _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
+
+ // Derive p4 and q4
+ sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_2),
+ _mm256_sub_epi16(p256_5, p256_2));
+ pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
+ res_p = _mm256_srli_epi16(pixelFilter_p, 4);
+ sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_2),
+ _mm256_sub_epi16(q256_5, q256_2));
+ pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
+ res_q = _mm256_srli_epi16(pixelFilter_q, 4);
+ __m256i flat2_p4q4 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ p4q4256 = _mm256_andnot_si256(flat2_256, p4q4256);
+ flat2_p4q4 = _mm256_and_si256(flat2_256, flat2_p4q4);
+ p4q4256 = _mm256_or_si256(flat2_p4q4, p4q4256);
+ _mm_storeu_si128((__m128i *)(s - 5 * p),
+ _mm256_castsi256_si128(p4q4256));
+ _mm_storeu_si128((__m128i *)(s + 4 * p),
+ _mm256_extractf128_si256(p4q4256, 1));
+
+ // Derive p5 and q5
+ sum_p = _mm256_add_epi16(_mm256_sub_epi16(p256_6, q256_1),
+ _mm256_sub_epi16(p256_6, p256_3));
+ pixelFilter_p = _mm256_add_epi16(pixelFilter_p, sum_p);
+ res_p = _mm256_srli_epi16(pixelFilter_p, 4);
+ sum_q = _mm256_add_epi16(_mm256_sub_epi16(q256_6, p256_1),
+ _mm256_sub_epi16(q256_6, q256_3));
+ pixelFilter_q = _mm256_add_epi16(pixelFilter_q, sum_q);
+ res_q = _mm256_srli_epi16(pixelFilter_q, 4);
+ __m256i flat2_p5q5 =
+ _mm256_permute4x64_epi64(_mm256_packus_epi16(res_p, res_q), 0xd8);
+ p5q5256 = _mm256_andnot_si256(flat2_256, p5q5256);
+ flat2_p5q5 = _mm256_and_si256(flat2_256, flat2_p5q5);
+ p5q5256 = _mm256_or_si256(flat2_p5q5, p5q5256);
+ _mm_storeu_si128((__m128i *)(s - 6 * p),
+ _mm256_castsi256_si128(p5q5256));
+ _mm_storeu_si128((__m128i *)(s + 5 * p),
+ _mm256_extractf128_si256(p5q5256, 1));
+ } else {
+ _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+ _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+ }
+ } else {
+ _mm_storeu_si128((__m128i *)(s - 2 * p), ps1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), ps0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), qs0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), qs1);
+ }
+ }
+}
+
+void aom_lpf_vertical_14_quad_avx2(unsigned char *s, int pitch,
+ const uint8_t *_blimit0,
+ const uint8_t *_limit0,
+ const uint8_t *_thresh0) {
+ DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
+
+ // Transpose 16x16
+ trans_store_16x16_lpf_vert14(s - 8, pitch, t_dst, 16, 1);
+
+ // Loop filtering
+ aom_lpf_horizontal_14_quad_avx2(t_dst + 8 * 16, 16, _blimit0, _limit0,
+ _thresh0);
+
+ // Transpose back
+ trans_store_16x16_lpf_vert14(t_dst, 16, s - 8, pitch, 0);
+}
diff --git a/third_party/aom/aom_dsp/x86/loopfilter_sse2.c b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
new file mode 100644
index 0000000000..cdf24c332a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/loopfilter_sse2.c
@@ -0,0 +1,2973 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_ports/mem.h"
+#include "aom_ports/emmintrin_compat.h"
+#include "aom_dsp/x86/lpf_common_sse2.h"
+
+static INLINE __m128i abs_diff(__m128i a, __m128i b) {
+ return _mm_or_si128(_mm_subs_epu8(a, b), _mm_subs_epu8(b, a));
+}
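+
+// A minimal scalar sketch of the trick used by abs_diff() above (illustrative
+// only, not used by the SIMD paths): for unsigned bytes, a saturating
+// subtraction yields 0 whenever the result would be negative, so OR-ing the
+// two one-sided differences gives |a - b|.
+static INLINE uint8_t scalar_abs_diff_sketch(uint8_t a, uint8_t b) {
+  const uint8_t d0 = (uint8_t)(a > b ? a - b : 0);  // a saturating-minus b
+  const uint8_t d1 = (uint8_t)(b > a ? b - a : 0);  // b saturating-minus a
+  return (uint8_t)(d0 | d1);  // at most one of d0, d1 is non-zero
+}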
+
+// This function treats its input as 2 parallel 8x4 matrices, transposes each
+// of them to 4x8 independently while flipping the second matrix horizontally.
+// Used to create the pq pairs for the 14-tap filter.
+static INLINE void transpose_pq_14_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+ __m128i *x3, __m128i *q0p0,
+ __m128i *q1p1, __m128i *q2p2,
+ __m128i *q3p3, __m128i *q4p4,
+ __m128i *q5p5, __m128i *q6p6,
+ __m128i *q7p7) {
+ __m128i w0, w1, ww0, ww1, w2, w3, ww2, ww3;
+ w0 = _mm_unpacklo_epi8(
+ *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ w2 = _mm_unpackhi_epi8(
+ *x0, *x1); // 08 18 09 19 010 110 011 111 012 112 013 113 014 114 015 115
+ w3 = _mm_unpackhi_epi8(
+ *x2, *x3); // 28 38 29 39 210 310 211 311 212 312 213 313 214 314 215 315
+
+ ww0 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ww1 = _mm_unpackhi_epi16(
+ w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ ww2 = _mm_unpacklo_epi16(
+ w2, w3); // 08 18 28 38 09 19 29 39 010 110 210 310 011 111 211 311
+ ww3 = _mm_unpackhi_epi16(
+ w2,
+ w3); // 012 112 212 312 013 113 213 313 014 114 214 314 015 115 215 315
+
+ *q7p7 = _mm_unpacklo_epi32(
+ ww0,
+ _mm_srli_si128(
+ ww3, 12)); // 00 10 20 30 015 115 215 315 xx xx xx xx xx xx xx xx
+ *q6p6 = _mm_unpackhi_epi32(
+ _mm_slli_si128(ww0, 4),
+      ww3);  // 01 11 21 31 014 114 214 314 xx xx xx xx xx xx xx xx
+ *q5p5 = _mm_unpackhi_epi32(
+ ww0,
+ _mm_slli_si128(
+          ww3, 4));  // 02 12 22 32 013 113 213 313 xx xx xx xx xx xx xx xx
+ *q4p4 = _mm_unpacklo_epi32(
+ _mm_srli_si128(ww0, 12),
+ ww3); // 03 13 23 33 012 112 212 312 xx xx xx xx xx xx xx xx
+ *q3p3 = _mm_unpacklo_epi32(
+ ww1,
+ _mm_srli_si128(
+ ww2, 12)); // 04 14 24 34 011 111 211 311 xx xx xx xx xx xx xx xx
+ *q2p2 = _mm_unpackhi_epi32(
+ _mm_slli_si128(ww1, 4),
+ ww2); // 05 15 25 35 010 110 210 310 xx xx xx xx xx xx xx xx
+ *q1p1 = _mm_unpackhi_epi32(
+ ww1,
+ _mm_slli_si128(
+ ww2, 4)); // 06 16 26 36 09 19 29 39 xx xx xx xx xx xx xx xx
+ *q0p0 = _mm_unpacklo_epi32(
+ _mm_srli_si128(ww1, 12),
+ ww2); // 07 17 27 37 08 18 28 38 xx xx xx xx xx xx xx xx
+}
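+
+// A scalar model of the byte mapping produced above (a sketch for
+// illustration; the in[][] layout is assumed to be 4 rows of 16 pixels with
+// p7..p0 in bytes 0..7 and q0..q7 in bytes 8..15, and only the meaningful
+// low 8 bytes of each output register are modeled):
+static INLINE void transpose_pq_14_scalar_sketch(const uint8_t in[4][16],
+                                                 uint8_t out_qkpk[8][8]) {
+  for (int k = 0; k < 8; ++k) {
+    for (int r = 0; r < 4; ++r) {
+      out_qkpk[k][r] = in[r][7 - k];      // p_k of rows 0..3 -> low 4 bytes
+      out_qkpk[k][4 + r] = in[r][8 + k];  // q_k of rows 0..3 -> next 4 bytes
+    }
+  }
+}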
+
+// This function treats its input as 2 parallel 8x4 matrices, transposes each
+// of them independently while flipping the second matrix horizontally. Used
+// for the inverse of the 14-tap filter pq pair transpose.
+static INLINE void transpose_pq_14_inv_sse2(__m128i *x0, __m128i *x1,
+ __m128i *x2, __m128i *x3,
+ __m128i *x4, __m128i *x5,
+ __m128i *x6, __m128i *x7,
+ __m128i *pq0, __m128i *pq1,
+ __m128i *pq2, __m128i *pq3) {
+ __m128i w10, w11, w12, w13;
+ __m128i w0, w1, w2, w3, w4, w5;
+ __m128i d0, d1, d2, d3;
+
+ w0 = _mm_unpacklo_epi8(
+ *x0, *x1); // p 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // p 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ w2 = _mm_unpacklo_epi8(
+ *x4, *x5); // p 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ w3 = _mm_unpacklo_epi8(
+ *x6, *x7); // p 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+
+ w4 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ w5 = _mm_unpacklo_epi16(
+ w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+
+ d0 = _mm_unpacklo_epi32(
+ w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ d2 = _mm_unpackhi_epi32(
+ w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+
+ w10 = _mm_unpacklo_epi8(
+ *x7, *x6); // q xx xx xx xx xx xx xx xx 00 10 01 11 02 12 03 13
+ w11 = _mm_unpacklo_epi8(
+ *x5, *x4); // q xx xx xx xx xx xx xx xx 20 30 21 31 22 32 23 33
+ w12 = _mm_unpacklo_epi8(
+ *x3, *x2); // q xx xx xx xx xx xx xx xx 40 50 41 51 42 52 43 53
+ w13 = _mm_unpacklo_epi8(
+ *x1, *x0); // q xx xx xx xx xx xx xx xx 60 70 61 71 62 72 63 73
+
+ w4 = _mm_unpackhi_epi16(
+ w10, w11); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ w5 = _mm_unpackhi_epi16(
+ w12, w13); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+
+ d1 = _mm_unpacklo_epi32(
+ w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ d3 = _mm_unpackhi_epi32(
+ w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+
+ *pq0 = _mm_unpacklo_epi64(d0, d1); // pq
+ *pq1 = _mm_unpackhi_epi64(d0, d1); // pq
+ *pq2 = _mm_unpacklo_epi64(d2, d3); // pq
+ *pq3 = _mm_unpackhi_epi64(d2, d3); // pq
+}
+
+static AOM_FORCE_INLINE void filter4_sse2(__m128i *p1p0, __m128i *q1q0,
+ __m128i *hev, __m128i *mask,
+ __m128i *qs1qs0, __m128i *ps1ps0) {
+ __m128i filter, filter2filter1, work;
+ __m128i ps1ps0_work, qs1qs0_work;
+ __m128i hev1;
+ const __m128i t3t4 =
+ _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 4, 4, 4, 4);
+ const __m128i t80 = _mm_set1_epi8((char)0x80);
+ const __m128i ff = _mm_cmpeq_epi8(t80, t80);
+
+ ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */
+ qs1qs0_work = _mm_xor_si128(*q1q0, t80);
+
+ /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */
+ work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work);
+ filter = _mm_and_si128(_mm_srli_si128(work, 4), *hev);
+ /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */
+ filter = _mm_subs_epi8(filter, work);
+ filter = _mm_subs_epi8(filter, work);
+ filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */
+ filter = _mm_and_si128(filter, *mask); /* & mask */
+ filter = _mm_unpacklo_epi32(filter, filter);
+
+ /* filter1 = signed_char_clamp(filter + 4) >> 3; */
+ /* filter2 = signed_char_clamp(filter + 3) >> 3; */
+ filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */
+ filter2filter1 =
+ _mm_unpacklo_epi8(filter2filter1, filter2filter1); // goto 16 bit
+ filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */
+ filter2filter1 = _mm_packs_epi16(filter2filter1, filter2filter1);
+
+ /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */
+ filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */
+ filter = _mm_unpacklo_epi8(filter, filter); // goto 16 bit
+ filter = _mm_srai_epi16(filter, 9); /* round */
+ filter = _mm_packs_epi16(filter, filter);
+ filter = _mm_andnot_si128(*hev, filter);
+ filter = _mm_unpacklo_epi32(filter, filter);
+
+ filter2filter1 = _mm_unpacklo_epi32(filter2filter1, filter);
+ hev1 = _mm_srli_si128(filter2filter1, 8);
+ /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */
+ qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1);
+ /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */
+ ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1);
+
+ *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */
+ *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */
+}
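+
+// A scalar sketch of the filter4 arithmetic above (illustrative only; the
+// names are hypothetical and the C reference lives in aom_dsp/loopfilter.c).
+// Pixels are assumed to be in the signed domain already (^ 0x80), and hev and
+// mask are 0 / -1 flags. The byte-duplicating unpacks followed by >> 11 and
+// >> 9 above are how the SIMD code realizes the scalar >> 3 and
+// ROUND_POWER_OF_TWO(filter1, 1).
+static INLINE int8_t signed_char_clamp_sketch(int t) {
+  return (int8_t)(t < -128 ? -128 : (t > 127 ? 127 : t));
+}
+
+static INLINE void filter4_scalar_sketch(int8_t *ps1, int8_t *ps0, int8_t *qs0,
+                                         int8_t *qs1, int hev, int mask) {
+  int8_t filter = (int8_t)(signed_char_clamp_sketch(*ps1 - *qs1) & hev);
+  filter =
+      (int8_t)(signed_char_clamp_sketch(filter + 3 * (*qs0 - *ps0)) & mask);
+  const int8_t filter1 = (int8_t)(signed_char_clamp_sketch(filter + 4) >> 3);
+  const int8_t filter2 = (int8_t)(signed_char_clamp_sketch(filter + 3) >> 3);
+  *qs0 = signed_char_clamp_sketch(*qs0 - filter1);
+  *ps0 = signed_char_clamp_sketch(*ps0 + filter2);
+  // The outer taps receive half of filter1, rounded, and only where hev is 0.
+  filter = (int8_t)(((filter1 + 1) >> 1) & ~hev);
+  *qs1 = signed_char_clamp_sketch(*qs1 - filter);
+  *ps1 = signed_char_clamp_sketch(*ps1 + filter);
+}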
+
+static AOM_FORCE_INLINE void filter4_dual_sse2(__m128i *p1p0, __m128i *q1q0,
+ __m128i *hev, __m128i *mask,
+ __m128i *qs1qs0,
+ __m128i *ps1ps0) {
+ const __m128i t3t4 =
+ _mm_set_epi8(3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4);
+ const __m128i t80 = _mm_set1_epi8((char)0x80);
+ __m128i filter, filter2filter1, work;
+ __m128i ps1ps0_work, qs1qs0_work;
+ __m128i hev1;
+ const __m128i ff = _mm_cmpeq_epi8(t80, t80);
+
+ ps1ps0_work = _mm_xor_si128(*p1p0, t80); /* ^ 0x80 */
+ qs1qs0_work = _mm_xor_si128(*q1q0, t80);
+
+ /* int8_t filter = signed_char_clamp(ps1 - qs1) & hev; */
+ work = _mm_subs_epi8(ps1ps0_work, qs1qs0_work);
+ filter = _mm_and_si128(_mm_srli_si128(work, 8), *hev);
+ /* filter = signed_char_clamp(filter + 3 * (qs0 - ps0)) & mask; */
+ filter = _mm_subs_epi8(filter, work);
+ filter = _mm_subs_epi8(filter, work);
+ filter = _mm_subs_epi8(filter, work); /* + 3 * (qs0 - ps0) */
+ filter = _mm_and_si128(filter, *mask); /* & mask */
+ filter = _mm_unpacklo_epi64(filter, filter);
+
+ /* filter1 = signed_char_clamp(filter + 4) >> 3; */
+ /* filter2 = signed_char_clamp(filter + 3) >> 3; */
+ filter2filter1 = _mm_adds_epi8(filter, t3t4); /* signed_char_clamp */
+ filter = _mm_unpackhi_epi8(filter2filter1, filter2filter1);
+ filter2filter1 = _mm_unpacklo_epi8(filter2filter1, filter2filter1);
+ filter2filter1 = _mm_srai_epi16(filter2filter1, 11); /* >> 3 */
+ filter = _mm_srai_epi16(filter, 11); /* >> 3 */
+ filter2filter1 = _mm_packs_epi16(filter2filter1, filter);
+
+ /* filter = ROUND_POWER_OF_TWO(filter1, 1) & ~hev; */
+ filter = _mm_subs_epi8(filter2filter1, ff); /* + 1 */
+ filter = _mm_unpacklo_epi8(filter, filter);
+ filter = _mm_srai_epi16(filter, 9); /* round */
+ filter = _mm_packs_epi16(filter, filter);
+ filter = _mm_andnot_si128(*hev, filter);
+
+ hev1 = _mm_unpackhi_epi64(filter2filter1, filter);
+ filter2filter1 = _mm_unpacklo_epi64(filter2filter1, filter);
+
+ /* signed_char_clamp(qs1 - filter), signed_char_clamp(qs0 - filter1) */
+ qs1qs0_work = _mm_subs_epi8(qs1qs0_work, filter2filter1);
+ /* signed_char_clamp(ps1 + filter), signed_char_clamp(ps0 + filter2) */
+ ps1ps0_work = _mm_adds_epi8(ps1ps0_work, hev1);
+ *qs1qs0 = _mm_xor_si128(qs1qs0_work, t80); /* ^ 0x80 */
+ *ps1ps0 = _mm_xor_si128(ps1ps0_work, t80); /* ^ 0x80 */
+}
+
+static AOM_FORCE_INLINE void lpf_internal_4_sse2(
+ __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit,
+ __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) {
+ __m128i q1p1, q0p0, p1p0, q1q0;
+ __m128i abs_p0q0, abs_p1q1;
+ __m128i mask, flat, hev;
+ const __m128i zero = _mm_setzero_si128();
+
+ q1p1 = _mm_unpacklo_epi32(*p1, *q1);
+ q0p0 = _mm_unpacklo_epi32(*p0, *q0);
+
+ p1p0 = _mm_unpacklo_epi32(q0p0, q1p1);
+ q1q0 = _mm_srli_si128(p1p0, 8);
+
+  /* (abs(q1 - q0), abs(p1 - p0)) */
+ flat = abs_diff(q1p1, q0p0);
+ /* abs(p1 - q1), abs(p0 - q0) */
+ __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);
+
+ /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
+ hev = _mm_unpacklo_epi8(flat, zero);
+
+ hev = _mm_cmpgt_epi16(hev, *thresh);
+ hev = _mm_packs_epi16(hev, hev);
+ hev = _mm_unpacklo_epi32(hev, hev);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */
+ abs_p1q1 = _mm_srli_si128(abs_p1q1p0q0, 4); /* abs(p1 - q1) */
+ abs_p1q1 = _mm_unpacklo_epi8(abs_p1q1, abs_p1q1);
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);
+ abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */
+ /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */
+
+ mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);
+ mask = _mm_unpacklo_epi32(mask, flat);
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ mask = _mm_and_si128(mask, _mm_srli_si128(mask, 4));
+
+ filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
+}
+
+static AOM_FORCE_INLINE void lpf_internal_4_dual_sse2(
+ __m128i *p1, __m128i *p0, __m128i *q0, __m128i *q1, __m128i *limit,
+ __m128i *thresh, __m128i *q1q0_out, __m128i *p1p0_out) {
+ __m128i q1p1, q0p0, p1p0, q1q0;
+ __m128i abs_p0q0, abs_p1q1;
+ __m128i mask, hev;
+ const __m128i zero = _mm_setzero_si128();
+
+ q1p1 = _mm_unpacklo_epi64(*p1, *q1);
+ q0p0 = _mm_unpacklo_epi64(*p0, *q0);
+
+ p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+ q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
+
+  /* (abs(q1 - q0), abs(p1 - p0)) */
+ __m128i flat = abs_diff(q1p1, q0p0);
+ /* abs(p1 - q1), abs(p0 - q0) */
+ const __m128i abs_p1q1p0q0 = abs_diff(p1p0, q1q0);
+
+ /* const uint8_t hev = hev_mask(thresh, *op1, *op0, *oq0, *oq1); */
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ hev = _mm_unpacklo_epi8(flat, zero);
+
+ hev = _mm_cmpgt_epi16(hev, *thresh);
+ hev = _mm_packs_epi16(hev, hev);
+
+ /* const int8_t mask = filter_mask2(*limit, *blimit, */
+ /* p1, p0, q0, q1); */
+ abs_p0q0 = _mm_adds_epu8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p0 - q0) * 2 */
+ abs_p1q1 = _mm_unpackhi_epi8(abs_p1q1p0q0, abs_p1q1p0q0); /* abs(p1 - q1) */
+ abs_p1q1 = _mm_srli_epi16(abs_p1q1, 9);
+ abs_p1q1 = _mm_packs_epi16(abs_p1q1, abs_p1q1); /* abs(p1 - q1) / 2 */
+ /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */
+ mask = _mm_adds_epu8(abs_p0q0, abs_p1q1);
+ mask = _mm_unpacklo_epi64(mask, flat);
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ mask = _mm_and_si128(mask, _mm_srli_si128(mask, 8));
+
+ filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
+}
+
+void aom_lpf_horizontal_4_sse2(uint8_t *s, int p /* pitch */,
+ const uint8_t *_blimit, const uint8_t *_limit,
+ const uint8_t *_thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit),
+ _mm_loadl_epi64((const __m128i *)_limit));
+ __m128i thresh =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
+
+ __m128i qs1qs0, ps1ps0;
+ __m128i p1, p0, q0, q1;
+
+ p1 = xx_loadl_32(s - 2 * p);
+ p0 = xx_loadl_32(s - 1 * p);
+ q0 = xx_loadl_32(s - 0 * p);
+ q1 = xx_loadl_32(s + 1 * p);
+
+ lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &qs1qs0, &ps1ps0);
+
+ xx_storel_32(s - 1 * p, ps1ps0);
+ xx_storel_32(s - 2 * p, _mm_srli_si128(ps1ps0, 4));
+ xx_storel_32(s + 0 * p, qs1qs0);
+ xx_storel_32(s + 1 * p, _mm_srli_si128(qs1qs0, 4));
+}
+
+void aom_lpf_vertical_4_sse2(uint8_t *s, int p /* pitch */,
+ const uint8_t *_blimit, const uint8_t *_limit,
+ const uint8_t *_thresh) {
+ __m128i p1p0, q1q0;
+ __m128i p1, p0, q0, q1;
+
+ const __m128i zero = _mm_setzero_si128();
+ __m128i limit = _mm_unpacklo_epi32(_mm_loadl_epi64((const __m128i *)_blimit),
+ _mm_loadl_epi64((const __m128i *)_limit));
+ __m128i thresh =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh), zero);
+
+ __m128i x0, x1, x2, x3;
+ __m128i d0, d1, d2, d3;
+ x0 = _mm_loadl_epi64((__m128i *)(s - 2 + 0 * p));
+ x1 = _mm_loadl_epi64((__m128i *)(s - 2 + 1 * p));
+ x2 = _mm_loadl_epi64((__m128i *)(s - 2 + 2 * p));
+ x3 = _mm_loadl_epi64((__m128i *)(s - 2 + 3 * p));
+
+ transpose4x8_8x4_low_sse2(&x0, &x1, &x2, &x3, &p1, &p0, &q0, &q1);
+
+ lpf_internal_4_sse2(&p1, &p0, &q0, &q1, &limit, &thresh, &q1q0, &p1p0);
+
+ // Transpose 8x4 to 4x8
+ p1 = _mm_srli_si128(p1p0, 4);
+ q1 = _mm_srli_si128(q1q0, 4);
+
+ transpose4x8_8x4_low_sse2(&p1, &p1p0, &q1q0, &q1, &d0, &d1, &d2, &d3);
+
+ xx_storel_32(s + 0 * p - 2, d0);
+ xx_storel_32(s + 1 * p - 2, d1);
+ xx_storel_32(s + 2 * p - 2, d2);
+ xx_storel_32(s + 3 * p - 2, d3);
+}
+
+static INLINE void store_buffer_horz_8(__m128i x, int p, int num, uint8_t *s) {
+ xx_storel_32(s - (num + 1) * p, x);
+ xx_storel_32(s + num * p, _mm_srli_si128(x, 4));
+}
+
+static AOM_FORCE_INLINE void lpf_internal_14_dual_sse2(
+ __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2,
+ __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit,
+ __m128i *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i mask, hev, flat, flat2;
+ __m128i qs0ps0, qs1ps1;
+ __m128i p1p0, q1q0, qs1qs0, ps1ps0;
+ __m128i abs_p1p0;
+
+ p1p0 = _mm_unpacklo_epi64(*q0p0, *q1p1);
+ q1q0 = _mm_unpackhi_epi64(*q0p0, *q1p1);
+
+ {
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0;
+ __m128i fe, ff, work;
+ abs_p1p0 = abs_diff(*q1p1, *q0p0);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+ fe = _mm_set1_epi8((char)0xfe);
+ ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
+ abs_p0q0 = abs_diff(p1p0, q1q0);
+ abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
+ abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, *thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+ // replicate for the further "merged variables" usage
+ hev = _mm_unpacklo_epi64(hev, hev);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
+ qs0ps0 = _mm_unpacklo_epi64(ps1ps0, qs1qs0);
+ qs1ps1 = _mm_unpackhi_epi64(ps1ps0, qs1qs0);
+ // loopfilter done
+
+ __m128i flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
+ __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;
+
+ __m128i work;
+ flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+  // If flat == 0 then flat2 is zero as well, so none of the calculations
+  // below are needed. With SSE4.1: if (0 == _mm_test_all_zeros(flat, ff))
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
+ __m128i q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+ __m128i pixelFilter_p, pixelFilter_q;
+ __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
+ __m128i sum_p6, sum_q6;
+ __m128i sum_p3, sum_q3, res_p, res_q;
+
+ p6_16 = _mm_unpacklo_epi8(*q6p6, zero);
+ p5_16 = _mm_unpacklo_epi8(*q5p5, zero);
+ p4_16 = _mm_unpacklo_epi8(*q4p4, zero);
+ p3_16 = _mm_unpacklo_epi8(*q3p3, zero);
+ p2_16 = _mm_unpacklo_epi8(*q2p2, zero);
+ p1_16 = _mm_unpacklo_epi8(*q1p1, zero);
+ p0_16 = _mm_unpacklo_epi8(*q0p0, zero);
+ q0_16 = _mm_unpackhi_epi8(*q0p0, zero);
+ q1_16 = _mm_unpackhi_epi8(*q1p1, zero);
+ q2_16 = _mm_unpackhi_epi8(*q2p2, zero);
+ q3_16 = _mm_unpackhi_epi8(*q3p3, zero);
+ q4_16 = _mm_unpackhi_epi8(*q4p4, zero);
+ q5_16 = _mm_unpackhi_epi8(*q5p5, zero);
+ q6_16 = _mm_unpackhi_epi8(*q6p6, zero);
+ pixelFilter_p = _mm_add_epi16(p5_16, _mm_add_epi16(p4_16, p3_16));
+ pixelFilter_q = _mm_add_epi16(q5_16, _mm_add_epi16(q4_16, q3_16));
+
+ pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
+ pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);
+
+ pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
+ pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
+ pixelFilter_p =
+ _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
+ pixetFilter_p2p1p0 = _mm_add_epi16(
+ four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p,
+ _mm_add_epi16(_mm_add_epi16(p6_16, p0_16),
+ _mm_add_epi16(p1_16, q0_16))),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixelFilter_p,
+ _mm_add_epi16(_mm_add_epi16(q6_16, q0_16),
+ _mm_add_epi16(p0_16, q1_16))),
+ 4);
+ flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);
+
+ flat_q0p0 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p6 = _mm_add_epi16(p6_16, p6_16);
+ sum_q6 = _mm_add_epi16(q6_16, q6_16);
+ sum_p3 = _mm_add_epi16(p3_16, p3_16);
+ sum_q3 = _mm_add_epi16(q3_16, q3_16);
+
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p5_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p1_16, _mm_add_epi16(p2_16, p0_16)))),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q1_16, _mm_add_epi16(q0_16, q2_16)))),
+ 4);
+ flat2_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
+ flat_q1p1 = _mm_packus_epi16(res_p, res_q);
+
+ pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
+ pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);
+
+ sum_p3 = _mm_add_epi16(sum_p3, p3_16);
+ sum_q3 = _mm_add_epi16(sum_q3, q3_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
+ flat_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ // work with flat2
+ flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
+ work = abs_diff(*q6p6, *q0p0);
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+
+ // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ flat = _mm_unpacklo_epi64(flat, flat);
+ *q2p2 = _mm_andnot_si128(flat, *q2p2);
+ flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
+ *q2p2 = _mm_or_si128(*q2p2, flat_q2p2);
+
+ qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+ flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
+ *q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);
+
+ qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+ flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
+ *q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);
+
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
+
+ sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+ sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p2_16, _mm_add_epi16(p3_16, p1_16)))),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q2_16, _mm_add_epi16(q1_16, q3_16)))),
+ 4);
+ flat2_q2p2 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+ sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p3_16, _mm_add_epi16(p4_16, p2_16)))),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q3_16, _mm_add_epi16(q2_16, q4_16)))),
+ 4);
+ flat2_q3p3 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+ sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p4_16, _mm_add_epi16(p5_16, p3_16)))),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q4_16, _mm_add_epi16(q3_16, q5_16)))),
+ 4);
+ flat2_q4p4 = _mm_packus_epi16(res_p, res_q);
+
+ sum_p6 = _mm_add_epi16(sum_p6, p6_16);
+ sum_q6 = _mm_add_epi16(sum_q6, q6_16);
+ pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
+ pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
+
+ res_p = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_p,
+ _mm_add_epi16(sum_p6,
+ _mm_add_epi16(p5_16, _mm_add_epi16(p6_16, p4_16)))),
+ 4);
+ res_q = _mm_srli_epi16(
+ _mm_add_epi16(
+ pixelFilter_q,
+ _mm_add_epi16(sum_q6,
+ _mm_add_epi16(q5_16, _mm_add_epi16(q6_16, q4_16)))),
+ 4);
+ flat2_q5p5 = _mm_packus_epi16(res_p, res_q);
+
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ flat2 = _mm_unpacklo_epi64(flat2, flat2);
+
+ *q5p5 = _mm_andnot_si128(flat2, *q5p5);
+ flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
+ *q5p5 = _mm_or_si128(*q5p5, flat2_q5p5);
+
+ *q4p4 = _mm_andnot_si128(flat2, *q4p4);
+ flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
+ *q4p4 = _mm_or_si128(*q4p4, flat2_q4p4);
+
+ *q3p3 = _mm_andnot_si128(flat2, *q3p3);
+ flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
+ *q3p3 = _mm_or_si128(*q3p3, flat2_q3p3);
+
+ *q2p2 = _mm_andnot_si128(flat2, *q2p2);
+ flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
+ *q2p2 = _mm_or_si128(*q2p2, flat2_q2p2);
+
+ *q1p1 = _mm_andnot_si128(flat2, *q1p1);
+ flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
+ *q1p1 = _mm_or_si128(*q1p1, flat2_q1p1);
+
+ *q0p0 = _mm_andnot_si128(flat2, *q0p0);
+ flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
+ *q0p0 = _mm_or_si128(*q0p0, flat2_q0p0);
+ }
+ } else {
+ *q0p0 = qs0ps0;
+ *q1p1 = qs1ps1;
+ }
+}
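+
+// A scalar sketch of the three per-position decisions made above
+// (illustrative only; p[0..6] / q[0..6] are assumed to hold p0..p6 / q0..q6,
+// and the constant 1 is the flat/flat2 threshold used by the code):
+static INLINE int lpf_sketch_absdiff(int a, int b) {
+  return a > b ? a - b : b - a;
+}
+
+static INLINE void lpf_14_decisions_sketch(const uint8_t *p, const uint8_t *q,
+                                           int blimit, int limit, int *mask,
+                                           int *flat, int *flat2) {
+  int m = 0, f = 0, f2 = 0;
+  for (int i = 1; i <= 6; ++i) {
+    const int dp = lpf_sketch_absdiff(p[i], p[0]);
+    const int dq = lpf_sketch_absdiff(q[i], q[0]);
+    if (i <= 3) {
+      // Step-to-step activity feeding the filter mask.
+      const int sp = lpf_sketch_absdiff(p[i], p[i - 1]);
+      const int sq = lpf_sketch_absdiff(q[i], q[i - 1]);
+      if (sp > m) m = sp;
+      if (sq > m) m = sq;
+      // Distance to p0/q0 feeding the narrow "flat" test.
+      if (dp > f) f = dp;
+      if (dq > f) f = dq;
+    } else {
+      // Wider neighborhood feeding the "flat2" (14-tap) test.
+      if (dp > f2) f2 = dp;
+      if (dq > f2) f2 = dq;
+    }
+  }
+  *mask = (lpf_sketch_absdiff(p[0], q[0]) * 2 +
+               lpf_sketch_absdiff(p[1], q[1]) / 2 <=
+           blimit) &&
+          (m <= limit);
+  *flat = *mask && (f <= 1);
+  *flat2 = *flat && (f2 <= 1);
+}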
+
+static AOM_FORCE_INLINE void lpf_internal_14_sse2(
+ __m128i *q6p6, __m128i *q5p5, __m128i *q4p4, __m128i *q3p3, __m128i *q2p2,
+ __m128i *q1p1, __m128i *q0p0, __m128i *blimit, __m128i *limit,
+ __m128i *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi8(1);
+ __m128i mask, hev, flat, flat2;
+ __m128i flat2_pq[6], flat_pq[3];
+ __m128i qs0ps0, qs1ps1;
+ __m128i p1p0, q1q0, qs1qs0, ps1ps0;
+ __m128i abs_p1p0;
+
+ p1p0 = _mm_unpacklo_epi32(*q0p0, *q1p1);
+ q1q0 = _mm_srli_si128(p1p0, 8);
+
+ __m128i fe, ff, work;
+ {
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0;
+ abs_p1p0 = abs_diff(*q1p1, *q0p0);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
+ fe = _mm_set1_epi8((char)0xfe);
+ ff = _mm_cmpeq_epi8(fe, fe);
+ abs_p0q0 = abs_diff(p1p0, q1q0);
+ abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+ hev = _mm_subs_epu8(flat, *thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+ // replicate for the further "merged variables" usage
+ hev = _mm_unpacklo_epi32(hev, hev);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+ mask = _mm_unpacklo_epi32(mask, zero);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(abs_diff(*q2p2, *q1p1), abs_diff(*q3p3, *q2p2));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_sse2(&p1p0, &q1q0, &hev, &mask, &qs1qs0, &ps1ps0);
+ qs0ps0 = _mm_unpacklo_epi32(ps1ps0, qs1qs0);
+ qs1ps1 = _mm_srli_si128(qs0ps0, 8);
+ // loopfilter done
+
+ flat = _mm_max_epu8(abs_diff(*q2p2, *q0p0), abs_diff(*q3p3, *q0p0));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ flat = _mm_unpacklo_epi32(flat, flat);
+ flat = _mm_unpacklo_epi64(flat, flat);
+
+  // If flat == 0 then flat2 is zero as well, so none of the calculations
+  // below are needed. With SSE4.1: if (0 == _mm_test_all_zeros(flat, ff))
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+ __m128i q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
+ __m128i pq_16[7];
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i sum_p6;
+ __m128i sum_p3;
+
+ pq_16[0] = _mm_unpacklo_epi8(*q0p0, zero);
+ pq_16[1] = _mm_unpacklo_epi8(*q1p1, zero);
+ pq_16[2] = _mm_unpacklo_epi8(*q2p2, zero);
+ pq_16[3] = _mm_unpacklo_epi8(*q3p3, zero);
+ pq_16[4] = _mm_unpacklo_epi8(*q4p4, zero);
+ pq_16[5] = _mm_unpacklo_epi8(*q5p5, zero);
+ pq_16[6] = _mm_unpacklo_epi8(*q6p6, zero);
+ q0_16 = _mm_srli_si128(pq_16[0], 8);
+ q1_16 = _mm_srli_si128(pq_16[1], 8);
+ q2_16 = _mm_srli_si128(pq_16[2], 8);
+ q3_16 = _mm_srli_si128(pq_16[3], 8);
+ q4_16 = _mm_srli_si128(pq_16[4], 8);
+ q5_16 = _mm_srli_si128(pq_16[5], 8);
+
+ __m128i flat_p[3], flat_q[3];
+ __m128i flat2_p[6], flat2_q[6];
+
+ __m128i work0, work0_0, work0_1, sum_p_0;
+ __m128i sum_p = _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[4], pq_16[3]));
+ __m128i sum_lp = _mm_add_epi16(pq_16[0], _mm_add_epi16(pq_16[2], pq_16[1]));
+ sum_p = _mm_add_epi16(sum_p, sum_lp);
+
+ __m128i sum_lq = _mm_srli_si128(sum_lp, 8);
+ __m128i sum_q = _mm_srli_si128(sum_p, 8);
+
+ sum_p_0 = _mm_add_epi16(eight, _mm_add_epi16(sum_p, sum_q));
+ sum_lp = _mm_add_epi16(four, _mm_add_epi16(sum_lp, sum_lq));
+
+ flat_p[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(pq_16[3], pq_16[0]));
+ flat_q[0] = _mm_add_epi16(sum_lp, _mm_add_epi16(q3_16, q0_16));
+
+ sum_p6 = _mm_add_epi16(pq_16[6], pq_16[6]);
+ sum_p3 = _mm_add_epi16(pq_16[3], pq_16[3]);
+
+ sum_q = _mm_sub_epi16(sum_p_0, pq_16[5]);
+ sum_p = _mm_sub_epi16(sum_p_0, q5_16);
+
+ work0_0 = _mm_add_epi16(_mm_add_epi16(pq_16[6], pq_16[0]), pq_16[1]);
+ work0_1 = _mm_add_epi16(
+ sum_p6, _mm_add_epi16(pq_16[1], _mm_add_epi16(pq_16[2], pq_16[0])));
+
+ sum_lq = _mm_sub_epi16(sum_lp, pq_16[2]);
+ sum_lp = _mm_sub_epi16(sum_lp, q2_16);
+
+ work0 = _mm_add_epi16(sum_p3, pq_16[1]);
+ flat_p[1] = _mm_add_epi16(sum_lp, work0);
+ flat_q[1] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
+
+ flat_pq[0] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[0], flat_q[0]), 3);
+ flat_pq[1] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[1], flat_q[1]), 3);
+ flat_pq[0] = _mm_packus_epi16(flat_pq[0], flat_pq[0]);
+ flat_pq[1] = _mm_packus_epi16(flat_pq[1], flat_pq[1]);
+
+ sum_lp = _mm_sub_epi16(sum_lp, q1_16);
+ sum_lq = _mm_sub_epi16(sum_lq, pq_16[1]);
+
+ sum_p3 = _mm_add_epi16(sum_p3, pq_16[3]);
+ work0 = _mm_add_epi16(sum_p3, pq_16[2]);
+
+ flat_p[2] = _mm_add_epi16(sum_lp, work0);
+ flat_q[2] = _mm_add_epi16(sum_lq, _mm_srli_si128(work0, 8));
+ flat_pq[2] = _mm_srli_epi16(_mm_unpacklo_epi64(flat_p[2], flat_q[2]), 3);
+ flat_pq[2] = _mm_packus_epi16(flat_pq[2], flat_pq[2]);
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~ flat 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ flat2 = _mm_max_epu8(abs_diff(*q4p4, *q0p0), abs_diff(*q5p5, *q0p0));
+
+ work = abs_diff(*q6p6, *q0p0);
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 4));
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ flat2 = _mm_unpacklo_epi32(flat2, flat2);
+
+ // ~~~~~~~~~~ apply flat ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
+ flat_pq[0] = _mm_and_si128(flat, flat_pq[0]);
+ *q0p0 = _mm_or_si128(qs0ps0, flat_pq[0]);
+
+ qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
+ flat_pq[1] = _mm_and_si128(flat, flat_pq[1]);
+ *q1p1 = _mm_or_si128(qs1ps1, flat_pq[1]);
+
+ *q2p2 = _mm_andnot_si128(flat, *q2p2);
+ flat_pq[2] = _mm_and_si128(flat, flat_pq[2]);
+ *q2p2 = _mm_or_si128(*q2p2, flat_pq[2]);
+
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
+ flat2_p[0] = _mm_add_epi16(sum_p_0, _mm_add_epi16(work0_0, q0_16));
+ flat2_q[0] = _mm_add_epi16(
+ sum_p_0, _mm_add_epi16(_mm_srli_si128(work0_0, 8), pq_16[0]));
+
+ flat2_p[1] = _mm_add_epi16(sum_p, work0_1);
+ flat2_q[1] = _mm_add_epi16(sum_q, _mm_srli_si128(work0_1, 8));
+
+ flat2_pq[0] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[0], flat2_q[0]), 4);
+ flat2_pq[1] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[1], flat2_q[1]), 4);
+ flat2_pq[0] = _mm_packus_epi16(flat2_pq[0], flat2_pq[0]);
+ flat2_pq[1] = _mm_packus_epi16(flat2_pq[1], flat2_pq[1]);
+
+ sum_p = _mm_sub_epi16(sum_p, q4_16);
+ sum_q = _mm_sub_epi16(sum_q, pq_16[4]);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
+ work0 = _mm_add_epi16(
+ sum_p6, _mm_add_epi16(pq_16[2], _mm_add_epi16(pq_16[3], pq_16[1])));
+ flat2_p[2] = _mm_add_epi16(sum_p, work0);
+ flat2_q[2] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[2] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[2], flat2_q[2]), 4);
+ flat2_pq[2] = _mm_packus_epi16(flat2_pq[2], flat2_pq[2]);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
+ sum_p = _mm_sub_epi16(sum_p, q3_16);
+ sum_q = _mm_sub_epi16(sum_q, pq_16[3]);
+
+ work0 = _mm_add_epi16(
+ sum_p6, _mm_add_epi16(pq_16[3], _mm_add_epi16(pq_16[4], pq_16[2])));
+ flat2_p[3] = _mm_add_epi16(sum_p, work0);
+ flat2_q[3] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[3] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[3], flat2_q[3]), 4);
+ flat2_pq[3] = _mm_packus_epi16(flat2_pq[3], flat2_pq[3]);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
+ sum_p = _mm_sub_epi16(sum_p, q2_16);
+ sum_q = _mm_sub_epi16(sum_q, pq_16[2]);
+
+ work0 = _mm_add_epi16(
+ sum_p6, _mm_add_epi16(pq_16[4], _mm_add_epi16(pq_16[5], pq_16[3])));
+ flat2_p[4] = _mm_add_epi16(sum_p, work0);
+ flat2_q[4] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[4] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[4], flat2_q[4]), 4);
+ flat2_pq[4] = _mm_packus_epi16(flat2_pq[4], flat2_pq[4]);
+
+ sum_p6 = _mm_add_epi16(sum_p6, pq_16[6]);
+ sum_p = _mm_sub_epi16(sum_p, q1_16);
+ sum_q = _mm_sub_epi16(sum_q, pq_16[1]);
+
+ work0 = _mm_add_epi16(
+ sum_p6, _mm_add_epi16(pq_16[5], _mm_add_epi16(pq_16[6], pq_16[4])));
+ flat2_p[5] = _mm_add_epi16(sum_p, work0);
+ flat2_q[5] = _mm_add_epi16(sum_q, _mm_srli_si128(work0, 8));
+ flat2_pq[5] =
+ _mm_srli_epi16(_mm_unpacklo_epi64(flat2_p[5], flat2_q[5]), 4);
+ flat2_pq[5] = _mm_packus_epi16(flat2_pq[5], flat2_pq[5]);
+
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ *q0p0 = _mm_andnot_si128(flat2, *q0p0);
+ flat2_pq[0] = _mm_and_si128(flat2, flat2_pq[0]);
+ *q0p0 = _mm_or_si128(*q0p0, flat2_pq[0]);
+
+ *q1p1 = _mm_andnot_si128(flat2, *q1p1);
+ flat2_pq[1] = _mm_and_si128(flat2, flat2_pq[1]);
+ *q1p1 = _mm_or_si128(*q1p1, flat2_pq[1]);
+
+ *q2p2 = _mm_andnot_si128(flat2, *q2p2);
+ flat2_pq[2] = _mm_and_si128(flat2, flat2_pq[2]);
+ *q2p2 = _mm_or_si128(*q2p2, flat2_pq[2]);
+
+ *q3p3 = _mm_andnot_si128(flat2, *q3p3);
+ flat2_pq[3] = _mm_and_si128(flat2, flat2_pq[3]);
+ *q3p3 = _mm_or_si128(*q3p3, flat2_pq[3]);
+
+ *q4p4 = _mm_andnot_si128(flat2, *q4p4);
+ flat2_pq[4] = _mm_and_si128(flat2, flat2_pq[4]);
+ *q4p4 = _mm_or_si128(*q4p4, flat2_pq[4]);
+
+ *q5p5 = _mm_andnot_si128(flat2, *q5p5);
+ flat2_pq[5] = _mm_and_si128(flat2, flat2_pq[5]);
+ *q5p5 = _mm_or_si128(*q5p5, flat2_pq[5]);
+ }
+ } else {
+ *q0p0 = qs0ps0;
+ *q1p1 = qs1ps1;
+ }
+}
+
+void aom_lpf_horizontal_14_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit,
+ const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
+ __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+ __m128i limit = _mm_load_si128((const __m128i *)_limit);
+ __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+
+ q4p4 = _mm_unpacklo_epi32(xx_loadl_32(s - 5 * p), xx_loadl_32(s + 4 * p));
+ q3p3 = _mm_unpacklo_epi32(xx_loadl_32(s - 4 * p), xx_loadl_32(s + 3 * p));
+ q2p2 = _mm_unpacklo_epi32(xx_loadl_32(s - 3 * p), xx_loadl_32(s + 2 * p));
+ q1p1 = _mm_unpacklo_epi32(xx_loadl_32(s - 2 * p), xx_loadl_32(s + 1 * p));
+
+ q0p0 = _mm_unpacklo_epi32(xx_loadl_32(s - 1 * p), xx_loadl_32(s - 0 * p));
+
+ q5p5 = _mm_unpacklo_epi32(xx_loadl_32(s - 6 * p), xx_loadl_32(s + 5 * p));
+
+ q6p6 = _mm_unpacklo_epi32(xx_loadl_32(s - 7 * p), xx_loadl_32(s + 6 * p));
+
+ lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
+ &limit, &thresh);
+
+ store_buffer_horz_8(q0p0, p, 0, s);
+ store_buffer_horz_8(q1p1, p, 1, s);
+ store_buffer_horz_8(q2p2, p, 2, s);
+ store_buffer_horz_8(q3p3, p, 3, s);
+ store_buffer_horz_8(q4p4, p, 4, s);
+ store_buffer_horz_8(q5p5, p, 5, s);
+}
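+
+// A hypothetical usage sketch (names and threshold values are illustrative,
+// not taken from the encoder): the threshold pointers are dereferenced with
+// _mm_load_si128 above, so they must point at 16-byte-aligned storage. The
+// non-dual function filters a 4-pixel-wide segment of the horizontal edge
+// lying between rows s - p and s; rows s - 7 * p through s + 6 * p are read.
+static INLINE void lpf_horizontal_14_usage_sketch(uint8_t *s, int p) {
+  DECLARE_ALIGNED(16, uint8_t, blimit[16]);
+  DECLARE_ALIGNED(16, uint8_t, limit[16]);
+  DECLARE_ALIGNED(16, uint8_t, thresh[16]);
+  for (int i = 0; i < 16; ++i) {
+    blimit[i] = 60;  // illustrative edge threshold
+    limit[i] = 10;   // illustrative activity threshold
+    thresh[i] = 4;   // illustrative high-edge-variance threshold
+  }
+  aom_lpf_horizontal_14_sse2(s, p, blimit, limit, thresh);
+}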
+
+static AOM_FORCE_INLINE void lpf_internal_6_dual_sse2(
+ __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0,
+ __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit,
+ __m128i *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i mask, hev, flat;
+ __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1;
+ __m128i p2_16, q2_16, p1_16, q1_16, p0_16, q0_16;
+ __m128i ps1ps0, qs1qs0;
+
+ q2p2 = _mm_unpacklo_epi64(*p2, *q2);
+ q1p1 = _mm_unpacklo_epi64(*p1, *q1);
+ q0p0 = _mm_unpacklo_epi64(*p0, *q0);
+
+ *p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+ *q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
+
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i fe = _mm_set1_epi8((char)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(fe, fe);
+
+ {
+ // filter_mask and hev_mask
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+ abs_p1p0 = abs_diff(q1p1, q0p0);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+
+ abs_p0q0 = abs_diff(*p1p0, *q1q0);
+ abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
+ abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, zero);
+
+    // SSE2 has no unsigned byte comparison, so the idea is to detect whether
+    // any term exceeds the limit by taking the global maximum of all the
+    // abs(x - y) inputs (and of abs(p0 - q0) * 2 + abs(p1 - q1) / 2). If that
+    // maximum is greater than the limit, the corresponding mask bit is set;
+    // otherwise it is not.
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, *thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+ // replicate for the further "merged variables" usage
+ hev = _mm_unpacklo_epi64(hev, hev);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = abs_diff(q2p2, q1p1);
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_dual_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0);
+
+ // flat_mask
+ flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ // replicate for the further "merged variables" usage
+ flat = _mm_unpacklo_epi64(flat, flat);
+ }
+
+  // 5-tap filter, needed only if flat != 0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i workp_a, workp_b, workp_shft0, workp_shft1;
+ p2_16 = _mm_unpacklo_epi8(*p2, zero);
+ p1_16 = _mm_unpacklo_epi8(*p1, zero);
+ p0_16 = _mm_unpacklo_epi8(*p0, zero);
+ q0_16 = _mm_unpacklo_epi8(*q0, zero);
+ q1_16 = _mm_unpacklo_epi8(*q1, zero);
+ q2_16 = _mm_unpacklo_epi8(*q2, zero);
+
+ // op1
+ workp_a = _mm_add_epi16(_mm_add_epi16(p0_16, p0_16),
+ _mm_add_epi16(p1_16, p1_16)); // p0 *2 + p1 * 2
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four),
+ p2_16); // p2 + p0 * 2 + p1 * 2 + 4
+
+ workp_b = _mm_add_epi16(_mm_add_epi16(p2_16, p2_16), q0_16);
+ workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b),
+ 3); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
+
+ // op0
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q0_16), q1_16); // q0 * 2 + q1
+ workp_a = _mm_add_epi16(workp_a,
+ workp_b); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
+ workp_shft1 = _mm_srli_epi16(workp_a, 3);
+
+ flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0);
+
+ // oq0
+ workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p2_16),
+ p1_16); // p0 * 2 + p1 + q0 * 2 + q1 + 4
+ workp_b = _mm_add_epi16(q1_16, q2_16);
+ workp_a = _mm_add_epi16(
+ workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4
+ workp_shft0 = _mm_srli_epi16(workp_a, 3);
+
+ // oq1
+ workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, p1_16),
+ p0_16); // p0 + q0 * 2 + q1 * 2 + q2 + 4
+ workp_b = _mm_add_epi16(q2_16, q2_16);
+ workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b),
+ 3); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4
+
+ flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1);
+
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0);
+ *q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
+
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0);
+ *p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
+ }
+}
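+
+// Per the running comments, the 5-tap path above produces the following
+// scalar outputs (a sketch of the arithmetic only; the parameter layout is
+// hypothetical and the SIMD code evaluates the p and q sides in one register):
+static INLINE void filter6_scalar_sketch(const uint8_t *op /* p2, p1, p0 */,
+                                         const uint8_t *oq /* q0, q1, q2 */,
+                                         uint8_t out[4] /* p1, p0, q0, q1 */) {
+  const int p2 = op[0], p1 = op[1], p0 = op[2];
+  const int q0 = oq[0], q1 = oq[1], q2 = oq[2];
+  out[0] = (uint8_t)((p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4) >> 3);       // op1
+  out[1] = (uint8_t)((p2 + p1 * 2 + p0 * 2 + q0 * 2 + q1 + 4) >> 3);  // op0
+  out[2] = (uint8_t)((p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4) >> 3);  // oq0
+  out[3] = (uint8_t)((p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4) >> 3);       // oq1
+}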
+
+static AOM_FORCE_INLINE void lpf_internal_6_sse2(
+ __m128i *p2, __m128i *q2, __m128i *p1, __m128i *q1, __m128i *p0,
+ __m128i *q0, __m128i *q1q0, __m128i *p1p0, __m128i *blimit, __m128i *limit,
+ __m128i *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i mask, hev, flat;
+ __m128i q2p2, q1p1, q0p0, flat_p1p0, flat_q0q1;
+ __m128i pq2_16, q2_16, pq1_16, pq0_16, q0_16;
+ __m128i ps1ps0, qs1qs0;
+
+ q2p2 = _mm_unpacklo_epi32(*p2, *q2);
+ q1p1 = _mm_unpacklo_epi32(*p1, *q1);
+ q0p0 = _mm_unpacklo_epi32(*p0, *q0);
+
+ *p1p0 = _mm_unpacklo_epi32(*p0, *p1);
+ *q1q0 = _mm_unpacklo_epi32(*q0, *q1);
+
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i fe = _mm_set1_epi8((char)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(fe, fe);
+ {
+ // filter_mask and hev_mask
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+ abs_p1p0 = abs_diff(q1p1, q0p0);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
+
+ abs_p0q0 = abs_diff(*p1p0, *q1q0);
+ abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
+
+    // SSE2 has no unsigned byte comparison, so the idea is to detect whether
+    // any term exceeds the limit by taking the global maximum of all the
+    // abs(x - y) inputs (and of abs(p0 - q0) * 2 + abs(p1 - q1) / 2). If that
+    // maximum is greater than the limit, the corresponding mask bit is set;
+    // otherwise it is not.
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, *thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+ // replicate for the further "merged variables" usage
+ hev = _mm_unpacklo_epi32(hev, hev);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+ mask = _mm_unpacklo_epi32(mask, zero);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = abs_diff(q2p2, q1p1);
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_sse2(p1p0, q1q0, &hev, &mask, q1q0, p1p0);
+
+ // flat_mask
+ flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_p1p0);
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ // replicate for the further "merged variables" usage
+ flat = _mm_unpacklo_epi32(flat, flat);
+ flat = _mm_unpacklo_epi64(flat, flat);
+ }
+
+  // 5-tap filter, needed only if flat != 0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i workp_a, workp_b, workp_c;
+ __m128i pq0x2_pq1, pq1_pq2;
+ pq2_16 = _mm_unpacklo_epi8(q2p2, zero);
+ pq1_16 = _mm_unpacklo_epi8(q1p1, zero);
+ pq0_16 = _mm_unpacklo_epi8(q0p0, zero);
+ q0_16 = _mm_srli_si128(pq0_16, 8);
+ q2_16 = _mm_srli_si128(pq2_16, 8);
+
+ // op1
+ pq0x2_pq1 =
+ _mm_add_epi16(_mm_add_epi16(pq0_16, pq0_16), pq1_16); // p0 *2 + p1
+ pq1_pq2 = _mm_add_epi16(pq1_16, pq2_16); // p1 + p2
+ workp_a = _mm_add_epi16(_mm_add_epi16(pq0x2_pq1, four),
+ pq1_pq2); // p2 + p0 * 2 + p1 * 2 + 4
+
+ workp_b = _mm_add_epi16(_mm_add_epi16(pq2_16, pq2_16), q0_16);
+ workp_b =
+ _mm_add_epi16(workp_a, workp_b); // p2 * 3 + p1 * 2 + p0 * 2 + q0 + 4
+
+ // op0
+ workp_c = _mm_srli_si128(pq0x2_pq1, 8); // q0 * 2 + q1
+ workp_a = _mm_add_epi16(workp_a,
+ workp_c); // p2 + p0 * 2 + p1 * 2 + q0 * 2 + q1 + 4
+ workp_b = _mm_unpacklo_epi64(workp_a, workp_b);
+ workp_b = _mm_srli_epi16(workp_b, 3);
+
+ flat_p1p0 = _mm_packus_epi16(workp_b, workp_b);
+
+ // oq0
+ workp_a = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq2_16),
+ pq1_16); // p0 * 2 + p1 + q0 * 2 + q1 + 4
+ workp_b = _mm_srli_si128(pq1_pq2, 8);
+ workp_a = _mm_add_epi16(
+ workp_a, workp_b); // p0 * 2 + p1 + q0 * 2 + q1 * 2 + q2 + 4
+ // workp_shft0 = _mm_srli_epi16(workp_a, 3);
+
+ // oq1
+ workp_c = _mm_sub_epi16(_mm_sub_epi16(workp_a, pq1_16),
+ pq0_16); // p0 + q0 * 2 + q1 * 2 + q2 + 4
+ workp_b = _mm_add_epi16(q2_16, q2_16);
+ workp_b =
+ _mm_add_epi16(workp_c, workp_b); // p0 + q0 * 2 + q1 * 2 + q2 * 3 + 4
+
+ workp_a = _mm_unpacklo_epi64(workp_a, workp_b);
+ workp_a = _mm_srli_epi16(workp_a, 3);
+
+ flat_q0q1 = _mm_packus_epi16(workp_a, workp_a);
+
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0);
+ *q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0 = _mm_or_si128(qs1qs0, *q1q0);
+
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0);
+ *p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0 = _mm_or_si128(ps1ps0, *p1p0);
+ }
+}
+
+void aom_lpf_horizontal_6_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit,
+ const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i p2, p1, p0, q0, q1, q2;
+ __m128i p1p0, q1q0;
+ __m128i blimit = _mm_load_si128((__m128i *)_blimit);
+ __m128i limit = _mm_load_si128((__m128i *)_limit);
+ __m128i thresh = _mm_load_si128((__m128i *)_thresh);
+
+ p2 = xx_loadl_32(s - 3 * p);
+ p1 = xx_loadl_32(s - 2 * p);
+ p0 = xx_loadl_32(s - 1 * p);
+ q0 = xx_loadl_32(s - 0 * p);
+ q1 = xx_loadl_32(s + 1 * p);
+ q2 = xx_loadl_32(s + 2 * p);
+
+ lpf_internal_6_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
+ &limit, &thresh);
+
+ xx_storel_32(s - 1 * p, p1p0);
+ xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4));
+ xx_storel_32(s + 0 * p, q1q0);
+ xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4));
+}
+
+void aom_lpf_horizontal_6_dual_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0,
+ const unsigned char *_blimit1,
+ const unsigned char *_limit1,
+ const unsigned char *_thresh1) {
+ __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
+ _mm_load_si128((__m128i *)_blimit1));
+ __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
+ _mm_load_si128((__m128i *)_limit1));
+ __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
+ _mm_load_si128((__m128i *)_thresh1));
+
+ __m128i p2, p1, p0, q0, q1, q2;
+ __m128i p1p0, q1q0;
+
+ p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+ p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+ p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+ q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
+ q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+ q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+
+ lpf_internal_6_dual_sse2(&p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0, &blimit,
+ &limit, &thresh);
+
+ _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
+ _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
+ _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
+ _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
+}
+
+static AOM_FORCE_INLINE void lpf_internal_8_sse2(
+ __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
+ __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
+ __m128i *blimit, __m128i *limit, __m128i *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i mask, hev, flat;
+ __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3,
+ flat_p1p0, flat_q0q1;
+ __m128i q2p2, q1p1, q0p0;
+ __m128i q1q0, p1p0, ps1ps0, qs1qs0;
+ __m128i work_pq, opq2, pq2;
+
+ q3p3 = _mm_unpacklo_epi32(*p3, *q3);
+ q2p2 = _mm_unpacklo_epi32(*p2, *q2);
+ q1p1 = _mm_unpacklo_epi32(*p1, *q1);
+ q0p0 = _mm_unpacklo_epi32(*p0, *q0);
+
+ p1p0 = _mm_unpacklo_epi32(q0p0, q1p1); // p1p0 q1q0
+ q1q0 = _mm_srli_si128(p1p0, 8);
+
+ // filter_mask and hev_mask
+
+  // SSE2 has no unsigned byte comparison, so the idea is to detect whether
+  // any term exceeds the limit by taking the global maximum of all the
+  // abs(x - y) inputs (and of abs(p0 - q0) * 2 + abs(p1 - q1) / 2). If that
+  // maximum is greater than the limit, the corresponding mask bit is set;
+  // otherwise it is not.
+
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i fe = _mm_set1_epi8((char)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(fe, fe);
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+
+ abs_p1p0 = abs_diff(q1p1, q0p0);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 4);
+
+ abs_p0q0 = abs_diff(p1p0, q1q0);
+ abs_p1q1 = _mm_srli_si128(abs_p0q0, 4);
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, *thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+ // replicate for the further "merged variables" usage
+ hev = _mm_unpacklo_epi32(hev, hev);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+ mask = _mm_unpacklo_epi32(mask, zero);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
+
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 4));
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+
+ // lp filter - the same for 6, 8 and 14 versions
+ filter4_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
+
+ // flat_mask4
+ flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 4));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ // replicate for the further "merged variables" usage
+ flat = _mm_unpacklo_epi32(flat, flat);
+ flat = _mm_unpacklo_epi64(flat, flat);
+
+  // filter8 is needed only if flat != 0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i workp_a, workp_b, workp_c, workp_d, workp_shft1, workp_shft2;
+ p2_16 = _mm_unpacklo_epi8(*p2, zero);
+ p1_16 = _mm_unpacklo_epi8(*p1, zero);
+ p0_16 = _mm_unpacklo_epi8(*p0, zero);
+ q0_16 = _mm_unpacklo_epi8(*q0, zero);
+ q1_16 = _mm_unpacklo_epi8(*q1, zero);
+ q2_16 = _mm_unpacklo_epi8(*q2, zero);
+ p3_16 = _mm_unpacklo_epi8(*p3, zero);
+ q3_16 = _mm_unpacklo_epi8(*q3, zero);
+
+ // op2
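+ // sum before the >> 3 (applied later): 3 * p3 + 2 * p2 + p1 + p0 + q0 + 4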
+ workp_a =
+ _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16));
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16);
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16);
+ workp_shft2 = _mm_add_epi16(workp_a, workp_b);
+
+ // op1
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16);
+ workp_c = _mm_add_epi16(workp_a, workp_b);
+ // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // op0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16);
+ workp_d = _mm_add_epi16(workp_a, workp_b);
+ // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
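+ // pack the op0 sum (low half) and op1 sum (high half) into one register so
+ // a single shift and pack yields flat_p1p0 in the same lane order as p1p0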
+ workp_c = _mm_unpacklo_epi64(workp_d, workp_c);
+ workp_c = _mm_srli_epi16(workp_c, 3);
+ flat_p1p0 = _mm_packus_epi16(workp_c, workp_c);
+
+ // oq0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16);
+ // workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+ workp_c = _mm_add_epi16(workp_a, workp_b);
+
+ // oq1
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16);
+ workp_d = _mm_add_epi16(workp_a, workp_b);
+ // workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ workp_c = _mm_unpacklo_epi64(workp_c, workp_d);
+ workp_c = _mm_srli_epi16(workp_c, 3);
+ flat_q0q1 = _mm_packus_epi16(workp_c, workp_c);
+
+ // oq2
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16);
+ workp_shft1 = _mm_add_epi16(workp_a, workp_b);
+
+ workp_c = _mm_unpacklo_epi64(workp_shft2, workp_shft1);
+ workp_c = _mm_srli_epi16(workp_c, 3);
+
+ opq2 = _mm_packus_epi16(workp_c, workp_c);
+
+ work_pq = _mm_andnot_si128(flat, q2p2);
+ pq2 = _mm_and_si128(flat, opq2);
+ *p2 = _mm_or_si128(work_pq, pq2);
+ *q2 = _mm_srli_si128(*p2, 4);
+
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
+ q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
+
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
+ p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+ }
+}
+
+static AOM_FORCE_INLINE void lpf_internal_8_dual_sse2(
+ __m128i *p3, __m128i *q3, __m128i *p2, __m128i *q2, __m128i *p1,
+ __m128i *q1, __m128i *p0, __m128i *q0, __m128i *q1q0_out, __m128i *p1p0_out,
+ __m128i *blimit, __m128i *limit, __m128i *thresh) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i mask, hev, flat;
+ __m128i p2_16, q2_16, p1_16, p0_16, q0_16, q1_16, p3_16, q3_16, q3p3,
+ flat_p1p0, flat_q0q1;
+ __m128i q2p2, q1p1, q0p0;
+ __m128i q1q0, p1p0, ps1ps0, qs1qs0;
+ __m128i work_pq, opq2, pq2;
+
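+ // each register pairs eight p pixels (low 64 bits) with the matching eight
+ // q pixels (high 64 bits); after the unpacks p1p0 = {p0, p1} and
+ // q1q0 = {q0, q1}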
+ q3p3 = _mm_unpacklo_epi64(*p3, *q3);
+ q2p2 = _mm_unpacklo_epi64(*p2, *q2);
+ q1p1 = _mm_unpacklo_epi64(*p1, *q1);
+ q0p0 = _mm_unpacklo_epi64(*p0, *q0);
+
+ p1p0 = _mm_unpacklo_epi64(q0p0, q1p1);
+ q1q0 = _mm_unpackhi_epi64(q0p0, q1p1);
+
+ {
+ // filter_mask and hev_mask
+
+ // SSE2 has no unsigned byte comparison, so to test whether a value exceeds
+ // its limit we take the element-wise maximum of the abs(x - y) terms (and
+ // of abs(p0 - q0) * 2 + abs(p1 - q1) / 2), subtract the limit with
+ // unsigned saturation and compare with zero: a non-zero result means the
+ // corresponding mask bit is set.
+
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i fe = _mm_set1_epi8((char)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(fe, fe);
+ __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+
+ abs_p1p0 = abs_diff(q1p1, q0p0);
+ abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
+
+ abs_p0q0 = abs_diff(p1p0, q1q0);
+ abs_p1q1 = _mm_srli_si128(abs_p0q0, 8);
+ abs_p0q0 = _mm_unpacklo_epi64(abs_p0q0, abs_p0q0);
+
+ flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
+ hev = _mm_subs_epu8(flat, *thresh);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+ // replicate so the value applies to both halves of the merged variables
+ hev = _mm_unpacklo_epi64(hev, hev);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), *blimit);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(abs_p1p0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+
+ work = _mm_max_epu8(abs_diff(q2p2, q1p1), abs_diff(q3p3, q2p2));
+
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
+ mask = _mm_subs_epu8(mask, *limit);
+ mask = _mm_cmpeq_epi8(mask, zero);
+
+ // lp filter - the same for the 6, 8 and 14 variants
+ filter4_dual_sse2(&p1p0, &q1q0, &hev, &mask, q1q0_out, p1p0_out);
+
+ // flat_mask4
+ flat = _mm_max_epu8(abs_diff(q2p2, q0p0), abs_diff(q3p3, q0p0));
+ flat = _mm_max_epu8(abs_p1p0, flat);
+
+ flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ // replicate so the value applies to both halves of the merged variables
+ flat = _mm_unpacklo_epi64(flat, flat);
+ }
+
+ // filter8 is needed only if flat != 0
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ const __m128i four = _mm_set1_epi16(4);
+
+ __m128i workp_a, workp_b, workp_shft0, workp_shft1, workp_shft2;
+ p2_16 = _mm_unpacklo_epi8(*p2, zero);
+ p1_16 = _mm_unpacklo_epi8(*p1, zero);
+ p0_16 = _mm_unpacklo_epi8(*p0, zero);
+ q0_16 = _mm_unpacklo_epi8(*q0, zero);
+ q1_16 = _mm_unpacklo_epi8(*q1, zero);
+ q2_16 = _mm_unpacklo_epi8(*q2, zero);
+ p3_16 = _mm_unpacklo_epi8(*p3, zero);
+ q3_16 = _mm_unpacklo_epi8(*q3, zero);
+
+ // op2
+ workp_a =
+ _mm_add_epi16(_mm_add_epi16(p3_16, p3_16), _mm_add_epi16(p2_16, p1_16));
+ workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0_16);
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, p2_16), p3_16);
+ workp_shft2 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // op1
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0_16, q1_16), p1_16);
+ workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // op0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q2_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1_16), p0_16);
+ workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ flat_p1p0 = _mm_packus_epi16(workp_shft1, workp_shft0);
+
+ // oq0
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0_16), q0_16);
+ workp_shft0 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ // oq1
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0_16), q1_16);
+ workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ flat_q0q1 = _mm_packus_epi16(workp_shft0, workp_shft1);
+
+ // oq2
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1_16), q3_16);
+ workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1_16), q2_16);
+ workp_shft1 = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
+
+ opq2 = _mm_packus_epi16(workp_shft2, workp_shft1);
+
+ work_pq = _mm_andnot_si128(flat, q2p2);
+ pq2 = _mm_and_si128(flat, opq2);
+ *p2 = _mm_or_si128(work_pq, pq2);
+ *q2 = _mm_srli_si128(*p2, 8);
+
+ qs1qs0 = _mm_andnot_si128(flat, *q1q0_out);
+ q1q0 = _mm_and_si128(flat, flat_q0q1);
+ *q1q0_out = _mm_or_si128(qs1qs0, q1q0);
+
+ ps1ps0 = _mm_andnot_si128(flat, *p1p0_out);
+ p1p0 = _mm_and_si128(flat, flat_p1p0);
+ *p1p0_out = _mm_or_si128(ps1ps0, p1p0);
+ }
+}
+
+void aom_lpf_horizontal_8_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit,
+ const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+ __m128i q1q0, p1p0;
+ __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+ __m128i limit = _mm_load_si128((const __m128i *)_limit);
+ __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+
+ p3 = xx_loadl_32(s - 4 * p);
+ p2 = xx_loadl_32(s - 3 * p);
+ p1 = xx_loadl_32(s - 2 * p);
+ p0 = xx_loadl_32(s - 1 * p);
+ q0 = xx_loadl_32(s - 0 * p);
+ q1 = xx_loadl_32(s + 1 * p);
+ q2 = xx_loadl_32(s + 2 * p);
+ q3 = xx_loadl_32(s + 3 * p);
+
+ lpf_internal_8_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
+ &blimit, &limit, &thresh);
+
+ xx_storel_32(s - 1 * p, p1p0);
+ xx_storel_32(s - 2 * p, _mm_srli_si128(p1p0, 4));
+ xx_storel_32(s + 0 * p, q1q0);
+ xx_storel_32(s + 1 * p, _mm_srli_si128(q1q0, 4));
+ xx_storel_32(s - 3 * p, p2);
+ xx_storel_32(s + 2 * p, q2);
+}
+
+void aom_lpf_horizontal_14_dual_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0,
+ const unsigned char *_blimit1,
+ const unsigned char *_limit1,
+ const unsigned char *_thresh1) {
+ __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
+ __m128i blimit =
+ _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
+ _mm_load_si128((const __m128i *)_blimit1));
+ __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
+ _mm_load_si128((const __m128i *)_limit1));
+ __m128i thresh =
+ _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0),
+ _mm_load_si128((const __m128i *)_thresh1));
+
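+ // each qXpX register pairs the eight pX pixels (low half) with the eight qX
+ // pixels (high half)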
+ q4p4 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 5 * p)),
+ _mm_loadl_epi64((__m128i *)(s + 4 * p)));
+ q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
+ _mm_loadl_epi64((__m128i *)(s + 3 * p)));
+ q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
+ _mm_loadl_epi64((__m128i *)(s + 2 * p)));
+ q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
+ _mm_loadl_epi64((__m128i *)(s + 1 * p)));
+
+ q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
+ _mm_loadl_epi64((__m128i *)(s - 0 * p)));
+
+ q5p5 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 6 * p)),
+ _mm_loadl_epi64((__m128i *)(s + 5 * p)));
+
+ q6p6 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 7 * p)),
+ _mm_loadl_epi64((__m128i *)(s + 6 * p)));
+
+ lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0,
+ &blimit, &limit, &thresh);
+
+ _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
+ _mm_storel_epi64((__m128i *)(s + 0 * p), _mm_srli_si128(q0p0, 8));
+ _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
+ _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1p1, 8));
+ _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
+ _mm_storel_epi64((__m128i *)(s + 2 * p), _mm_srli_si128(q2p2, 8));
+ _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
+ _mm_storel_epi64((__m128i *)(s + 3 * p), _mm_srli_si128(q3p3, 8));
+ _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
+ _mm_storel_epi64((__m128i *)(s + 4 * p), _mm_srli_si128(q4p4, 8));
+ _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
+ _mm_storel_epi64((__m128i *)(s + 5 * p), _mm_srli_si128(q5p5, 8));
+}
+
+void aom_lpf_horizontal_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
+ const uint8_t *_limit0,
+ const uint8_t *_thresh0,
+ const uint8_t *_blimit1,
+ const uint8_t *_limit1,
+ const uint8_t *_thresh1) {
+ __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
+ _mm_load_si128((__m128i *)_blimit1));
+ __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
+ _mm_load_si128((__m128i *)_limit1));
+ __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
+ _mm_load_si128((__m128i *)_thresh1));
+
+ __m128i p2, p1, p0, q0, q1, q2, p3, q3;
+ __m128i q1q0, p1p0;
+
+ p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
+ p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
+ p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+ p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+ q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
+ q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+ q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
+ q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
+
+ lpf_internal_8_dual_sse2(&p3, &q3, &p2, &q2, &p1, &q1, &p0, &q0, &q1q0, &p1p0,
+ &blimit, &limit, &thresh);
+
+ _mm_storel_epi64((__m128i *)(s - 1 * p), p1p0);
+ _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(p1p0, 8));
+ _mm_storel_epi64((__m128i *)(s + 0 * p), q1q0);
+ _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(q1q0, 8));
+ _mm_storel_epi64((__m128i *)(s - 3 * p), p2);
+ _mm_storel_epi64((__m128i *)(s + 2 * p), q2);
+}
+
+void aom_lpf_horizontal_4_dual_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0,
+ const unsigned char *_blimit1,
+ const unsigned char *_limit1,
+ const unsigned char *_thresh1) {
+ __m128i p1, p0, q0, q1;
+ __m128i qs1qs0, ps1ps0;
+
+ p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
+ p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
+ q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
+ q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
+
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i blimit =
+ _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
+ _mm_load_si128((const __m128i *)_blimit1));
+ const __m128i limit =
+ _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
+ _mm_load_si128((const __m128i *)_limit1));
+
+ __m128i l = _mm_unpacklo_epi64(blimit, limit);
+
+ __m128i thresh0 =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero);
+
+ __m128i thresh1 =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero);
+
+ __m128i t = _mm_unpacklo_epi64(thresh0, thresh1);
+
+ lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
+
+ _mm_storel_epi64((__m128i *)(s - 1 * p), ps1ps0);
+ _mm_storel_epi64((__m128i *)(s - 2 * p), _mm_srli_si128(ps1ps0, 8));
+ _mm_storel_epi64((__m128i *)(s + 0 * p), qs1qs0);
+ _mm_storel_epi64((__m128i *)(s + 1 * p), _mm_srli_si128(qs1qs0, 8));
+}
+
+void aom_lpf_vertical_4_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
+ const uint8_t *_limit0,
+ const uint8_t *_thresh0,
+ const uint8_t *_blimit1,
+ const uint8_t *_limit1,
+ const uint8_t *_thresh1) {
+ __m128i p0, q0, q1, p1;
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+ __m128i qs1qs0, ps1ps0;
+
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i blimit =
+ _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
+ _mm_load_si128((const __m128i *)_blimit1));
+ const __m128i limit =
+ _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
+ _mm_load_si128((const __m128i *)_limit1));
+
+ __m128i l = _mm_unpacklo_epi64(blimit, limit);
+
+ __m128i thresh0 =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh0), zero);
+
+ __m128i thresh1 =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)_thresh1), zero);
+
+ __m128i t = _mm_unpacklo_epi64(thresh0, thresh1);
+
+ x0 = _mm_loadl_epi64((__m128i *)((s - 2)));
+ x1 = _mm_loadl_epi64((__m128i *)((s - 2) + p));
+ x2 = _mm_loadl_epi64((__m128i *)((s - 2) + 2 * p));
+ x3 = _mm_loadl_epi64((__m128i *)((s - 2) + 3 * p));
+ x4 = _mm_loadl_epi64((__m128i *)((s - 2) + 4 * p));
+ x5 = _mm_loadl_epi64((__m128i *)((s - 2) + 5 * p));
+ x6 = _mm_loadl_epi64((__m128i *)((s - 2) + 6 * p));
+ x7 = _mm_loadl_epi64((__m128i *)((s - 2) + 7 * p));
+
+ transpose8x8_low_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &p1, &p0, &q0,
+ &q1);
+
+ lpf_internal_4_dual_sse2(&p1, &p0, &q0, &q1, &l, &t, &qs1qs0, &ps1ps0);
+
+ p1 = _mm_srli_si128(ps1ps0, 8);
+ q1 = _mm_srli_si128(qs1qs0, 8);
+
+ transpose4x8_8x4_sse2(&p1, &ps1ps0, &qs1qs0, &q1, &d0, &d1, &d2, &d3, &d4,
+ &d5, &d6, &d7);
+
+ xx_storel_32((s - 2 + 0 * p), d0);
+ xx_storel_32((s - 2 + 1 * p), d1);
+ xx_storel_32((s - 2 + 2 * p), d2);
+ xx_storel_32((s - 2 + 3 * p), d3);
+ xx_storel_32((s - 2 + 4 * p), d4);
+ xx_storel_32((s - 2 + 5 * p), d5);
+ xx_storel_32((s - 2 + 6 * p), d6);
+ xx_storel_32((s - 2 + 7 * p), d7);
+}
+
+void aom_lpf_vertical_6_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit,
+ const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+ __m128i x2, x1, x0, x3;
+ __m128i p0, q0;
+ __m128i p1p0, q1q0;
+ __m128i blimit = _mm_load_si128((__m128i *)_blimit);
+ __m128i limit = _mm_load_si128((__m128i *)_limit);
+ __m128i thresh = _mm_load_si128((__m128i *)_thresh);
+
+ x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p));
+ x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p));
+ x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p));
+ x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p));
+
+ transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
+ &d7);
+
+ lpf_internal_6_sse2(&d0, &d5, &d1, &d4, &d2, &d3, &q1q0, &p1p0, &blimit,
+ &limit, &thresh);
+
+ p0 = _mm_srli_si128(p1p0, 4);
+ q0 = _mm_srli_si128(q1q0, 4);
+
+ transpose4x8_8x4_low_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3);
+
+ xx_storel_32(s + 0 * p - 2, d0);
+ xx_storel_32(s + 1 * p - 2, d1);
+ xx_storel_32(s + 2 * p - 2, d2);
+ xx_storel_32(s + 3 * p - 2, d3);
+}
+
+void aom_lpf_vertical_6_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
+ const uint8_t *_limit0,
+ const uint8_t *_thresh0,
+ const uint8_t *_blimit1,
+ const uint8_t *_limit1,
+ const uint8_t *_thresh1) {
+ __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
+ _mm_load_si128((__m128i *)_blimit1));
+ __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
+ _mm_load_si128((__m128i *)_limit1));
+ __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
+ _mm_load_si128((__m128i *)_thresh1));
+
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128i p0, q0;
+ __m128i p1p0, q1q0;
+ __m128i d0d1, d2d3, d4d5, d6d7;
+
+ x0 = _mm_loadl_epi64((__m128i *)((s - 3) + 0 * p));
+ x1 = _mm_loadl_epi64((__m128i *)((s - 3) + 1 * p));
+ x2 = _mm_loadl_epi64((__m128i *)((s - 3) + 2 * p));
+ x3 = _mm_loadl_epi64((__m128i *)((s - 3) + 3 * p));
+ x4 = _mm_loadl_epi64((__m128i *)((s - 3) + 4 * p));
+ x5 = _mm_loadl_epi64((__m128i *)((s - 3) + 5 * p));
+ x6 = _mm_loadl_epi64((__m128i *)((s - 3) + 6 * p));
+ x7 = _mm_loadl_epi64((__m128i *)((s - 3) + 7 * p));
+
+ transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5,
+ &d6d7);
+
+ d1 = _mm_srli_si128(d0d1, 8);
+ d3 = _mm_srli_si128(d2d3, 8);
+ d5 = _mm_srli_si128(d4d5, 8);
+ d7 = _mm_srli_si128(d6d7, 8);
+
+ lpf_internal_6_dual_sse2(&d0d1, &d5, &d1, &d4d5, &d2d3, &d3, &q1q0, &p1p0,
+ &blimit, &limit, &thresh);
+
+ p0 = _mm_srli_si128(p1p0, 8);
+ q0 = _mm_srli_si128(q1q0, 8);
+
+ transpose4x8_8x4_sse2(&p0, &p1p0, &q1q0, &q0, &d0, &d1, &d2, &d3, &d4, &d5,
+ &d6, &d7);
+
+ xx_storel_32((s - 2 + 0 * p), d0);
+ xx_storel_32((s - 2 + 1 * p), d1);
+ xx_storel_32((s - 2 + 2 * p), d2);
+ xx_storel_32((s - 2 + 3 * p), d3);
+ xx_storel_32((s - 2 + 4 * p), d4);
+ xx_storel_32((s - 2 + 5 * p), d5);
+ xx_storel_32((s - 2 + 6 * p), d6);
+ xx_storel_32((s - 2 + 7 * p), d7);
+}
+
+void aom_lpf_vertical_8_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit,
+ const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i d0, d1, d2, d3, d4, d5, d6, d7;
+
+ __m128i p0, q0;
+ __m128i x2, x1, x0, x3;
+ __m128i q1q0, p1p0;
+ __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+ __m128i limit = _mm_load_si128((const __m128i *)_limit);
+ __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+
+ x3 = _mm_loadl_epi64((__m128i *)((s - 4) + 0 * p));
+ x2 = _mm_loadl_epi64((__m128i *)((s - 4) + 1 * p));
+ x1 = _mm_loadl_epi64((__m128i *)((s - 4) + 2 * p));
+ x0 = _mm_loadl_epi64((__m128i *)((s - 4) + 3 * p));
+
+ transpose4x8_8x4_sse2(&x3, &x2, &x1, &x0, &d0, &d1, &d2, &d3, &d4, &d5, &d6,
+ &d7);
+ // Loop filtering
+ lpf_internal_8_sse2(&d0, &d7, &d1, &d6, &d2, &d5, &d3, &d4, &q1q0, &p1p0,
+ &blimit, &limit, &thresh);
+
+ p0 = _mm_srli_si128(p1p0, 4);
+ q0 = _mm_srli_si128(q1q0, 4);
+
+ transpose8x8_low_sse2(&d0, &d1, &p0, &p1p0, &q1q0, &q0, &d6, &d7, &d0, &d1,
+ &d2, &d3);
+
+ _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0);
+ _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), d1);
+ _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2);
+ _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), d3);
+}
+
+void aom_lpf_vertical_8_dual_sse2(uint8_t *s, int p, const uint8_t *_blimit0,
+ const uint8_t *_limit0,
+ const uint8_t *_thresh0,
+ const uint8_t *_blimit1,
+ const uint8_t *_limit1,
+ const uint8_t *_thresh1) {
+ __m128i blimit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_blimit0),
+ _mm_load_si128((__m128i *)_blimit1));
+ __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_limit0),
+ _mm_load_si128((__m128i *)_limit1));
+ __m128i thresh = _mm_unpacklo_epi32(_mm_load_si128((__m128i *)_thresh0),
+ _mm_load_si128((__m128i *)_thresh1));
+
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128i d1, d3, d5, d7;
+ __m128i q1q0, p1p0;
+ __m128i p1, q1;
+ __m128i d0d1, d2d3, d4d5, d6d7;
+
+ x0 = _mm_loadl_epi64((__m128i *)(s - 4 + 0 * p));
+ x1 = _mm_loadl_epi64((__m128i *)(s - 4 + 1 * p));
+ x2 = _mm_loadl_epi64((__m128i *)(s - 4 + 2 * p));
+ x3 = _mm_loadl_epi64((__m128i *)(s - 4 + 3 * p));
+ x4 = _mm_loadl_epi64((__m128i *)(s - 4 + 4 * p));
+ x5 = _mm_loadl_epi64((__m128i *)(s - 4 + 5 * p));
+ x6 = _mm_loadl_epi64((__m128i *)(s - 4 + 6 * p));
+ x7 = _mm_loadl_epi64((__m128i *)(s - 4 + 7 * p));
+
+ transpose8x8_sse2(&x0, &x1, &x2, &x3, &x4, &x5, &x6, &x7, &d0d1, &d2d3, &d4d5,
+ &d6d7);
+
+ d1 = _mm_srli_si128(d0d1, 8);
+ d3 = _mm_srli_si128(d2d3, 8);
+ d5 = _mm_srli_si128(d4d5, 8);
+ d7 = _mm_srli_si128(d6d7, 8);
+
+ lpf_internal_8_dual_sse2(&d0d1, &d7, &d1, &d6d7, &d2d3, &d5, &d3, &d4d5,
+ &q1q0, &p1p0, &blimit, &limit, &thresh);
+
+ p1 = _mm_srli_si128(p1p0, 8);
+ q1 = _mm_srli_si128(q1q0, 8);
+
+ transpose8x8_sse2(&d0d1, &d1, &p1, &p1p0, &q1q0, &q1, &d6d7, &d7, &d0d1,
+ &d2d3, &d4d5, &d6d7);
+
+ _mm_storel_epi64((__m128i *)(s - 4 + 0 * p), d0d1);
+ _mm_storel_epi64((__m128i *)(s - 4 + 1 * p), _mm_srli_si128(d0d1, 8));
+ _mm_storel_epi64((__m128i *)(s - 4 + 2 * p), d2d3);
+ _mm_storel_epi64((__m128i *)(s - 4 + 3 * p), _mm_srli_si128(d2d3, 8));
+ _mm_storel_epi64((__m128i *)(s - 4 + 4 * p), d4d5);
+ _mm_storel_epi64((__m128i *)(s - 4 + 5 * p), _mm_srli_si128(d4d5, 8));
+ _mm_storel_epi64((__m128i *)(s - 4 + 6 * p), d6d7);
+ _mm_storel_epi64((__m128i *)(s - 4 + 7 * p), _mm_srli_si128(d6d7, 8));
+}
+
+void aom_lpf_vertical_14_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit,
+ const unsigned char *_limit,
+ const unsigned char *_thresh) {
+ __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
+ __m128i x6, x5, x4, x3;
+ __m128i pq0, pq1, pq2, pq3;
+ __m128i blimit = _mm_load_si128((__m128i *)_blimit);
+ __m128i limit = _mm_load_si128((__m128i *)_limit);
+ __m128i thresh = _mm_load_si128((__m128i *)_thresh);
+
+ x6 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p));
+ x5 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p));
+ x4 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p));
+ x3 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p));
+
+ transpose_pq_14_sse2(&x6, &x5, &x4, &x3, &q0p0, &q1p1, &q2p2, &q3p3, &q4p4,
+ &q5p5, &q6p6, &q7p7);
+
+ lpf_internal_14_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0, &blimit,
+ &limit, &thresh);
+
+ transpose_pq_14_inv_sse2(&q7p7, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1,
+ &q0p0, &pq0, &pq1, &pq2, &pq3);
+ _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), pq0);
+ _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), pq1);
+ _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), pq2);
+ _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), pq3);
+}
+
+void aom_lpf_vertical_14_dual_sse2(
+ unsigned char *s, int p, const uint8_t *_blimit0, const uint8_t *_limit0,
+ const uint8_t *_thresh0, const uint8_t *_blimit1, const uint8_t *_limit1,
+ const uint8_t *_thresh1) {
+ __m128i q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0;
+ __m128i x7, x6, x5, x4, x3, x2, x1, x0;
+ __m128i d0d1, d2d3, d4d5, d6d7, d8d9, d10d11, d12d13, d14d15;
+ __m128i q0, q1, q2, q3, q7;
+ __m128i p0p1, p2p3, p4p5, p6p7;
+
+ __m128i blimit =
+ _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_blimit0),
+ _mm_load_si128((const __m128i *)_blimit1));
+ __m128i limit = _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_limit0),
+ _mm_load_si128((const __m128i *)_limit1));
+ __m128i thresh =
+ _mm_unpacklo_epi32(_mm_load_si128((const __m128i *)_thresh0),
+ _mm_load_si128((const __m128i *)_thresh1));
+
+ x7 = _mm_loadu_si128((__m128i *)((s - 8) + 0 * p));
+ x6 = _mm_loadu_si128((__m128i *)((s - 8) + 1 * p));
+ x5 = _mm_loadu_si128((__m128i *)((s - 8) + 2 * p));
+ x4 = _mm_loadu_si128((__m128i *)((s - 8) + 3 * p));
+ x3 = _mm_loadu_si128((__m128i *)((s - 8) + 4 * p));
+ x2 = _mm_loadu_si128((__m128i *)((s - 8) + 5 * p));
+ x1 = _mm_loadu_si128((__m128i *)((s - 8) + 6 * p));
+ x0 = _mm_loadu_si128((__m128i *)((s - 8) + 7 * p));
+
+ transpose8x16_16x8_sse2(&x7, &x6, &x5, &x4, &x3, &x2, &x1, &x0, &d0d1, &d2d3,
+ &d4d5, &d6d7, &d8d9, &d10d11, &d12d13, &d14d15);
+
+ q6p6 = _mm_unpacklo_epi64(d2d3, _mm_srli_si128(d12d13, 8));
+ q5p5 = _mm_unpacklo_epi64(d4d5, _mm_srli_si128(d10d11, 8));
+ q4p4 = _mm_unpacklo_epi64(d6d7, _mm_srli_si128(d8d9, 8));
+ q3p3 = _mm_unpacklo_epi64(d8d9, _mm_srli_si128(d6d7, 8));
+ q2p2 = _mm_unpacklo_epi64(d10d11, _mm_srli_si128(d4d5, 8));
+ q1p1 = _mm_unpacklo_epi64(d12d13, _mm_srli_si128(d2d3, 8));
+ q0p0 = _mm_unpacklo_epi64(d14d15, _mm_srli_si128(d0d1, 8));
+ q7 = _mm_srli_si128(d14d15, 8);
+
+ lpf_internal_14_dual_sse2(&q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1, &q0p0,
+ &blimit, &limit, &thresh);
+
+ x0 = _mm_srli_si128(q0p0, 8);
+ x1 = _mm_srli_si128(q1p1, 8);
+ x2 = _mm_srli_si128(q2p2, 8);
+ x3 = _mm_srli_si128(q3p3, 8);
+ x4 = _mm_srli_si128(q4p4, 8);
+ x5 = _mm_srli_si128(q5p5, 8);
+ x6 = _mm_srli_si128(q6p6, 8);
+
+ transpose16x8_8x16_sse2(&d0d1, &q6p6, &q5p5, &q4p4, &q3p3, &q2p2, &q1p1,
+ &q0p0, &x0, &x1, &x2, &x3, &x4, &x5, &x6, &q7, &p0p1,
+ &p2p3, &p4p5, &p6p7, &q0, &q1, &q2, &q3);
+
+ _mm_storeu_si128((__m128i *)(s - 8 + 0 * p), p0p1);
+ _mm_storeu_si128((__m128i *)(s - 8 + 1 * p), p2p3);
+ _mm_storeu_si128((__m128i *)(s - 8 + 2 * p), p4p5);
+ _mm_storeu_si128((__m128i *)(s - 8 + 3 * p), p6p7);
+ _mm_storeu_si128((__m128i *)(s - 8 + 4 * p), q0);
+ _mm_storeu_si128((__m128i *)(s - 8 + 5 * p), q1);
+ _mm_storeu_si128((__m128i *)(s - 8 + 6 * p), q2);
+ _mm_storeu_si128((__m128i *)(s - 8 + 7 * p), q3);
+}
+
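+// Updates a running filter sum: returns *total + *a1 + *a2 - *s1 - *s2,
+// i.e. two taps slide into the window and two slide out.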
+static INLINE __m128i filter_add2_sub2(const __m128i *const total,
+ const __m128i *const a1,
+ const __m128i *const a2,
+ const __m128i *const s1,
+ const __m128i *const s2) {
+ __m128i x = _mm_add_epi16(*a1, *total);
+ x = _mm_add_epi16(_mm_sub_epi16(x, _mm_add_epi16(*s1, *s2)), *a2);
+ return x;
+}
+
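+// Rounds the two 16-bit-lane sums with >> 3, packs them to bytes and keeps
+// the result where flat is set, taking other_filt elsewhere.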
+static INLINE __m128i filter8_mask(const __m128i *const flat,
+ const __m128i *const other_filt,
+ const __m128i *const f8_lo,
+ const __m128i *const f8_hi) {
+ const __m128i f8 =
+ _mm_packus_epi16(_mm_srli_epi16(*f8_lo, 3), _mm_srli_epi16(*f8_hi, 3));
+ const __m128i result = _mm_and_si128(*flat, f8);
+ return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
+}
+
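+// Same as filter8_mask but with the wide-filter rounding shift (>> 4).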
+static INLINE __m128i filter16_mask(const __m128i *const flat,
+ const __m128i *const other_filt,
+ const __m128i *const f_lo,
+ const __m128i *const f_hi) {
+ const __m128i f =
+ _mm_packus_epi16(_mm_srli_epi16(*f_lo, 4), _mm_srli_epi16(*f_hi, 4));
+ const __m128i result = _mm_and_si128(*flat, f);
+ return _mm_or_si128(_mm_andnot_si128(*flat, *other_filt), result);
+}
+
+void aom_lpf_horizontal_14_quad_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0);
+ const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0);
+ const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0);
+ __m128i mask, hev, flat, flat2;
+ __m128i p6, p5;
+ __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
+ __m128i q6, q5;
+
+ __m128i op2, op1, op0, oq0, oq1, oq2;
+
+ __m128i max_abs_p1p0q1q0;
+
+ p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
+ p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
+ p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
+ p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+ p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+ q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+ q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+ q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+ q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
+ q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
+ q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
+
+ {
+ const __m128i abs_p1p0 = abs_diff(p1, p0);
+ const __m128i abs_q1q0 = abs_diff(q1, q0);
+ const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+ __m128i abs_p0q0 = abs_diff(p0, q0);
+ __m128i abs_p1q1 = abs_diff(p1, q1);
+ __m128i work;
+ max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit_v);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
+
+ {
+ __m128i work;
+ work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
+ flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
+ work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(abs_diff(p4, p0), abs_diff(q4, q0));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ flat2 = _mm_max_epu8(abs_diff(p5, p0), abs_diff(q5, q0));
+ flat2 = _mm_max_epu8(work, flat2);
+ work = _mm_max_epu8(abs_diff(p6, p0), abs_diff(q6, q0));
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // filter4
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+ const __m128i ff = _mm_cmpeq_epi8(t4, t4);
+
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
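+ // bias the pixels by 0x80 so the 4-tap delta can be computed with signed
+ // saturating arithmetic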
+ op1 = _mm_xor_si128(p1, t80);
+ op0 = _mm_xor_si128(p0, t80);
+ oq0 = _mm_xor_si128(q0, t80);
+ oq1 = _mm_xor_si128(q1, t80);
+
+ hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+ filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
+
+ work_a = _mm_subs_epi8(oq0, op0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_and_si128(filt, mask);
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
+
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
+
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
+ oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // filter8
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
+ const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
+ const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
+ const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
+ const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
+ const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
+ const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
+ const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
+
+ const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
+ const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
+ const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
+ const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
+ const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
+ const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
+ const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
+ const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
+ __m128i f8_lo, f8_hi;
+
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
+ _mm_add_epi16(p3_lo, p2_lo));
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
+ _mm_add_epi16(p2_lo, p1_lo));
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
+
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
+ _mm_add_epi16(p3_hi, p2_hi));
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
+ _mm_add_epi16(p2_hi, p1_hi));
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
+
+ op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
+ op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
+ op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
+ oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
+ oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
+ oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // wide flat calculations
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat2, zero))) {
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i p6_lo = _mm_unpacklo_epi8(p6, zero);
+ const __m128i p5_lo = _mm_unpacklo_epi8(p5, zero);
+ const __m128i p4_lo = _mm_unpacklo_epi8(p4, zero);
+ const __m128i q4_lo = _mm_unpacklo_epi8(q4, zero);
+ const __m128i q5_lo = _mm_unpacklo_epi8(q5, zero);
+ const __m128i q6_lo = _mm_unpacklo_epi8(q6, zero);
+
+ const __m128i p6_hi = _mm_unpackhi_epi8(p6, zero);
+ const __m128i p5_hi = _mm_unpackhi_epi8(p5, zero);
+ const __m128i p4_hi = _mm_unpackhi_epi8(p4, zero);
+ const __m128i q4_hi = _mm_unpackhi_epi8(q4, zero);
+ const __m128i q5_hi = _mm_unpackhi_epi8(q5, zero);
+ const __m128i q6_hi = _mm_unpackhi_epi8(q6, zero);
+
+ __m128i f_lo;
+ __m128i f_hi;
+
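+ // running 13-tap sum, first for p5:
+ // 7 * p6 + 2 * p5 + 2 * p4 + p3 + p2 + p1 + p0 + q0 + 8; later outputs
+ // update it with filter_add2_sub2 (two taps in, two taps out)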
+ f_lo = _mm_sub_epi16(_mm_slli_epi16(p6_lo, 3), p6_lo);
+ f_lo = _mm_add_epi16(_mm_slli_epi16(p5_lo, 1), f_lo);
+ f_lo = _mm_add_epi16(_mm_slli_epi16(p4_lo, 1), f_lo);
+ f_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f_lo),
+ _mm_add_epi16(p2_lo, p1_lo));
+ f_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f_lo);
+ f_lo = _mm_add_epi16(f_lo, eight);
+
+ f_hi = _mm_sub_epi16(_mm_slli_epi16(p6_hi, 3), p6_hi);
+ f_hi = _mm_add_epi16(_mm_slli_epi16(p5_hi, 1), f_hi);
+ f_hi = _mm_add_epi16(_mm_slli_epi16(p4_hi, 1), f_hi);
+ f_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f_hi),
+ _mm_add_epi16(p2_hi, p1_hi));
+ f_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f_hi);
+ f_hi = _mm_add_epi16(f_hi, eight);
+
+ p5 = filter16_mask(&flat2, &p5, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 6 * p), p5);
+
+ f_lo = filter_add2_sub2(&f_lo, &q1_lo, &p3_lo, &p6_lo, &p6_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q1_hi, &p3_hi, &p6_hi, &p6_hi);
+ p4 = filter16_mask(&flat2, &p4, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 5 * p), p4);
+
+ f_lo = filter_add2_sub2(&f_lo, &q2_lo, &p2_lo, &p6_lo, &p5_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q2_hi, &p2_hi, &p6_hi, &p5_hi);
+ p3 = filter16_mask(&flat2, &p3, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 4 * p), p3);
+
+ f_lo = filter_add2_sub2(&f_lo, &q3_lo, &p1_lo, &p6_lo, &p4_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q3_hi, &p1_hi, &p6_hi, &p4_hi);
+ op2 = filter16_mask(&flat2, &op2, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
+
+ f_lo = filter_add2_sub2(&f_lo, &q4_lo, &p0_lo, &p6_lo, &p3_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q4_hi, &p0_hi, &p6_hi, &p3_hi);
+ op1 = filter16_mask(&flat2, &op1, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+
+ f_lo = filter_add2_sub2(&f_lo, &q5_lo, &q0_lo, &p6_lo, &p2_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q5_hi, &q0_hi, &p6_hi, &p2_hi);
+ op0 = filter16_mask(&flat2, &op0, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+
+ f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q1_lo, &p6_lo, &p1_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q1_hi, &p6_hi, &p1_hi);
+ oq0 = filter16_mask(&flat2, &oq0, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+
+ f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q2_lo, &p5_lo, &p0_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q2_hi, &p5_hi, &p0_hi);
+ oq1 = filter16_mask(&flat2, &oq1, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+
+ f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q3_lo, &p4_lo, &q0_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q3_hi, &p4_hi, &q0_hi);
+ oq2 = filter16_mask(&flat2, &oq2, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
+
+ f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q4_lo, &p3_lo, &q1_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q4_hi, &p3_hi, &q1_hi);
+ q3 = filter16_mask(&flat2, &q3, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 3 * p), q3);
+
+ f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q5_lo, &p2_lo, &q2_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q5_hi, &p2_hi, &q2_hi);
+ q4 = filter16_mask(&flat2, &q4, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 4 * p), q4);
+
+ f_lo = filter_add2_sub2(&f_lo, &q6_lo, &q6_lo, &p1_lo, &q3_lo);
+ f_hi = filter_add2_sub2(&f_hi, &q6_hi, &q6_hi, &p1_hi, &q3_hi);
+ q5 = filter16_mask(&flat2, &q5, &f_lo, &f_hi);
+ _mm_storeu_si128((__m128i *)(s + 5 * p), q5);
+ } else {
+ _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+ _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
+ }
+ } else {
+ _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+ }
+ }
+}
+
+void aom_lpf_horizontal_8_quad_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0);
+ const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0);
+ const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0);
+ __m128i mask, hev, flat;
+ __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+
+ __m128i op2, op1, op0, oq0, oq1, oq2;
+
+ __m128i max_abs_p1p0q1q0;
+
+ p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
+ p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+ q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+ q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+ q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+
+ {
+ const __m128i abs_p1p0 = abs_diff(p1, p0);
+ const __m128i abs_q1q0 = abs_diff(q1, q0);
+ const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+ __m128i abs_p0q0 = abs_diff(p0, q0);
+ __m128i abs_p1q1 = abs_diff(p1, q1);
+ __m128i work;
+ max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(p3, p2));
+ mask = _mm_max_epu8(work, mask);
+ work = _mm_max_epu8(abs_diff(q2, q1), abs_diff(q3, q2));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit_v);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
+
+ {
+ __m128i work;
+ work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
+ flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
+ work = _mm_max_epu8(abs_diff(p3, p0), abs_diff(q3, q0));
+ flat = _mm_max_epu8(work, flat);
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // filter4
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+ const __m128i ff = _mm_cmpeq_epi8(t4, t4);
+
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ op1 = _mm_xor_si128(p1, t80);
+ op0 = _mm_xor_si128(p0, t80);
+ oq0 = _mm_xor_si128(q0, t80);
+ oq1 = _mm_xor_si128(q1, t80);
+
+ hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+ filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
+
+ work_a = _mm_subs_epi8(oq0, op0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_and_si128(filt, mask);
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
+
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
+
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
+ oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // filter8
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i p3_lo = _mm_unpacklo_epi8(p3, zero);
+ const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
+ const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
+ const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
+ const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
+ const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
+ const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
+ const __m128i q3_lo = _mm_unpacklo_epi8(q3, zero);
+
+ const __m128i p3_hi = _mm_unpackhi_epi8(p3, zero);
+ const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
+ const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
+ const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
+ const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
+ const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
+ const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
+ const __m128i q3_hi = _mm_unpackhi_epi8(q3, zero);
+ __m128i f8_lo, f8_hi;
+
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, four),
+ _mm_add_epi16(p3_lo, p2_lo));
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p3_lo, f8_lo),
+ _mm_add_epi16(p2_lo, p1_lo));
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
+
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, four),
+ _mm_add_epi16(p3_hi, p2_hi));
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p3_hi, f8_hi),
+ _mm_add_epi16(p2_hi, p1_hi));
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
+
+ op2 = filter8_mask(&flat, &p2, &f8_lo, &f8_hi);
+ _mm_storeu_si128((__m128i *)(s - 3 * p), op2);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &p1_lo, &p2_lo, &p3_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &p1_hi, &p2_hi, &p3_hi);
+ op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &p0_lo, &p1_lo, &p3_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &p0_hi, &p1_hi, &p3_hi);
+ op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q0_lo, &p0_lo, &p3_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q0_hi, &p0_hi, &p3_hi);
+ oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q1_lo, &q0_lo, &p2_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q1_hi, &q0_hi, &p2_hi);
+ oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q3_lo, &q2_lo, &q1_lo, &p1_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q3_hi, &q2_hi, &q1_hi, &p1_hi);
+ oq2 = filter8_mask(&flat, &q2, &f8_lo, &f8_hi);
+ _mm_storeu_si128((__m128i *)(s + 2 * p), oq2);
+ } else {
+ _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+ }
+ }
+}
+
+void aom_lpf_horizontal_6_quad_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0);
+ const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0);
+ const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0);
+ __m128i mask, hev, flat;
+ __m128i p2, p1, p0, q0, q1, q2;
+
+ __m128i op1, op0, oq0, oq1;
+
+ __m128i max_abs_p1p0q1q0;
+
+ p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+ q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+ q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+
+ {
+ const __m128i abs_p1p0 = abs_diff(p1, p0);
+ const __m128i abs_q1q0 = abs_diff(q1, q0);
+ const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+ __m128i abs_p0q0 = abs_diff(p0, q0);
+ __m128i abs_p1q1 = abs_diff(p1, q1);
+ __m128i work;
+ max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ work = _mm_max_epu8(abs_diff(p2, p1), abs_diff(q2, q1));
+ mask = _mm_max_epu8(work, mask);
+ mask = _mm_subs_epu8(mask, limit_v);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
+
+ {
+ __m128i work;
+ work = _mm_max_epu8(abs_diff(p2, p0), abs_diff(q2, q0));
+ flat = _mm_max_epu8(work, max_abs_p1p0q1q0);
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+ }
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // filter4
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+ const __m128i ff = _mm_cmpeq_epi8(t4, t4);
+
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ op1 = _mm_xor_si128(p1, t80);
+ op0 = _mm_xor_si128(p0, t80);
+ oq0 = _mm_xor_si128(q0, t80);
+ oq1 = _mm_xor_si128(q1, t80);
+
+ hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+ filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
+
+ work_a = _mm_subs_epi8(oq0, op0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_and_si128(filt, mask);
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
+
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
+
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
+ oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // filter6
+ if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(flat, zero))) {
+ const __m128i four = _mm_set1_epi16(4);
+ const __m128i p2_lo = _mm_unpacklo_epi8(p2, zero);
+ const __m128i p1_lo = _mm_unpacklo_epi8(p1, zero);
+ const __m128i p0_lo = _mm_unpacklo_epi8(p0, zero);
+ const __m128i q0_lo = _mm_unpacklo_epi8(q0, zero);
+ const __m128i q1_lo = _mm_unpacklo_epi8(q1, zero);
+ const __m128i q2_lo = _mm_unpacklo_epi8(q2, zero);
+
+ const __m128i p2_hi = _mm_unpackhi_epi8(p2, zero);
+ const __m128i p1_hi = _mm_unpackhi_epi8(p1, zero);
+ const __m128i p0_hi = _mm_unpackhi_epi8(p0, zero);
+ const __m128i q0_hi = _mm_unpackhi_epi8(q0, zero);
+ const __m128i q1_hi = _mm_unpackhi_epi8(q1, zero);
+ const __m128i q2_hi = _mm_unpackhi_epi8(q2, zero);
+ __m128i f8_lo, f8_hi;
+
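+ // running 6-tap sum, first for op1: 3 * p2 + 2 * p1 + 2 * p0 + q0 + 4;
+ // later outputs update it with filter_add2_sub2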
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p2_lo, four),
+ _mm_add_epi16(p2_lo, p2_lo));
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p1_lo, f8_lo),
+ _mm_add_epi16(p1_lo, p0_lo));
+ f8_lo = _mm_add_epi16(_mm_add_epi16(p0_lo, q0_lo), f8_lo);
+
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p2_hi, four),
+ _mm_add_epi16(p2_hi, p2_hi));
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p1_hi, f8_hi),
+ _mm_add_epi16(p1_hi, p0_hi));
+ f8_hi = _mm_add_epi16(_mm_add_epi16(p0_hi, q0_hi), f8_hi);
+
+ op1 = filter8_mask(&flat, &op1, &f8_lo, &f8_hi);
+ _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q0_lo, &q1_lo, &p2_lo, &p2_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q0_hi, &q1_hi, &p2_hi, &p2_hi);
+ op0 = filter8_mask(&flat, &op0, &f8_lo, &f8_hi);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q1_lo, &q2_lo, &p1_lo, &p2_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q1_hi, &q2_hi, &p1_hi, &p2_hi);
+ oq0 = filter8_mask(&flat, &oq0, &f8_lo, &f8_hi);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+
+ f8_lo = filter_add2_sub2(&f8_lo, &q2_lo, &q2_lo, &p0_lo, &p1_lo);
+ f8_hi = filter_add2_sub2(&f8_hi, &q2_hi, &q2_hi, &p0_hi, &p1_hi);
+ oq1 = filter8_mask(&flat, &oq1, &f8_lo, &f8_hi);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+ } else {
+ _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+ }
+ }
+}
+
+void aom_lpf_horizontal_4_quad_sse2(unsigned char *s, int p,
+ const unsigned char *_blimit0,
+ const unsigned char *_limit0,
+ const unsigned char *_thresh0) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i blimit_v = _mm_load_si128((const __m128i *)_blimit0);
+ const __m128i limit_v = _mm_load_si128((const __m128i *)_limit0);
+ const __m128i thresh_v = _mm_load_si128((const __m128i *)_thresh0);
+ __m128i mask, hev;
+ __m128i p1, p0, q0, q1;
+
+ __m128i op1, op0, oq0, oq1;
+
+ __m128i max_abs_p1p0q1q0;
+
+ p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+ p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+ q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+ q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+
+ {
+ const __m128i abs_p1p0 = abs_diff(p1, p0);
+ const __m128i abs_q1q0 = abs_diff(q1, q0);
+ const __m128i fe = _mm_set1_epi8((int8_t)0xfe);
+ const __m128i ff = _mm_cmpeq_epi8(zero, zero);
+ __m128i abs_p0q0 = abs_diff(p0, q0);
+ __m128i abs_p1q1 = abs_diff(p1, q1);
+ max_abs_p1p0q1q0 = _mm_max_epu8(abs_p1p0, abs_q1q0);
+
+ abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
+ abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
+ mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit_v);
+ mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
+ // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1;
+ mask = _mm_max_epu8(max_abs_p1p0q1q0, mask);
+ // mask |= (abs(p1 - p0) > limit) * -1;
+ // mask |= (abs(q1 - q0) > limit) * -1;
+ mask = _mm_subs_epu8(mask, limit_v);
+ mask = _mm_cmpeq_epi8(mask, zero);
+ }
+
+ if (0xffff == _mm_movemask_epi8(_mm_cmpeq_epi8(mask, zero))) return;
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // filter4
+ {
+ const __m128i t4 = _mm_set1_epi8(4);
+ const __m128i t3 = _mm_set1_epi8(3);
+ const __m128i t80 = _mm_set1_epi8((int8_t)0x80);
+ const __m128i te0 = _mm_set1_epi8((int8_t)0xe0);
+ const __m128i t1f = _mm_set1_epi8(0x1f);
+ const __m128i t1 = _mm_set1_epi8(0x1);
+ const __m128i t7f = _mm_set1_epi8(0x7f);
+ const __m128i ff = _mm_cmpeq_epi8(t4, t4);
+
+ __m128i filt;
+ __m128i work_a;
+ __m128i filter1, filter2;
+
+ op1 = _mm_xor_si128(p1, t80);
+ op0 = _mm_xor_si128(p0, t80);
+ oq0 = _mm_xor_si128(q0, t80);
+ oq1 = _mm_xor_si128(q1, t80);
+
+ hev = _mm_subs_epu8(max_abs_p1p0q1q0, thresh_v);
+ hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
+ filt = _mm_and_si128(_mm_subs_epi8(op1, oq1), hev);
+
+ work_a = _mm_subs_epi8(oq0, op0);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_adds_epi8(filt, work_a);
+ filt = _mm_and_si128(filt, mask);
+ filter1 = _mm_adds_epi8(filt, t4);
+ filter2 = _mm_adds_epi8(filt, t3);
+
+ work_a = _mm_cmpgt_epi8(zero, filter1);
+ filter1 = _mm_srli_epi16(filter1, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter1 = _mm_and_si128(filter1, t1f);
+ filter1 = _mm_or_si128(filter1, work_a);
+ oq0 = _mm_xor_si128(_mm_subs_epi8(oq0, filter1), t80);
+
+ work_a = _mm_cmpgt_epi8(zero, filter2);
+ filter2 = _mm_srli_epi16(filter2, 3);
+ work_a = _mm_and_si128(work_a, te0);
+ filter2 = _mm_and_si128(filter2, t1f);
+ filter2 = _mm_or_si128(filter2, work_a);
+ op0 = _mm_xor_si128(_mm_adds_epi8(op0, filter2), t80);
+
+ filt = _mm_adds_epi8(filter1, t1);
+ work_a = _mm_cmpgt_epi8(zero, filt);
+ filt = _mm_srli_epi16(filt, 1);
+ work_a = _mm_and_si128(work_a, t80);
+ filt = _mm_and_si128(filt, t7f);
+ filt = _mm_or_si128(filt, work_a);
+ filt = _mm_andnot_si128(hev, filt);
+ op1 = _mm_xor_si128(_mm_adds_epi8(op1, filt), t80);
+ oq1 = _mm_xor_si128(_mm_subs_epi8(oq1, filt), t80);
+
+ _mm_storeu_si128((__m128i *)(s - 2 * p), op1);
+ _mm_storeu_si128((__m128i *)(s - 1 * p), op0);
+ _mm_storeu_si128((__m128i *)(s - 0 * p), oq0);
+ _mm_storeu_si128((__m128i *)(s + 1 * p), oq1);
+ }
+}
+
+void aom_lpf_vertical_14_quad_sse2(unsigned char *s, int pitch,
+ const uint8_t *_blimit0,
+ const uint8_t *_limit0,
+ const uint8_t *_thresh0) {
+ DECLARE_ALIGNED(16, unsigned char, t_dst[256]);
+
+ // Transpose 16x16
+ transpose_16x8(s - 8, s - 8 + 8 * pitch, pitch, t_dst, 16);
+ transpose_16x8(s, s + 8 * pitch, pitch, t_dst + 8 * 16, 16);
+
+ // Loop filtering
+ aom_lpf_horizontal_14_quad(t_dst + 8 * 16, 16, _blimit0, _limit0, _thresh0);
+
+ // Transpose back
+ transpose_16x8(t_dst, t_dst + 8 * 16, 16, s - 8, pitch);
+ transpose_16x8(t_dst + 8, t_dst + 8 + 8 * 16, 16, s - 8 + 8 * pitch, pitch);
+}
+
+void aom_lpf_vertical_8_quad_sse2(uint8_t *s, int pitch,
+ const uint8_t *_blimit0,
+ const uint8_t *_limit0,
+ const uint8_t *_thresh0) {
+ DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
+
+ // Transpose 16x8
+ transpose_16x8(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
+
+ // Loop filtering
+ aom_lpf_horizontal_8_quad(t_dst + 4 * 16, 16, _blimit0, _limit0, _thresh0);
+
+ // Transpose back
+ transpose_16x8_to_8x16(t_dst, 16, s - 4, pitch);
+}
+
+void aom_lpf_vertical_6_quad_sse2(uint8_t *s, int pitch,
+ const uint8_t *_blimit0,
+ const uint8_t *_limit0,
+ const uint8_t *_thresh0) {
+ DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
+
+  // Transpose 16x8: (w x h) 8x16 to 16x8
+ transpose_16x8(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
+
+ // Loop filtering
+ aom_lpf_horizontal_6_quad(t_dst + 4 * 16, 16, _blimit0, _limit0, _thresh0);
+
+  // Transpose back: (w x h) 16x8 to 8x16
+ transpose_16x8_to_8x16(t_dst, 16, s - 4, pitch);
+}
+
+void aom_lpf_vertical_4_quad_sse2(uint8_t *s, int pitch,
+ const uint8_t *_blimit0,
+ const uint8_t *_limit0,
+ const uint8_t *_thresh0) {
+ DECLARE_ALIGNED(16, unsigned char, t_dst[16 * 8]);
+
+ // Transpose 16x8
+ transpose_16x8(s - 4, s - 4 + pitch * 8, pitch, t_dst, 16);
+
+ // Loop filtering
+ aom_lpf_horizontal_4_quad_sse2(t_dst + 4 * 16, 16, _blimit0, _limit0,
+ _thresh0);
+
+ // Transpose back
+ transpose_16x8_to_8x16(t_dst, 16, s - 4, pitch);
+}
diff --git a/third_party/aom/aom_dsp/x86/lpf_common_sse2.h b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
new file mode 100644
index 0000000000..45464e80b1
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/lpf_common_sse2.h
@@ -0,0 +1,721 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
+#define AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
+
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_config.h"
+
+#define mm_storelu(dst, v) memcpy((dst), (const char *)&(v), 8)
+#define mm_storehu(dst, v) memcpy((dst), (const char *)&(v) + 8, 8)
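+// Store the low/high 8 bytes of a 128-bit register through memcpy, which
+// imposes no alignment requirement on the destination.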
+
+static INLINE void highbd_transpose6x6_sse2(__m128i *x0, __m128i *x1,
+ __m128i *x2, __m128i *x3,
+ __m128i *x4, __m128i *x5,
+ __m128i *d0, __m128i *d1,
+ __m128i *d2, __m128i *d3,
+ __m128i *d4, __m128i *d5) {
+ __m128i w0, w1, w2, w3, w4, w5, ww0;
+
+ // 00 01 02 03 04 05 xx xx
+ // 10 11 12 13 14 15 xx xx
+ // 20 21 22 23 24 25 xx xx
+ // 30 31 32 33 34 35 xx xx
+ // 40 41 42 43 44 45 xx xx
+ // 50 51 52 53 54 55 xx xx
+
+ w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13
+ w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33
+ w2 = _mm_unpacklo_epi16(*x4, *x5); // 40 50 41 51 42 52 43 53
+
+ ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31
+ *d0 = _mm_unpacklo_epi64(ww0, w2); // 00 10 20 30 40 50 41 51
+ *d1 = _mm_unpackhi_epi64(ww0,
+ _mm_srli_si128(w2, 4)); // 01 11 21 31 41 51 xx xx
+
+ ww0 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33
+ *d2 = _mm_unpacklo_epi64(ww0,
+ _mm_srli_si128(w2, 8)); // 02 12 22 32 42 52 xx xx
+
+ w3 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 xx xx xx xx
+ w4 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 xx xx xx xx
+ w5 = _mm_unpackhi_epi16(*x4, *x5); // 44 54 45 55 xx xx xx xx
+
+ *d3 = _mm_unpackhi_epi64(ww0, _mm_srli_si128(w2, 4)); // 03 13 23 33 43 53
+
+ ww0 = _mm_unpacklo_epi32(w3, w4); // 04 14 24 34 05 15 25 35
+ *d4 = _mm_unpacklo_epi64(ww0, w5); // 04 14 24 34 44 54 45 55
+ *d5 = _mm_unpackhi_epi64(ww0,
+ _mm_slli_si128(w5, 4)); // 05 15 25 35 45 55 xx xx
+}
+
+static INLINE void highbd_transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
+ __m128i *x2, __m128i *x3,
+ __m128i *d0, __m128i *d1,
+ __m128i *d2, __m128i *d3) {
+ __m128i zero = _mm_setzero_si128();
+ __m128i w0, w1, ww0, ww1;
+
+ w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13
+ w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33
+
+ ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31
+ ww1 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33
+
+ *d0 = _mm_unpacklo_epi64(ww0, zero); // 00 10 20 30 xx xx xx xx
+ *d1 = _mm_unpackhi_epi64(ww0, zero); // 01 11 21 31 xx xx xx xx
+ *d2 = _mm_unpacklo_epi64(ww1, zero); // 02 12 22 32 xx xx xx xx
+ *d3 = _mm_unpackhi_epi64(ww1, zero); // 03 13 23 33 xx xx xx xx
+}
+
+static INLINE void highbd_transpose4x8_8x4_high_sse2(__m128i *x0, __m128i *x1,
+ __m128i *x2, __m128i *x3,
+ __m128i *d4, __m128i *d5,
+ __m128i *d6, __m128i *d7) {
+ __m128i w0, w1, ww2, ww3;
+ __m128i zero = _mm_setzero_si128();
+
+ w0 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 06 16 07 17
+ w1 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 26 36 27 37
+
+ ww2 = _mm_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35
+ ww3 = _mm_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37
+
+ *d4 = _mm_unpacklo_epi64(ww2, zero); // 04 14 24 34 xx xx xx xx
+ *d5 = _mm_unpackhi_epi64(ww2, zero); // 05 15 25 35 xx xx xx xx
+ *d6 = _mm_unpacklo_epi64(ww3, zero); // 06 16 26 36 xx xx xx xx
+ *d7 = _mm_unpackhi_epi64(ww3, zero); // 07 17 27 37 xx xx xx xx
+}
+
+// Note: the input (x) and output (d) pointers must be different; the inputs
+// are not copied internally, so the transpose cannot be done in place.
+static INLINE void highbd_transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1,
+ __m128i *x2, __m128i *x3,
+ __m128i *d0, __m128i *d1,
+ __m128i *d2, __m128i *d3,
+ __m128i *d4, __m128i *d5,
+ __m128i *d6, __m128i *d7) {
+ // input
+ // x0 00 01 02 03 04 05 06 07
+ // x1 10 11 12 13 14 15 16 17
+ // x2 20 21 22 23 24 25 26 27
+ // x3 30 31 32 33 34 35 36 37
+ // output
+ // 00 10 20 30 xx xx xx xx
+ // 01 11 21 31 xx xx xx xx
+ // 02 12 22 32 xx xx xx xx
+ // 03 13 23 33 xx xx xx xx
+ // 04 14 24 34 xx xx xx xx
+ // 05 15 25 35 xx xx xx xx
+ // 06 16 26 36 xx xx xx xx
+ // 07 17 27 37 xx xx xx xx
+ highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3);
+ highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7);
+}
+
+static INLINE void highbd_transpose8x8_low_sse2(__m128i *x0, __m128i *x1,
+ __m128i *x2, __m128i *x3,
+ __m128i *x4, __m128i *x5,
+ __m128i *x6, __m128i *x7,
+ __m128i *d0, __m128i *d1,
+ __m128i *d2, __m128i *d3) {
+ __m128i w0, w1, w2, w3, ww0, ww1;
+ // x0 00 01 02 03 04 05 06 07
+ // x1 10 11 12 13 14 15 16 17
+ // x2 20 21 22 23 24 25 26 27
+ // x3 30 31 32 33 34 35 36 37
+ // x4 40 41 42 43 44 45 46 47
+ // x5 50 51 52 53 54 55 56 57
+ // x6 60 61 62 63 64 65 66 67
+ // x7 70 71 72 73 74 75 76 77
+
+ w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13
+ w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33
+ w2 = _mm_unpacklo_epi16(*x4, *x5); // 40 50 41 51 42 52 43 53
+ w3 = _mm_unpacklo_epi16(*x6, *x7); // 60 70 61 71 62 72 63 73
+
+ ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31
+ ww1 = _mm_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71
+
+ *d0 = _mm_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70
+ *d1 = _mm_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71
+
+ ww0 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33
+ ww1 = _mm_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73
+
+ *d2 = _mm_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72
+ *d3 = _mm_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73
+}
+
+static INLINE void highbd_transpose8x8_high_sse2(__m128i *x0, __m128i *x1,
+ __m128i *x2, __m128i *x3,
+ __m128i *x4, __m128i *x5,
+ __m128i *x6, __m128i *x7,
+ __m128i *d4, __m128i *d5,
+ __m128i *d6, __m128i *d7) {
+ __m128i w0, w1, w2, w3, ww0, ww1;
+ // x0 00 01 02 03 04 05 06 07
+ // x1 10 11 12 13 14 15 16 17
+ // x2 20 21 22 23 24 25 26 27
+ // x3 30 31 32 33 34 35 36 37
+ // x4 40 41 42 43 44 45 46 47
+ // x5 50 51 52 53 54 55 56 57
+ // x6 60 61 62 63 64 65 66 67
+ // x7 70 71 72 73 74 75 76 77
+ w0 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 06 16 07 17
+ w1 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 26 36 27 37
+ w2 = _mm_unpackhi_epi16(*x4, *x5); // 44 54 45 55 46 56 47 57
+ w3 = _mm_unpackhi_epi16(*x6, *x7); // 64 74 65 75 66 76 67 77
+
+ ww0 = _mm_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35
+ ww1 = _mm_unpacklo_epi32(w2, w3); // 44 54 64 74 45 55 65 75
+
+ *d4 = _mm_unpacklo_epi64(ww0, ww1); // 04 14 24 34 44 54 64 74
+ *d5 = _mm_unpackhi_epi64(ww0, ww1); // 05 15 25 35 45 55 65 75
+
+ ww0 = _mm_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37
+ ww1 = _mm_unpackhi_epi32(w2, w3); // 46 56 66 76 47 57 67 77
+
+ *d6 = _mm_unpacklo_epi64(ww0, ww1); // 06 16 26 36 46 56 66 76
+ *d7 = _mm_unpackhi_epi64(ww0, ww1); // 07 17 27 37 47 57 67 77
+}
+
+// Note: the input (x) and output (d) pointers must be different; the inputs
+// are not copied internally, so the transpose cannot be done in place.
+static INLINE void highbd_transpose8x8_sse2(
+ __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+ __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
+ __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
+ __m128i *d7) {
+ highbd_transpose8x8_low_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3);
+ highbd_transpose8x8_high_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d4, d5, d6, d7);
+}
+
+// Note: the input (x) and output (d) arrays must be different; the inputs
+// are not copied internally, so the transpose cannot be done in place.
+static INLINE void highbd_transpose8x16_sse2(
+ __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+ __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
+ __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
+ __m128i *d7) {
+ highbd_transpose8x8_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3, d4,
+ d5, d6, d7);
+ highbd_transpose8x8_sse2(x0 + 1, x1 + 1, x2 + 1, x3 + 1, x4 + 1, x5 + 1,
+ x6 + 1, x7 + 1, d0 + 1, d1 + 1, d2 + 1, d3 + 1,
+ d4 + 1, d5 + 1, d6 + 1, d7 + 1);
+}
+
+// Low bit depth functions
+static INLINE void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
+ __m128i *x2, __m128i *x3,
+ __m128i *d0, __m128i *d1,
+ __m128i *d2, __m128i *d3) {
+ // input
+ // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
+ // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
+ // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
+ // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
+ // output
+ // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+
+ __m128i w0, w1;
+
+ w0 = _mm_unpacklo_epi8(
+ *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+ *d0 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+
+ *d1 = _mm_srli_si128(*d0,
+ 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d2 = _mm_srli_si128(*d0,
+ 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d3 = _mm_srli_si128(*d0,
+ 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+}
+
+static INLINE void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+ __m128i *x3, __m128i *d0, __m128i *d1,
+ __m128i *d2, __m128i *d3, __m128i *d4,
+ __m128i *d5, __m128i *d6,
+ __m128i *d7) {
+ // input
+ // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
+ // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
+ // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
+ // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
+ // output
+ // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
+ // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
+
+ __m128i w0, w1, ww0, ww1;
+
+ w0 = _mm_unpacklo_epi8(
+ *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+ ww0 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ ww1 = _mm_unpackhi_epi16(
+ w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+
+ *d0 = ww0; // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d1 = _mm_srli_si128(ww0,
+ 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d2 = _mm_srli_si128(ww0,
+ 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d3 = _mm_srli_si128(ww0,
+ 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
+
+ *d4 = ww1; // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d5 = _mm_srli_si128(ww1,
+ 4); // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d6 = _mm_srli_si128(ww1,
+ 8); // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
+ *d7 = _mm_srli_si128(ww1,
+ 12); // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
+}
+
+static INLINE void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+ __m128i *x3, __m128i *x4, __m128i *x5,
+ __m128i *x6, __m128i *x7, __m128i *d0,
+ __m128i *d1, __m128i *d2,
+ __m128i *d3) {
+ // input
+ // x0 00 01 02 03 04 05 06 07
+ // x1 10 11 12 13 14 15 16 17
+ // x2 20 21 22 23 24 25 26 27
+ // x3 30 31 32 33 34 35 36 37
+ // x4 40 41 42 43 44 45 46 47
+ // x5 50 51 52 53 54 55 56 57
+ // x6 60 61 62 63 64 65 66 67
+ // x7 70 71 72 73 74 75 76 77
+ // output
+  // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx xx
+ // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
+ // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
+ // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
+
+ __m128i w0, w1, w2, w3, w4, w5;
+
+ w0 = _mm_unpacklo_epi8(
+ *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+ w2 = _mm_unpacklo_epi8(
+ *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+
+ w3 = _mm_unpacklo_epi8(
+ *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+
+ w4 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ w5 = _mm_unpacklo_epi16(
+ w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+
+ *d0 = _mm_unpacklo_epi32(
+ w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ *d1 = _mm_srli_si128(*d0, 8);
+ *d2 = _mm_unpackhi_epi32(
+ w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ *d3 = _mm_srli_si128(*d2, 8);
+}
+
+static INLINE void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
+ __m128i *x3, __m128i *x4, __m128i *x5,
+ __m128i *x6, __m128i *x7, __m128i *d0d1,
+ __m128i *d2d3, __m128i *d4d5,
+ __m128i *d6d7) {
+ __m128i w0, w1, w2, w3, w4, w5, w6, w7;
+ // x0 00 01 02 03 04 05 06 07
+ // x1 10 11 12 13 14 15 16 17
+ w0 = _mm_unpacklo_epi8(
+ *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+
+ // x2 20 21 22 23 24 25 26 27
+ // x3 30 31 32 33 34 35 36 37
+ w1 = _mm_unpacklo_epi8(
+ *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+
+ // x4 40 41 42 43 44 45 46 47
+ // x5 50 51 52 53 54 55 56 57
+ w2 = _mm_unpacklo_epi8(
+ *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+
+ // x6 60 61 62 63 64 65 66 67
+ // x7 70 71 72 73 74 75 76 77
+ w3 = _mm_unpacklo_epi8(
+ *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+
+ w4 = _mm_unpacklo_epi16(
+ w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ w5 = _mm_unpacklo_epi16(
+ w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+
+ *d0d1 = _mm_unpacklo_epi32(
+ w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ *d2d3 = _mm_unpackhi_epi32(
+ w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+
+ w6 = _mm_unpackhi_epi16(
+ w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ w7 = _mm_unpackhi_epi16(
+ w2, w3); // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+
+ *d4d5 = _mm_unpacklo_epi32(
+ w6, w7); // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ *d6d7 = _mm_unpackhi_epi32(
+ w6, w7); // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+}
+
+static INLINE void transpose16x8_8x16_sse2(
+ __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+ __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9,
+ __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14,
+ __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3,
+ __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) {
+ __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
+ __m128i w10, w11, w12, w13, w14, w15;
+
+ w0 = _mm_unpacklo_epi8(*x0, *x1);
+ w1 = _mm_unpacklo_epi8(*x2, *x3);
+ w2 = _mm_unpacklo_epi8(*x4, *x5);
+ w3 = _mm_unpacklo_epi8(*x6, *x7);
+
+ w8 = _mm_unpacklo_epi8(*x8, *x9);
+ w9 = _mm_unpacklo_epi8(*x10, *x11);
+ w10 = _mm_unpacklo_epi8(*x12, *x13);
+ w11 = _mm_unpacklo_epi8(*x14, *x15);
+
+ w4 = _mm_unpacklo_epi16(w0, w1);
+ w5 = _mm_unpacklo_epi16(w2, w3);
+ w12 = _mm_unpacklo_epi16(w8, w9);
+ w13 = _mm_unpacklo_epi16(w10, w11);
+
+ w6 = _mm_unpacklo_epi32(w4, w5);
+ w7 = _mm_unpackhi_epi32(w4, w5);
+ w14 = _mm_unpacklo_epi32(w12, w13);
+ w15 = _mm_unpackhi_epi32(w12, w13);
+
+ // Store first 4-line result
+ *d0 = _mm_unpacklo_epi64(w6, w14);
+ *d1 = _mm_unpackhi_epi64(w6, w14);
+ *d2 = _mm_unpacklo_epi64(w7, w15);
+ *d3 = _mm_unpackhi_epi64(w7, w15);
+
+ w4 = _mm_unpackhi_epi16(w0, w1);
+ w5 = _mm_unpackhi_epi16(w2, w3);
+ w12 = _mm_unpackhi_epi16(w8, w9);
+ w13 = _mm_unpackhi_epi16(w10, w11);
+
+ w6 = _mm_unpacklo_epi32(w4, w5);
+ w7 = _mm_unpackhi_epi32(w4, w5);
+ w14 = _mm_unpacklo_epi32(w12, w13);
+ w15 = _mm_unpackhi_epi32(w12, w13);
+
+ // Store second 4-line result
+ *d4 = _mm_unpacklo_epi64(w6, w14);
+ *d5 = _mm_unpackhi_epi64(w6, w14);
+ *d6 = _mm_unpacklo_epi64(w7, w15);
+ *d7 = _mm_unpackhi_epi64(w7, w15);
+}
+
+static INLINE void transpose8x16_16x8_sse2(
+ __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
+ __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3,
+ __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11,
+ __m128i *d12d13, __m128i *d14d15) {
+ __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
+ __m128i w10, w11, w12, w13, w14, w15;
+
+ w0 = _mm_unpacklo_epi8(*x0, *x1);
+ w1 = _mm_unpacklo_epi8(*x2, *x3);
+ w2 = _mm_unpacklo_epi8(*x4, *x5);
+ w3 = _mm_unpacklo_epi8(*x6, *x7);
+
+ w8 = _mm_unpackhi_epi8(*x0, *x1);
+ w9 = _mm_unpackhi_epi8(*x2, *x3);
+ w10 = _mm_unpackhi_epi8(*x4, *x5);
+ w11 = _mm_unpackhi_epi8(*x6, *x7);
+
+ w4 = _mm_unpacklo_epi16(w0, w1);
+ w5 = _mm_unpacklo_epi16(w2, w3);
+ w12 = _mm_unpacklo_epi16(w8, w9);
+ w13 = _mm_unpacklo_epi16(w10, w11);
+
+ w6 = _mm_unpacklo_epi32(w4, w5);
+ w7 = _mm_unpackhi_epi32(w4, w5);
+ w14 = _mm_unpacklo_epi32(w12, w13);
+ w15 = _mm_unpackhi_epi32(w12, w13);
+
+ // Store first 4-line result
+ *d0d1 = _mm_unpacklo_epi64(w6, w14);
+ *d2d3 = _mm_unpackhi_epi64(w6, w14);
+ *d4d5 = _mm_unpacklo_epi64(w7, w15);
+ *d6d7 = _mm_unpackhi_epi64(w7, w15);
+
+ w4 = _mm_unpackhi_epi16(w0, w1);
+ w5 = _mm_unpackhi_epi16(w2, w3);
+ w12 = _mm_unpackhi_epi16(w8, w9);
+ w13 = _mm_unpackhi_epi16(w10, w11);
+
+ w6 = _mm_unpacklo_epi32(w4, w5);
+ w7 = _mm_unpackhi_epi32(w4, w5);
+ w14 = _mm_unpacklo_epi32(w12, w13);
+ w15 = _mm_unpackhi_epi32(w12, w13);
+
+ // Store second 4-line result
+ *d8d9 = _mm_unpacklo_epi64(w6, w14);
+ *d10d11 = _mm_unpackhi_epi64(w6, w14);
+ *d12d13 = _mm_unpacklo_epi64(w7, w15);
+ *d14d15 = _mm_unpackhi_epi64(w7, w15);
+}
+
+static INLINE void transpose_16x8(unsigned char *in0, unsigned char *in1,
+ int in_p, unsigned char *out, int out_p) {
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ __m128i x8, x9, x10, x11, x12, x13, x14, x15;
+
+ x0 = _mm_loadl_epi64((__m128i *)in0);
+ x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));
+ x0 = _mm_unpacklo_epi8(x0, x1);
+
+ x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));
+ x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p));
+ x1 = _mm_unpacklo_epi8(x2, x3);
+
+ x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p));
+ x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p));
+ x2 = _mm_unpacklo_epi8(x4, x5);
+
+ x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p));
+ x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p));
+ x3 = _mm_unpacklo_epi8(x6, x7);
+ x4 = _mm_unpacklo_epi16(x0, x1);
+
+ x8 = _mm_loadl_epi64((__m128i *)in1);
+ x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));
+ x8 = _mm_unpacklo_epi8(x8, x9);
+ x5 = _mm_unpacklo_epi16(x2, x3);
+
+ x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));
+ x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p));
+ x9 = _mm_unpacklo_epi8(x10, x11);
+
+ x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p));
+ x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p));
+ x10 = _mm_unpacklo_epi8(x12, x13);
+ x12 = _mm_unpacklo_epi16(x8, x9);
+
+ x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p));
+ x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p));
+ x11 = _mm_unpacklo_epi8(x14, x15);
+ x13 = _mm_unpacklo_epi16(x10, x11);
+
+ x6 = _mm_unpacklo_epi32(x4, x5);
+ x7 = _mm_unpackhi_epi32(x4, x5);
+ x14 = _mm_unpacklo_epi32(x12, x13);
+ x15 = _mm_unpackhi_epi32(x12, x13);
+
+ // Store first 4-line result
+ _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
+ _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
+ _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
+ _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));
+
+ x4 = _mm_unpackhi_epi16(x0, x1);
+ x5 = _mm_unpackhi_epi16(x2, x3);
+ x12 = _mm_unpackhi_epi16(x8, x9);
+ x13 = _mm_unpackhi_epi16(x10, x11);
+
+ x6 = _mm_unpacklo_epi32(x4, x5);
+ x7 = _mm_unpackhi_epi32(x4, x5);
+ x14 = _mm_unpacklo_epi32(x12, x13);
+ x15 = _mm_unpackhi_epi32(x12, x13);
+
+ // Store second 4-line result
+ _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
+ _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
+ _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
+ _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
+}
+
+static INLINE void transpose_16x8_to_8x16(unsigned char *src, int in_p,
+ unsigned char *dst, int out_p) {
+ // a0 b0 c0 d0 e0 f0 g0 h0 A0 B0 C0 D0 E0 F0 G0 H0
+ // a1 b1 c1 d1 e1 f1 g1 h1 A1 B1 C1 D1 E1 F1 G1 H1
+ // a2 b2 c2 d2 e2 f2 g2 h2 A2 B2 C2 D2 E2 F2 G2 H2
+ // a3 b3 c3 d3 e3 f3 g3 h3 A3 B3 C3 D3 E3 F3 G3 H3
+ // a4 b4 c4 d4 e4 f4 g4 h4 A4 B4 C4 D4 E4 F4 G4 H4
+ // a5 b5 c5 d5 e5 f5 g5 h5 A5 B5 C5 D5 E5 F5 G5 H5
+ // a6 b6 c6 d6 e6 f6 g6 h6 A6 B6 C6 D6 E6 F6 G6 H6
+ // a7 b7 c7 d7 e7 f7 g7 h7 A7 B7 C7 D7 E7 F7 G7 H7
+ const __m128i x0 = _mm_loadu_si128((__m128i *)(src));
+ const __m128i x1 = _mm_loadu_si128((__m128i *)(src + (1 * in_p)));
+ const __m128i x2 = _mm_loadu_si128((__m128i *)(src + (2 * in_p)));
+ const __m128i x3 = _mm_loadu_si128((__m128i *)(src + (3 * in_p)));
+ const __m128i x4 = _mm_loadu_si128((__m128i *)(src + (4 * in_p)));
+ const __m128i x5 = _mm_loadu_si128((__m128i *)(src + (5 * in_p)));
+ const __m128i x6 = _mm_loadu_si128((__m128i *)(src + (6 * in_p)));
+ const __m128i x7 = _mm_loadu_si128((__m128i *)(src + (7 * in_p)));
+
+ // a0 a1 b0 b1 c0 c1 d0 d1 A0 A1 B0 B1 C0 C1 D0 D1
+ // e0 e1 f0 f1 g0 g1 h0 h1 E0 E1 F0 F1 G0 G1 H0 H1
+ // a2 a3 b2 b3 c2 c3 d2 d3 A2 A3 B2 B3 C2 C3 D2 D3
+ // e2 e3 f2 f3 g2 g3 h2 h3 E2 E3 F2 F3 G2 G3 H2 H3
+ // a4 a5 b4 b5 c4 c5 d4 d5 A4 A5 B4 B5 C4 C5 D4 D5
+ // e4 e5 f4 f5 g4 g5 h4 h5 E4 E5 F4 F5 G4 G5 H4 H5
+ // a6 a7 b6 b7 c6 c7 d6 d7 A6 A7 B6 B7 C6 C7 D6 D7
+ // e6 e7 f6 f7 g6 g7 h6 h7 E6 E7 F6 F7 G6 G7 H6 H7
+ const __m128i x_s10 = _mm_unpacklo_epi8(x0, x1);
+ const __m128i x_s11 = _mm_unpackhi_epi8(x0, x1);
+ const __m128i x_s12 = _mm_unpacklo_epi8(x2, x3);
+ const __m128i x_s13 = _mm_unpackhi_epi8(x2, x3);
+ const __m128i x_s14 = _mm_unpacklo_epi8(x4, x5);
+ const __m128i x_s15 = _mm_unpackhi_epi8(x4, x5);
+ const __m128i x_s16 = _mm_unpacklo_epi8(x6, x7);
+ const __m128i x_s17 = _mm_unpackhi_epi8(x6, x7);
+
+ // a0 a1 a2 a3 b0 b1 b2 b3 | A0 A1 A2 A3 B0 B1 B2 B3
+ // c0 c1 c2 c3 d0 d1 d2 d3 | C0 C1 C2 C3 D0 D1 D2 D3
+ // e0 e1 e2 e3 f0 f1 f2 f3 | E0 E1 E2 E3 F0 F1 F2 F3
+ // g0 g1 g2 g3 h0 h1 h2 h3 | G0 G1 G2 G3 H0 H1 H2 H3
+ // a4 a5 a6 a7 b4 b5 b6 b7 | A4 A5 A6 A7 B4 B5 B6 B7
+ // c4 c5 c6 c7 d4 d5 d6 d7 | C4 C5 C6 C7 D4 D5 D6 D7
+ // e4 e5 e6 e7 f4 f5 f6 f7 | E4 E5 E6 E7 F4 F5 F6 F7
+ // g4 g5 g6 g7 h4 h5 h6 h7 | G4 G5 G6 G7 H4 H5 H6 H7
+ const __m128i x_s20 = _mm_unpacklo_epi16(x_s10, x_s12);
+ const __m128i x_s21 = _mm_unpackhi_epi16(x_s10, x_s12);
+ const __m128i x_s22 = _mm_unpacklo_epi16(x_s11, x_s13);
+ const __m128i x_s23 = _mm_unpackhi_epi16(x_s11, x_s13);
+ const __m128i x_s24 = _mm_unpacklo_epi16(x_s14, x_s16);
+ const __m128i x_s25 = _mm_unpackhi_epi16(x_s14, x_s16);
+ const __m128i x_s26 = _mm_unpacklo_epi16(x_s15, x_s17);
+ const __m128i x_s27 = _mm_unpackhi_epi16(x_s15, x_s17);
+
+ // a0 a1 a2 a3 a4 a5 a6 a7 | A0 A1 A2 A3 A4 A5 A6 A7
+ // b0 b1 b2 b3 b4 b5 b6 b7 | B0 B1 B2 B3 B4 B5 B6 B7
+ // c0 c1 c2 c3 c4 c5 c6 c7 | C0 C1 C2 C3 C4 C5 C6 C7
+ // d0 d1 d2 d3 d4 d5 d6 d7 | D0 D1 D2 D3 D4 D5 D6 D7
+ // e0 e1 e2 e3 e4 e5 e6 e7 | E0 E1 E2 E3 E4 E5 E6 E7
+ // f0 f1 f2 f3 f4 f5 f6 f7 | F0 F1 F2 F3 F4 F5 F6 F7
+ // g0 g1 g2 g3 g4 g5 g6 g7 | G0 G1 G2 G3 G4 G5 G6 G7
+ // h0 h1 h2 h3 h4 h5 h6 h7 | H0 H1 H2 H3 H4 H5 H6 H7
+ const __m128i x_s30 = _mm_unpacklo_epi32(x_s20, x_s24);
+ const __m128i x_s31 = _mm_unpackhi_epi32(x_s20, x_s24);
+ const __m128i x_s32 = _mm_unpacklo_epi32(x_s21, x_s25);
+ const __m128i x_s33 = _mm_unpackhi_epi32(x_s21, x_s25);
+ const __m128i x_s34 = _mm_unpacklo_epi32(x_s22, x_s26);
+ const __m128i x_s35 = _mm_unpackhi_epi32(x_s22, x_s26);
+ const __m128i x_s36 = _mm_unpacklo_epi32(x_s23, x_s27);
+ const __m128i x_s37 = _mm_unpackhi_epi32(x_s23, x_s27);
+
+ mm_storelu(dst, x_s30);
+ mm_storehu(dst + (1 * out_p), x_s30);
+ mm_storelu(dst + (2 * out_p), x_s31);
+ mm_storehu(dst + (3 * out_p), x_s31);
+ mm_storelu(dst + (4 * out_p), x_s32);
+ mm_storehu(dst + (5 * out_p), x_s32);
+ mm_storelu(dst + (6 * out_p), x_s33);
+ mm_storehu(dst + (7 * out_p), x_s33);
+ mm_storelu(dst + (8 * out_p), x_s34);
+ mm_storehu(dst + (9 * out_p), x_s34);
+ mm_storelu(dst + (10 * out_p), x_s35);
+ mm_storehu(dst + (11 * out_p), x_s35);
+ mm_storelu(dst + (12 * out_p), x_s36);
+ mm_storehu(dst + (13 * out_p), x_s36);
+ mm_storelu(dst + (14 * out_p), x_s37);
+ mm_storehu(dst + (15 * out_p), x_s37);
+}
+
+static INLINE void transpose_8xn(unsigned char *src[], int in_p,
+ unsigned char *dst[], int out_p,
+ int num_8x8_to_transpose) {
+ int idx8x8 = 0;
+ __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+ do {
+ unsigned char *in = src[idx8x8];
+ unsigned char *out = dst[idx8x8];
+
+ x0 =
+ _mm_loadl_epi64((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07
+ x1 =
+ _mm_loadl_epi64((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ x0 = _mm_unpacklo_epi8(x0, x1);
+
+ x2 =
+ _mm_loadl_epi64((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27
+ x3 =
+ _mm_loadl_epi64((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ x1 = _mm_unpacklo_epi8(x2, x3);
+
+ x4 =
+ _mm_loadl_epi64((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47
+ x5 =
+ _mm_loadl_epi64((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57
+ // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ x2 = _mm_unpacklo_epi8(x4, x5);
+
+ x6 =
+ _mm_loadl_epi64((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67
+ x7 =
+ _mm_loadl_epi64((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77
+ // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ x3 = _mm_unpacklo_epi8(x6, x7);
+
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ x4 = _mm_unpacklo_epi16(x0, x1);
+ // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+ x5 = _mm_unpacklo_epi16(x2, x3);
+ // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ x6 = _mm_unpacklo_epi32(x4, x5);
+ mm_storelu(out + 0 * out_p, x6); // 00 10 20 30 40 50 60 70
+ mm_storehu(out + 1 * out_p, x6); // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ x7 = _mm_unpackhi_epi32(x4, x5);
+ mm_storelu(out + 2 * out_p, x7); // 02 12 22 32 42 52 62 72
+ mm_storehu(out + 3 * out_p, x7); // 03 13 23 33 43 53 63 73
+
+ // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ x4 = _mm_unpackhi_epi16(x0, x1);
+ // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+ x5 = _mm_unpackhi_epi16(x2, x3);
+ // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ x6 = _mm_unpacklo_epi32(x4, x5);
+ mm_storelu(out + 4 * out_p, x6); // 04 14 24 34 44 54 64 74
+ mm_storehu(out + 5 * out_p, x6); // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ x7 = _mm_unpackhi_epi32(x4, x5);
+
+ mm_storelu(out + 6 * out_p, x7); // 06 16 26 36 46 56 66 76
+ mm_storehu(out + 7 * out_p, x7); // 07 17 27 37 47 57 67 77
+ } while (++idx8x8 < num_8x8_to_transpose);
+}
+
+#endif // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c
new file mode 100644
index 0000000000..799ce9ef44
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_sad4d_ssse3.c
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#include "aom_dsp/x86/masked_sad_intrin_ssse3.h"
+
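+// Blend one 16-pixel row of ref##idx with the second predictor 'b' using the
+// 64-weight mask (m, m_inv), round by AOM_BLEND_A64_ROUND_BITS, and add the
+// SAD of the blended row against 'src' into the res##idx accumulator.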
+#define MASK_SAD16XH_ONE_REF(idx) \
+ a = _mm_loadu_si128((const __m128i *)&ref##idx[x]); \
+ data_l = _mm_unpacklo_epi8(a, b); \
+ mask_l = _mm_unpacklo_epi8(m, m_inv); \
+ pred_l = _mm_maddubs_epi16(data_l, mask_l); \
+ pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); \
+ \
+ data_r = _mm_unpackhi_epi8(a, b); \
+ mask_r = _mm_unpackhi_epi8(m, m_inv); \
+ pred_r = _mm_maddubs_epi16(data_r, mask_r); \
+ pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); \
+ \
+ pred = _mm_packus_epi16(pred_l, pred_r); \
+ res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src));
+
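+// Masked SAD of one block (width a multiple of 16) against four references at
+// once; the four running totals are reduced into sad_array[0..3] at the end.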
+static INLINE void masked_sadx4d_ssse3(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr[4], int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int width, int height, int inv_mask,
+ unsigned sad_array[4]) {
+ int x, y;
+ __m128i a;
+ __m128i data_l, data_r, mask_l, mask_r, pred_l, pred_r, pred;
+ const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+ __m128i res0 = _mm_setzero_si128();
+ __m128i res1 = _mm_setzero_si128();
+ __m128i res2 = _mm_setzero_si128();
+ __m128i res3 = _mm_setzero_si128();
+ const uint8_t *ref0 = a_ptr[0];
+ const uint8_t *ref1 = a_ptr[1];
+ const uint8_t *ref2 = a_ptr[2];
+ const uint8_t *ref3 = a_ptr[3];
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x += 16) {
+ const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
+ const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
+ const __m128i m_copy = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
+ __m128i m_inv = _mm_sub_epi8(mask_max, m_copy);
+ __m128i m = inv_mask ? m_inv : m_copy;
+ m_inv = inv_mask ? m_copy : m_inv;
+
+ MASK_SAD16XH_ONE_REF(0)
+ MASK_SAD16XH_ONE_REF(1)
+ MASK_SAD16XH_ONE_REF(2)
+ MASK_SAD16XH_ONE_REF(3)
+ }
+
+ src_ptr += src_stride;
+ ref0 += a_stride;
+ ref1 += a_stride;
+ ref2 += a_stride;
+ ref3 += a_stride;
+ b_ptr += b_stride;
+ m_ptr += m_stride;
+ }
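+  // Each accumulator holds two 64-bit partial sums from _mm_sad_epu8; fold
+  // them and pack the four per-reference totals into one vector for the store.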
+ res0 = _mm_add_epi32(_mm_unpacklo_epi32(res0, res1),
+ _mm_unpackhi_epi32(res0, res1));
+ res2 = _mm_add_epi32(_mm_unpacklo_epi32(res2, res3),
+ _mm_unpackhi_epi32(res2, res3));
+
+ res0 = _mm_unpacklo_epi64(res0, res2);
+ _mm_storeu_si128((__m128i *)sad_array, res0);
+}
+
+#define MASK_SAD8XH_ONE_REF(idx) \
+ const __m128i a##idx##0 = _mm_loadl_epi64((__m128i *)ref##idx); \
+ const __m128i a##idx##1 = _mm_loadl_epi64((__m128i *)(ref##idx + a_stride)); \
+ data_l = _mm_unpacklo_epi8(a##idx##0, b0); \
+ mask_l = _mm_unpacklo_epi8(m, m_inv); \
+ pred_l = _mm_maddubs_epi16(data_l, mask_l); \
+ pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS); \
+ \
+ data_r = _mm_unpacklo_epi8(a##idx##1, b1); \
+ mask_r = _mm_unpackhi_epi8(m, m_inv); \
+ pred_r = _mm_maddubs_epi16(data_r, mask_r); \
+ pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS); \
+ \
+ pred = _mm_packus_epi16(pred_l, pred_r); \
+ res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src));
+
+void aom_masked_sad8xhx4d_ssse3(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_array[4], int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride, int height,
+ int inv_mask, unsigned sad_array[4]) {
+ const uint8_t *ref0 = ref_array[0];
+ const uint8_t *ref1 = ref_array[1];
+ const uint8_t *ref2 = ref_array[2];
+ const uint8_t *ref3 = ref_array[3];
+ __m128i data_l, data_r, pred_l, pred_r, mask_l, mask_r, pred;
+ __m128i res0 = _mm_setzero_si128();
+ __m128i res1 = _mm_setzero_si128();
+ __m128i res2 = _mm_setzero_si128();
+ __m128i res3 = _mm_setzero_si128();
+ const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+
+ for (int y = 0; y < height; y += 2) {
+ const __m128i src = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)src_ptr),
+ _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride)));
+ const __m128i b0 = _mm_loadl_epi64((__m128i *)b_ptr);
+ const __m128i b1 = _mm_loadl_epi64((__m128i *)(b_ptr + b_stride));
+ const __m128i m0 = _mm_loadl_epi64((__m128i *)m_ptr);
+ const __m128i m1 = _mm_loadl_epi64((__m128i *)(m_ptr + m_stride));
+ __m128i m_copy = _mm_unpacklo_epi64(m0, m1);
+ __m128i m_inv = _mm_sub_epi8(mask_max, m_copy);
+ __m128i m = inv_mask ? m_inv : m_copy;
+ m_inv = inv_mask ? m_copy : m_inv;
+
+ MASK_SAD8XH_ONE_REF(0)
+ MASK_SAD8XH_ONE_REF(1)
+ MASK_SAD8XH_ONE_REF(2)
+ MASK_SAD8XH_ONE_REF(3)
+
+ ref0 += 2 * a_stride;
+ ref1 += 2 * a_stride;
+ ref2 += 2 * a_stride;
+ ref3 += 2 * a_stride;
+ src_ptr += 2 * src_stride;
+ b_ptr += 2 * b_stride;
+ m_ptr += 2 * m_stride;
+ }
+ res0 = _mm_add_epi32(_mm_unpacklo_epi32(res0, res1),
+ _mm_unpackhi_epi32(res0, res1));
+ res2 = _mm_add_epi32(_mm_unpacklo_epi32(res2, res3),
+ _mm_unpackhi_epi32(res2, res3));
+ res0 = _mm_unpacklo_epi64(res0, res2);
+ _mm_storeu_si128((__m128i *)sad_array, res0);
+}
+
+#define MASK_SAD4XH_ONE_REF(idx) \
+ a = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)ref##idx), \
+ _mm_cvtsi32_si128(*(int *)&ref##idx[a_stride])); \
+ data = _mm_unpacklo_epi8(a, b); \
+ mask = _mm_unpacklo_epi8(m, m_inv); \
+ pred = _mm_maddubs_epi16(data, mask); \
+ pred = xx_roundn_epu16(pred, AOM_BLEND_A64_ROUND_BITS); \
+ \
+ pred = _mm_packus_epi16(pred, _mm_setzero_si128()); \
+ res##idx = _mm_add_epi32(res##idx, _mm_sad_epu8(pred, src));
+
+void aom_masked_sad4xhx4d_ssse3(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_array[4], int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride, int height,
+ int inv_mask, unsigned sad_array[4]) {
+ const uint8_t *ref0 = ref_array[0];
+ const uint8_t *ref1 = ref_array[1];
+ const uint8_t *ref2 = ref_array[2];
+ const uint8_t *ref3 = ref_array[3];
+ __m128i data, pred, mask;
+ __m128i res0 = _mm_setzero_si128();
+ __m128i res1 = _mm_setzero_si128();
+ __m128i res2 = _mm_setzero_si128();
+ __m128i res3 = _mm_setzero_si128();
+ __m128i a;
+ const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+
+ for (int y = 0; y < height; y += 2) {
+ const __m128i src =
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)src_ptr),
+ _mm_cvtsi32_si128(*(int *)&src_ptr[src_stride]));
+ const __m128i b =
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)b_ptr),
+ _mm_cvtsi32_si128(*(int *)&b_ptr[b_stride]));
+ const __m128i m_copy =
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)m_ptr),
+ _mm_cvtsi32_si128(*(int *)&m_ptr[m_stride]));
+
+ __m128i m_inv = _mm_sub_epi8(mask_max, m_copy);
+ __m128i m = inv_mask ? m_inv : m_copy;
+ m_inv = inv_mask ? m_copy : m_inv;
+
+ MASK_SAD4XH_ONE_REF(0)
+ MASK_SAD4XH_ONE_REF(1)
+ MASK_SAD4XH_ONE_REF(2)
+ MASK_SAD4XH_ONE_REF(3)
+
+ ref0 += 2 * a_stride;
+ ref1 += 2 * a_stride;
+ ref2 += 2 * a_stride;
+ ref3 += 2 * a_stride;
+ src_ptr += 2 * src_stride;
+ b_ptr += 2 * b_stride;
+ m_ptr += 2 * m_stride;
+ }
+ res0 = _mm_unpacklo_epi32(res0, res1);
+ res2 = _mm_unpacklo_epi32(res2, res3);
+ res0 = _mm_unpacklo_epi64(res0, res2);
+ _mm_storeu_si128((__m128i *)sad_array, res0);
+}
+
+#define MASKSADMXN_SSSE3(m, n) \
+ void aom_masked_sad##m##x##n##x4d_ssse3( \
+ const uint8_t *src, int src_stride, const uint8_t *ref[4], \
+ int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \
+ int msk_stride, int inv_mask, unsigned sad_array[4]) { \
+ masked_sadx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, m, msk, \
+ msk_stride, m, n, inv_mask, sad_array); \
+ }
+
+#define MASKSAD8XN_SSSE3(n) \
+ void aom_masked_sad8x##n##x4d_ssse3( \
+ const uint8_t *src, int src_stride, const uint8_t *ref[4], \
+ int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \
+ int msk_stride, int inv_mask, unsigned sad_array[4]) { \
+ aom_masked_sad8xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, \
+ 8, msk, msk_stride, n, inv_mask, sad_array); \
+ }
+
+#define MASKSAD4XN_SSSE3(n) \
+ void aom_masked_sad4x##n##x4d_ssse3( \
+ const uint8_t *src, int src_stride, const uint8_t *ref[4], \
+ int ref_stride, const uint8_t *second_pred, const uint8_t *msk, \
+ int msk_stride, int inv_mask, unsigned sad_array[4]) { \
+ aom_masked_sad4xhx4d_ssse3(src, src_stride, ref, ref_stride, second_pred, \
+ 4, msk, msk_stride, n, inv_mask, sad_array); \
+ }
+
+MASKSADMXN_SSSE3(128, 128)
+MASKSADMXN_SSSE3(128, 64)
+MASKSADMXN_SSSE3(64, 128)
+MASKSADMXN_SSSE3(64, 64)
+MASKSADMXN_SSSE3(64, 32)
+MASKSADMXN_SSSE3(32, 64)
+MASKSADMXN_SSSE3(32, 32)
+MASKSADMXN_SSSE3(32, 16)
+MASKSADMXN_SSSE3(16, 32)
+MASKSADMXN_SSSE3(16, 16)
+MASKSADMXN_SSSE3(16, 8)
+MASKSAD8XN_SSSE3(16)
+MASKSAD8XN_SSSE3(8)
+MASKSAD8XN_SSSE3(4)
+MASKSAD4XN_SSSE3(8)
+MASKSAD4XN_SSSE3(4)
+MASKSAD4XN_SSSE3(16)
+MASKSADMXN_SSSE3(16, 4)
+MASKSAD8XN_SSSE3(32)
+MASKSADMXN_SSSE3(32, 8)
+MASKSADMXN_SSSE3(16, 64)
+MASKSADMXN_SSSE3(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c
new file mode 100644
index 0000000000..2c022555b5
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_avx2.c
@@ -0,0 +1,389 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/masked_sad_intrin_ssse3.h"
+
+static INLINE unsigned int masked_sad32xh_avx2(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
+ int width, int height) {
+ int x, y;
+ __m256i res = _mm256_setzero_si256();
+ const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m256i round_scale =
+ _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x += 32) {
+ const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]);
+ const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]);
+ const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]);
+ const __m256i m = _mm256_lddqu_si256((const __m256i *)&m_ptr[x]);
+ const __m256i m_inv = _mm256_sub_epi8(mask_max, m);
+
+ // Calculate 16 predicted pixels.
+ // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
+ // is 64 * 255, so we have plenty of space to add rounding constants.
+ const __m256i data_l = _mm256_unpacklo_epi8(a, b);
+ const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv);
+ __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l);
+ pred_l = _mm256_mulhrs_epi16(pred_l, round_scale);
+
+ const __m256i data_r = _mm256_unpackhi_epi8(a, b);
+ const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv);
+ __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r);
+ pred_r = _mm256_mulhrs_epi16(pred_r, round_scale);
+
+ const __m256i pred = _mm256_packus_epi16(pred_l, pred_r);
+ res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src));
+ }
+
+ src_ptr += src_stride;
+ a_ptr += a_stride;
+ b_ptr += b_stride;
+ m_ptr += m_stride;
+ }
+  // At this point, we have four 32-bit partial SADs, one in each 64-bit lane
+  // of 'res'.
+ res = _mm256_shuffle_epi32(res, 0xd8);
+ res = _mm256_permute4x64_epi64(res, 0xd8);
+ res = _mm256_hadd_epi32(res, res);
+ res = _mm256_hadd_epi32(res, res);
+ int32_t sad = _mm256_extract_epi32(res, 0);
+ return sad;
+}
+
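+// Load two unaligned 128-bit blocks into one 256-bit register ('lo' into the
+// low lane, 'hi' into the high lane).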
+static INLINE __m256i xx_loadu2_m128i(const void *hi, const void *lo) {
+ __m128i a0 = _mm_lddqu_si128((const __m128i *)(lo));
+ __m128i a1 = _mm_lddqu_si128((const __m128i *)(hi));
+ __m256i a = _mm256_castsi128_si256(a0);
+ return _mm256_inserti128_si256(a, a1, 1);
+}
+
+static INLINE unsigned int masked_sad16xh_avx2(
+ const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
+ int height) {
+ int y;
+ __m256i res = _mm256_setzero_si256();
+ const __m256i mask_max = _mm256_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m256i round_scale =
+ _mm256_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ for (y = 0; y < height; y += 2) {
+ const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr);
+ const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr);
+ const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr);
+ const __m256i m = xx_loadu2_m128i(m_ptr + m_stride, m_ptr);
+ const __m256i m_inv = _mm256_sub_epi8(mask_max, m);
+
+ // Calculate 16 predicted pixels.
+ // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
+ // is 64 * 255, so we have plenty of space to add rounding constants.
+ const __m256i data_l = _mm256_unpacklo_epi8(a, b);
+ const __m256i mask_l = _mm256_unpacklo_epi8(m, m_inv);
+ __m256i pred_l = _mm256_maddubs_epi16(data_l, mask_l);
+ pred_l = _mm256_mulhrs_epi16(pred_l, round_scale);
+
+ const __m256i data_r = _mm256_unpackhi_epi8(a, b);
+ const __m256i mask_r = _mm256_unpackhi_epi8(m, m_inv);
+ __m256i pred_r = _mm256_maddubs_epi16(data_r, mask_r);
+ pred_r = _mm256_mulhrs_epi16(pred_r, round_scale);
+
+ const __m256i pred = _mm256_packus_epi16(pred_l, pred_r);
+ res = _mm256_add_epi32(res, _mm256_sad_epu8(pred, src));
+
+ src_ptr += src_stride << 1;
+ a_ptr += a_stride << 1;
+ b_ptr += b_stride << 1;
+ m_ptr += m_stride << 1;
+ }
+  // At this point, we have four 32-bit partial SADs, one in each 64-bit lane
+  // of 'res'.
+ res = _mm256_shuffle_epi32(res, 0xd8);
+ res = _mm256_permute4x64_epi64(res, 0xd8);
+ res = _mm256_hadd_epi32(res, res);
+ res = _mm256_hadd_epi32(res, res);
+ int32_t sad = _mm256_extract_epi32(res, 0);
+ return sad;
+}
+
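+// Dispatch on block width: 4- and 8-wide blocks reuse the SSSE3 kernels,
+// 16-wide blocks use the two-rows-per-iteration AVX2 kernel, and wider blocks
+// use the 32-pixels-per-iteration kernel. An inverted mask is handled by
+// swapping 'ref' and 'second_pred'.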
+static INLINE unsigned int aom_masked_sad_avx2(
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride,
+ int invert_mask, int m, int n) {
+ unsigned int sad;
+ if (!invert_mask) {
+ switch (m) {
+ case 4:
+ sad = aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride,
+ second_pred, m, msk, msk_stride, n);
+ break;
+ case 8:
+ sad = aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride,
+ second_pred, m, msk, msk_stride, n);
+ break;
+ case 16:
+ sad = masked_sad16xh_avx2(src, src_stride, ref, ref_stride, second_pred,
+ m, msk, msk_stride, n);
+ break;
+ default:
+ sad = masked_sad32xh_avx2(src, src_stride, ref, ref_stride, second_pred,
+ m, msk, msk_stride, m, n);
+ break;
+ }
+ } else {
+ switch (m) {
+ case 4:
+ sad = aom_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, n);
+ break;
+ case 8:
+ sad = aom_masked_sad8xh_ssse3(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, n);
+ break;
+ case 16:
+ sad = masked_sad16xh_avx2(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, n);
+ break;
+ default:
+ sad = masked_sad32xh_avx2(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, m, n);
+ break;
+ }
+ }
+ return sad;
+}
+
+#define MASKSADMXN_AVX2(m, n) \
+ unsigned int aom_masked_sad##m##x##n##_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
+ int invert_mask) { \
+ return aom_masked_sad_avx2(src, src_stride, ref, ref_stride, second_pred, \
+ msk, msk_stride, invert_mask, m, n); \
+ }
+
+MASKSADMXN_AVX2(4, 4)
+MASKSADMXN_AVX2(4, 8)
+MASKSADMXN_AVX2(8, 4)
+MASKSADMXN_AVX2(8, 8)
+MASKSADMXN_AVX2(8, 16)
+MASKSADMXN_AVX2(16, 8)
+MASKSADMXN_AVX2(16, 16)
+MASKSADMXN_AVX2(16, 32)
+MASKSADMXN_AVX2(32, 16)
+MASKSADMXN_AVX2(32, 32)
+MASKSADMXN_AVX2(32, 64)
+MASKSADMXN_AVX2(64, 32)
+MASKSADMXN_AVX2(64, 64)
+MASKSADMXN_AVX2(64, 128)
+MASKSADMXN_AVX2(128, 64)
+MASKSADMXN_AVX2(128, 128)
+MASKSADMXN_AVX2(4, 16)
+MASKSADMXN_AVX2(16, 4)
+MASKSADMXN_AVX2(8, 32)
+MASKSADMXN_AVX2(32, 8)
+MASKSADMXN_AVX2(16, 64)
+MASKSADMXN_AVX2(64, 16)
+
+static INLINE unsigned int highbd_masked_sad8xh_avx2(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
+ int height) {
+ const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
+ int y;
+ __m256i res = _mm256_setzero_si256();
+ const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m256i round_const =
+ _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m256i one = _mm256_set1_epi16(1);
+
+ for (y = 0; y < height; y += 2) {
+ const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr);
+ const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr);
+ const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr);
+ // Zero-extend mask to 16 bits
+ const __m256i m = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)(m_ptr)),
+ _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride))));
+ const __m256i m_inv = _mm256_sub_epi16(mask_max, m);
+
+ const __m256i data_l = _mm256_unpacklo_epi16(a, b);
+ const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv);
+ __m256i pred_l = _mm256_madd_epi16(data_l, mask_l);
+ pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m256i data_r = _mm256_unpackhi_epi16(a, b);
+ const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv);
+ __m256i pred_r = _mm256_madd_epi16(data_r, mask_r);
+ pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
+ // so it is safe to do signed saturation here.
+ const __m256i pred = _mm256_packs_epi32(pred_l, pred_r);
+    // There is no 16-bit SAD instruction, so we synthesize one by
+    // accumulating 32-bit partial sums in 'res' (via madd against 1) and
+    // reducing them at the end.
+ const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src));
+ res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one));
+
+ src_ptr += src_stride << 1;
+ a_ptr += a_stride << 1;
+ b_ptr += b_stride << 1;
+ m_ptr += m_stride << 1;
+ }
+  // At this point, we have eight 32-bit partial SADs stored in 'res'.
+ res = _mm256_hadd_epi32(res, res);
+ res = _mm256_hadd_epi32(res, res);
+ int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4);
+ return sad;
+}
+
+static INLINE unsigned int highbd_masked_sad16xh_avx2(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
+ int width, int height) {
+ const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
+ int x, y;
+ __m256i res = _mm256_setzero_si256();
+ const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m256i round_const =
+ _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m256i one = _mm256_set1_epi16(1);
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x += 16) {
+ const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]);
+ const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]);
+ const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]);
+ // Zero-extend mask to 16 bits
+ const __m256i m =
+ _mm256_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)&m_ptr[x]));
+ const __m256i m_inv = _mm256_sub_epi16(mask_max, m);
+
+ const __m256i data_l = _mm256_unpacklo_epi16(a, b);
+ const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv);
+ __m256i pred_l = _mm256_madd_epi16(data_l, mask_l);
+ pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m256i data_r = _mm256_unpackhi_epi16(a, b);
+ const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv);
+ __m256i pred_r = _mm256_madd_epi16(data_r, mask_r);
+ pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
+ // so it is safe to do signed saturation here.
+ const __m256i pred = _mm256_packs_epi32(pred_l, pred_r);
+      // There is no 16-bit SAD instruction, so we synthesize one by
+      // accumulating 32-bit partial sums in 'res' (via madd against 1) and
+      // reducing them at the end.
+ const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src));
+ res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one));
+ }
+
+ src_ptr += src_stride;
+ a_ptr += a_stride;
+ b_ptr += b_stride;
+ m_ptr += m_stride;
+ }
+  // At this point, we have eight 32-bit partial SADs stored in 'res'.
+ res = _mm256_hadd_epi32(res, res);
+ res = _mm256_hadd_epi32(res, res);
+ int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4);
+ return sad;
+}
+
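+// Same dispatch for high bit depth: width 4 uses the SSSE3 kernel, width 8
+// the two-row AVX2 kernel, and wider widths the 16-pixels-per-iteration
+// kernel; an inverted mask swaps 'ref' and 'second_pred'.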
+static INLINE unsigned int aom_highbd_masked_sad_avx2(
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride,
+ int invert_mask, int m, int n) {
+ unsigned int sad;
+ if (!invert_mask) {
+ switch (m) {
+ case 4:
+ sad =
+ aom_highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride,
+ second_pred, m, msk, msk_stride, n);
+ break;
+ case 8:
+ sad = highbd_masked_sad8xh_avx2(src, src_stride, ref, ref_stride,
+ second_pred, m, msk, msk_stride, n);
+ break;
+ default:
+ sad = highbd_masked_sad16xh_avx2(src, src_stride, ref, ref_stride,
+ second_pred, m, msk, msk_stride, m, n);
+ break;
+ }
+ } else {
+ switch (m) {
+ case 4:
+ sad =
+ aom_highbd_masked_sad4xh_ssse3(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, n);
+ break;
+ case 8:
+ sad = highbd_masked_sad8xh_avx2(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, n);
+ break;
+ default:
+ sad = highbd_masked_sad16xh_avx2(src, src_stride, second_pred, m, ref,
+ ref_stride, msk, msk_stride, m, n);
+ break;
+ }
+ }
+ return sad;
+}
+
+#define HIGHBD_MASKSADMXN_AVX2(m, n) \
+ unsigned int aom_highbd_masked_sad##m##x##n##_avx2( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \
+ int msk_stride, int invert_mask) { \
+ return aom_highbd_masked_sad_avx2(src8, src_stride, ref8, ref_stride, \
+ second_pred8, msk, msk_stride, \
+ invert_mask, m, n); \
+ }
+
+HIGHBD_MASKSADMXN_AVX2(4, 4)
+HIGHBD_MASKSADMXN_AVX2(4, 8)
+HIGHBD_MASKSADMXN_AVX2(8, 4)
+HIGHBD_MASKSADMXN_AVX2(8, 8)
+HIGHBD_MASKSADMXN_AVX2(8, 16)
+HIGHBD_MASKSADMXN_AVX2(16, 8)
+HIGHBD_MASKSADMXN_AVX2(16, 16)
+HIGHBD_MASKSADMXN_AVX2(16, 32)
+HIGHBD_MASKSADMXN_AVX2(32, 16)
+HIGHBD_MASKSADMXN_AVX2(32, 32)
+HIGHBD_MASKSADMXN_AVX2(32, 64)
+HIGHBD_MASKSADMXN_AVX2(64, 32)
+HIGHBD_MASKSADMXN_AVX2(64, 64)
+HIGHBD_MASKSADMXN_AVX2(64, 128)
+HIGHBD_MASKSADMXN_AVX2(128, 64)
+HIGHBD_MASKSADMXN_AVX2(128, 128)
+HIGHBD_MASKSADMXN_AVX2(4, 16)
+HIGHBD_MASKSADMXN_AVX2(16, 4)
+HIGHBD_MASKSADMXN_AVX2(8, 32)
+HIGHBD_MASKSADMXN_AVX2(32, 8)
+HIGHBD_MASKSADMXN_AVX2(16, 64)
+HIGHBD_MASKSADMXN_AVX2(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
new file mode 100644
index 0000000000..df3a8764e3
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.c
@@ -0,0 +1,400 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdio.h>
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#include "aom_dsp/x86/masked_sad_intrin_ssse3.h"
+
+// For width a multiple of 16
+static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int width, int height);
+
+#define MASKSADMXN_SSSE3(m, n) \
+ unsigned int aom_masked_sad##m##x##n##_ssse3( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
+ int invert_mask) { \
+ if (!invert_mask) \
+ return masked_sad_ssse3(src, src_stride, ref, ref_stride, second_pred, \
+ m, msk, msk_stride, m, n); \
+ else \
+ return masked_sad_ssse3(src, src_stride, second_pred, m, ref, \
+ ref_stride, msk, msk_stride, m, n); \
+ }
+
+#define MASKSAD8XN_SSSE3(n) \
+ unsigned int aom_masked_sad8x##n##_ssse3( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
+ int invert_mask) { \
+ if (!invert_mask) \
+ return aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, \
+ second_pred, 8, msk, msk_stride, n); \
+ else \
+ return aom_masked_sad8xh_ssse3(src, src_stride, second_pred, 8, ref, \
+ ref_stride, msk, msk_stride, n); \
+ }
+
+#define MASKSAD4XN_SSSE3(n) \
+ unsigned int aom_masked_sad4x##n##_ssse3( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ const uint8_t *second_pred, const uint8_t *msk, int msk_stride, \
+ int invert_mask) { \
+ if (!invert_mask) \
+ return aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, \
+ second_pred, 4, msk, msk_stride, n); \
+ else \
+ return aom_masked_sad4xh_ssse3(src, src_stride, second_pred, 4, ref, \
+ ref_stride, msk, msk_stride, n); \
+ }
+
+MASKSADMXN_SSSE3(128, 128)
+MASKSADMXN_SSSE3(128, 64)
+MASKSADMXN_SSSE3(64, 128)
+MASKSADMXN_SSSE3(64, 64)
+MASKSADMXN_SSSE3(64, 32)
+MASKSADMXN_SSSE3(32, 64)
+MASKSADMXN_SSSE3(32, 32)
+MASKSADMXN_SSSE3(32, 16)
+MASKSADMXN_SSSE3(16, 32)
+MASKSADMXN_SSSE3(16, 16)
+MASKSADMXN_SSSE3(16, 8)
+MASKSAD8XN_SSSE3(16)
+MASKSAD8XN_SSSE3(8)
+MASKSAD8XN_SSSE3(4)
+MASKSAD4XN_SSSE3(8)
+MASKSAD4XN_SSSE3(4)
+MASKSAD4XN_SSSE3(16)
+MASKSADMXN_SSSE3(16, 4)
+MASKSAD8XN_SSSE3(32)
+MASKSADMXN_SSSE3(32, 8)
+MASKSADMXN_SSSE3(16, 64)
+MASKSADMXN_SSSE3(64, 16)
+
+static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
+ int src_stride,
+ const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int width, int height) {
+ int x, y;
+ __m128i res = _mm_setzero_si128();
+ const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x += 16) {
+ const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
+ const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
+ const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
+ const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
+ const __m128i m_inv = _mm_sub_epi8(mask_max, m);
+
+ // Calculate 16 predicted pixels.
+ // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
+ // is 64 * 255, so we have plenty of space to add rounding constants.
+ const __m128i data_l = _mm_unpacklo_epi8(a, b);
+ const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
+ __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
+ pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i data_r = _mm_unpackhi_epi8(a, b);
+ const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
+ __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
+ pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i pred = _mm_packus_epi16(pred_l, pred_r);
+ res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));
+ }
+
+ src_ptr += src_stride;
+ a_ptr += a_stride;
+ b_ptr += b_stride;
+ m_ptr += m_stride;
+ }
+ // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'.
+ unsigned int sad = (unsigned int)(_mm_cvtsi128_si32(res) +
+ _mm_cvtsi128_si32(_mm_srli_si128(res, 8)));
+ return sad;
+}
+
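+// For reference, the per-pixel arithmetic behind the SSSE3 kernels in this
+// file is the A64 blend followed by a SAD:
+//   pred = ROUND_POWER_OF_TWO(
+//       m * a + (AOM_BLEND_A64_MAX_ALPHA - m) * b, AOM_BLEND_A64_ROUND_BITS);
+//   sad += abs(pred - src);
+// with mask values m in [0, 64]. _mm_maddubs_epi16() on the interleaved
+// (a, b) and (m, 64 - m) bytes computes the weighted sum, xx_roundn_epu16()
+// applies the rounding shift, and _mm_sad_epu8() accumulates the absolute
+// differences 16 pixels at a time.
+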
+unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int height) {
+ int y;
+ __m128i res = _mm_setzero_si128();
+ const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+
+ for (y = 0; y < height; y += 2) {
+ const __m128i src = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)src_ptr),
+ _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
+ const __m128i a0 = _mm_loadl_epi64((const __m128i *)a_ptr);
+ const __m128i a1 = _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]);
+ const __m128i b0 = _mm_loadl_epi64((const __m128i *)b_ptr);
+ const __m128i b1 = _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]);
+ const __m128i m =
+ _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr),
+ _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride]));
+ const __m128i m_inv = _mm_sub_epi8(mask_max, m);
+
+ const __m128i data_l = _mm_unpacklo_epi8(a0, b0);
+ const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
+ __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
+ pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i data_r = _mm_unpacklo_epi8(a1, b1);
+ const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
+ __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
+ pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i pred = _mm_packus_epi16(pred_l, pred_r);
+ res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));
+
+ src_ptr += src_stride * 2;
+ a_ptr += a_stride * 2;
+ b_ptr += b_stride * 2;
+ m_ptr += m_stride * 2;
+ }
+ unsigned int sad = (unsigned int)(_mm_cvtsi128_si32(res) +
+ _mm_cvtsi128_si32(_mm_srli_si128(res, 8)));
+ return sad;
+}
+
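+// Note on the 8-wide kernel above: 'm' packs two mask rows into one
+// register, so the second row's interleaved mask comes from
+// _mm_unpackhi_epi8(m, m_inv), while the second row's pixels (a1, b1) were
+// loaded into the low halves of their registers and therefore use
+// _mm_unpacklo_epi8().
+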
+unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int height) {
+ int y;
+ __m128i res = _mm_setzero_si128();
+ const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+
+ for (y = 0; y < height; y += 2) {
+    // Load two rows at a time; this seems to be a bit faster
+    // than four rows at a time in this case.
+ const __m128i src =
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)src_ptr),
+ _mm_cvtsi32_si128(*(int *)&src_ptr[src_stride]));
+ const __m128i a =
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)a_ptr),
+ _mm_cvtsi32_si128(*(int *)&a_ptr[a_stride]));
+ const __m128i b =
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)b_ptr),
+ _mm_cvtsi32_si128(*(int *)&b_ptr[b_stride]));
+ const __m128i m =
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)m_ptr),
+ _mm_cvtsi32_si128(*(int *)&m_ptr[m_stride]));
+ const __m128i m_inv = _mm_sub_epi8(mask_max, m);
+
+ const __m128i data = _mm_unpacklo_epi8(a, b);
+ const __m128i mask = _mm_unpacklo_epi8(m, m_inv);
+ __m128i pred_16bit = _mm_maddubs_epi16(data, mask);
+ pred_16bit = xx_roundn_epu16(pred_16bit, AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i pred = _mm_packus_epi16(pred_16bit, _mm_setzero_si128());
+ res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));
+
+ src_ptr += src_stride * 2;
+ a_ptr += a_stride * 2;
+ b_ptr += b_stride * 2;
+ m_ptr += m_stride * 2;
+ }
+ // At this point, the SAD is stored in lane 0 of 'res'
+ return (unsigned int)_mm_cvtsi128_si32(res);
+}
+
+// For width a multiple of 8
+static INLINE unsigned int highbd_masked_sad_ssse3(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
+ int width, int height);
+
+#define HIGHBD_MASKSADMXN_SSSE3(m, n) \
+ unsigned int aom_highbd_masked_sad##m##x##n##_ssse3( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \
+ int msk_stride, int invert_mask) { \
+ if (!invert_mask) \
+ return highbd_masked_sad_ssse3(src8, src_stride, ref8, ref_stride, \
+ second_pred8, m, msk, msk_stride, m, n); \
+ else \
+ return highbd_masked_sad_ssse3(src8, src_stride, second_pred8, m, ref8, \
+ ref_stride, msk, msk_stride, m, n); \
+ }
+
+#define HIGHBD_MASKSAD4XN_SSSE3(n) \
+ unsigned int aom_highbd_masked_sad4x##n##_ssse3( \
+ const uint8_t *src8, int src_stride, const uint8_t *ref8, \
+ int ref_stride, const uint8_t *second_pred8, const uint8_t *msk, \
+ int msk_stride, int invert_mask) { \
+ if (!invert_mask) \
+ return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, ref8, \
+ ref_stride, second_pred8, 4, msk, \
+ msk_stride, n); \
+ else \
+ return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, second_pred8, 4, \
+ ref8, ref_stride, msk, msk_stride, \
+ n); \
+ }
+
+HIGHBD_MASKSADMXN_SSSE3(128, 128)
+HIGHBD_MASKSADMXN_SSSE3(128, 64)
+HIGHBD_MASKSADMXN_SSSE3(64, 128)
+HIGHBD_MASKSADMXN_SSSE3(64, 64)
+HIGHBD_MASKSADMXN_SSSE3(64, 32)
+HIGHBD_MASKSADMXN_SSSE3(32, 64)
+HIGHBD_MASKSADMXN_SSSE3(32, 32)
+HIGHBD_MASKSADMXN_SSSE3(32, 16)
+HIGHBD_MASKSADMXN_SSSE3(16, 32)
+HIGHBD_MASKSADMXN_SSSE3(16, 16)
+HIGHBD_MASKSADMXN_SSSE3(16, 8)
+HIGHBD_MASKSADMXN_SSSE3(8, 16)
+HIGHBD_MASKSADMXN_SSSE3(8, 8)
+HIGHBD_MASKSADMXN_SSSE3(8, 4)
+HIGHBD_MASKSAD4XN_SSSE3(8)
+HIGHBD_MASKSAD4XN_SSSE3(4)
+HIGHBD_MASKSAD4XN_SSSE3(16)
+HIGHBD_MASKSADMXN_SSSE3(16, 4)
+HIGHBD_MASKSADMXN_SSSE3(8, 32)
+HIGHBD_MASKSADMXN_SSSE3(32, 8)
+HIGHBD_MASKSADMXN_SSSE3(16, 64)
+HIGHBD_MASKSADMXN_SSSE3(64, 16)
+
+static INLINE unsigned int highbd_masked_sad_ssse3(
+ const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
+ int width, int height) {
+ const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
+ int x, y;
+ __m128i res = _mm_setzero_si128();
+ const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m128i round_const =
+ _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x += 8) {
+ const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
+ const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
+ const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
+ // Zero-extend mask to 16 bits
+ const __m128i m = _mm_unpacklo_epi8(
+ _mm_loadl_epi64((const __m128i *)&m_ptr[x]), _mm_setzero_si128());
+ const __m128i m_inv = _mm_sub_epi16(mask_max, m);
+
+ const __m128i data_l = _mm_unpacklo_epi16(a, b);
+ const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
+ __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
+ pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i data_r = _mm_unpackhi_epi16(a, b);
+ const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
+ __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
+ pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
+ // so it is safe to do signed saturation here.
+ const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
+ // There is no 16-bit SAD instruction, so we have to synthesize
+ // an 8-element SAD. We do this by storing 4 32-bit partial SADs,
+ // and accumulating them at the end
+ const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
+ res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));
+ }
+
+ src_ptr += src_stride;
+ a_ptr += a_stride;
+ b_ptr += b_stride;
+ m_ptr += m_stride;
+ }
+ // At this point, we have four 32-bit partial SADs stored in 'res'.
+ res = _mm_hadd_epi32(res, res);
+ res = _mm_hadd_epi32(res, res);
+ int sad = _mm_cvtsi128_si32(res);
+ return sad;
+}
+
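+// The "synthesize an 8-element SAD" trick above relies on the value ranges:
+// both 'pred' and 'src' are at most 2^12 - 1, so each absolute difference
+// fits easily in 16 bits, _mm_madd_epi16(diff, one) folds adjacent pairs
+// into four 32-bit partial sums, and even a full 128x128 block keeps the
+// total SAD below 2^26, far inside the 32-bit accumulators.
+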
+unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride,
+ const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int height) {
+ const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
+ const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
+ const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
+ int y;
+ __m128i res = _mm_setzero_si128();
+ const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m128i round_const =
+ _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m128i one = _mm_set1_epi16(1);
+
+ for (y = 0; y < height; y += 2) {
+ const __m128i src = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)src_ptr),
+ _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
+ const __m128i a =
+ _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)a_ptr),
+ _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]));
+ const __m128i b =
+ _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)b_ptr),
+ _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]));
+ // Zero-extend mask to 16 bits
+ const __m128i m = _mm_unpacklo_epi8(
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)m_ptr),
+ _mm_cvtsi32_si128(*(const int *)&m_ptr[m_stride])),
+ _mm_setzero_si128());
+ const __m128i m_inv = _mm_sub_epi16(mask_max, m);
+
+ const __m128i data_l = _mm_unpacklo_epi16(a, b);
+ const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
+ __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
+ pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i data_r = _mm_unpackhi_epi16(a, b);
+ const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
+ __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
+ pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
+ const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
+ res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));
+
+ src_ptr += src_stride * 2;
+ a_ptr += a_stride * 2;
+ b_ptr += b_stride * 2;
+ m_ptr += m_stride * 2;
+ }
+ res = _mm_hadd_epi32(res, res);
+ res = _mm_hadd_epi32(res, res);
+ int sad = _mm_cvtsi128_si32(res);
+ return sad;
+}
diff --git a/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h
new file mode 100644
index 0000000000..cffbd9672c
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_sad_intrin_ssse3.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
+#define AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
+
+unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int height);
+
+unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int height);
+
+unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8, int src_stride,
+ const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int height);
+
+#endif // AOM_AOM_DSP_X86_MASKED_SAD_INTRIN_SSSE3_H_
diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
new file mode 100644
index 0000000000..0bf383fffd
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.c
@@ -0,0 +1,1067 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdlib.h>
+#include <string.h>
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/masked_variance_intrin_ssse3.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_ports/mem.h"
+
+// For width a multiple of 16
+static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset,
+ int yoffset, uint8_t *dst, int w, int h);
+
+static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset,
+ int yoffset, uint8_t *dst, int h);
+
+static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset,
+ int yoffset, uint8_t *dst, int h);
+
+// For width a multiple of 16
+static void masked_variance(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride, int width,
+ int height, unsigned int *sse, int *sum_);
+
+static void masked_variance8xh(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, const uint8_t *b_ptr,
+ const uint8_t *m_ptr, int m_stride, int height,
+ unsigned int *sse, int *sum_);
+
+static void masked_variance4xh(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, const uint8_t *b_ptr,
+ const uint8_t *m_ptr, int m_stride, int height,
+ unsigned int *sse, int *sum_);
+
+#define MASK_SUBPIX_VAR_SSSE3(W, H) \
+ unsigned int aom_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ int sum; \
+ uint8_t temp[(H + 1) * W]; \
+ \
+ bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \
+ \
+ if (!invert_mask) \
+ masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \
+ msk_stride, W, H, sse, &sum); \
+ else \
+ masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \
+ msk_stride, W, H, sse, &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \
+ }
+
+#define MASK_SUBPIX_VAR8XH_SSSE3(H) \
+ unsigned int aom_masked_sub_pixel_variance8x##H##_ssse3( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ int sum; \
+ uint8_t temp[(H + 1) * 8]; \
+ \
+ bilinear_filter8xh(src, src_stride, xoffset, yoffset, temp, H); \
+ \
+ if (!invert_mask) \
+ masked_variance8xh(ref, ref_stride, temp, second_pred, msk, msk_stride, \
+ H, sse, &sum); \
+ else \
+ masked_variance8xh(ref, ref_stride, second_pred, temp, msk, msk_stride, \
+ H, sse, &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (8 * H)); \
+ }
+
+#define MASK_SUBPIX_VAR4XH_SSSE3(H) \
+ unsigned int aom_masked_sub_pixel_variance4x##H##_ssse3( \
+ const uint8_t *src, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \
+ const uint8_t *msk, int msk_stride, int invert_mask, \
+ unsigned int *sse) { \
+ int sum; \
+ uint8_t temp[(H + 1) * 4]; \
+ \
+ bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \
+ \
+ if (!invert_mask) \
+ masked_variance4xh(ref, ref_stride, temp, second_pred, msk, msk_stride, \
+ H, sse, &sum); \
+ else \
+ masked_variance4xh(ref, ref_stride, second_pred, temp, msk, msk_stride, \
+ H, sse, &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H)); \
+ }
+
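+// Each generated function below returns the block variance scaled by the
+// pixel count: with d_i = pred_i - src_i over N = W * H pixels,
+//   *sse - (sum * sum) / N  ==  sum(d_i^2) - (sum(d_i))^2 / N,
+// i.e. the usual E[X^2] - E[X]^2 identity multiplied through by N.
+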
+MASK_SUBPIX_VAR_SSSE3(128, 128)
+MASK_SUBPIX_VAR_SSSE3(128, 64)
+MASK_SUBPIX_VAR_SSSE3(64, 128)
+MASK_SUBPIX_VAR_SSSE3(64, 64)
+MASK_SUBPIX_VAR_SSSE3(64, 32)
+MASK_SUBPIX_VAR_SSSE3(32, 64)
+MASK_SUBPIX_VAR_SSSE3(32, 32)
+MASK_SUBPIX_VAR_SSSE3(32, 16)
+MASK_SUBPIX_VAR_SSSE3(16, 32)
+MASK_SUBPIX_VAR_SSSE3(16, 16)
+MASK_SUBPIX_VAR_SSSE3(16, 8)
+MASK_SUBPIX_VAR8XH_SSSE3(16)
+MASK_SUBPIX_VAR8XH_SSSE3(8)
+MASK_SUBPIX_VAR8XH_SSSE3(4)
+MASK_SUBPIX_VAR4XH_SSSE3(8)
+MASK_SUBPIX_VAR4XH_SSSE3(4)
+MASK_SUBPIX_VAR4XH_SSSE3(16)
+MASK_SUBPIX_VAR_SSSE3(16, 4)
+MASK_SUBPIX_VAR8XH_SSSE3(32)
+MASK_SUBPIX_VAR_SSSE3(32, 8)
+MASK_SUBPIX_VAR_SSSE3(64, 16)
+MASK_SUBPIX_VAR_SSSE3(16, 64)
+
+static INLINE __m128i filter_block(const __m128i a, const __m128i b,
+ const __m128i filter) {
+ __m128i v0 = _mm_unpacklo_epi8(a, b);
+ v0 = _mm_maddubs_epi16(v0, filter);
+ v0 = xx_roundn_epu16(v0, FILTER_BITS);
+
+ __m128i v1 = _mm_unpackhi_epi8(a, b);
+ v1 = _mm_maddubs_epi16(v1, filter);
+ v1 = xx_roundn_epu16(v1, FILTER_BITS);
+
+ return _mm_packus_epi16(v0, v1);
+}
+
+static void bilinear_filter(const uint8_t *src, int src_stride, int xoffset,
+ int yoffset, uint8_t *dst, int w, int h) {
+ int i, j;
+ // Horizontal filter
+ if (xoffset == 0) {
+ uint8_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ for (j = 0; j < w; j += 16) {
+ __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
+ _mm_storeu_si128((__m128i *)&b[j], x);
+ }
+ src += src_stride;
+ b += w;
+ }
+ } else if (xoffset == 4) {
+ uint8_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ for (j = 0; j < w; j += 16) {
+ __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
+ __m128i y = _mm_loadu_si128((__m128i *)&src[j + 16]);
+ __m128i z = _mm_alignr_epi8(y, x, 1);
+ _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu8(x, z));
+ }
+ src += src_stride;
+ b += w;
+ }
+ } else {
+ uint8_t *b = dst;
+ const uint8_t *hfilter = bilinear_filters_2t[xoffset];
+ const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8));
+ for (i = 0; i < h + 1; ++i) {
+ for (j = 0; j < w; j += 16) {
+ const __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
+ const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 16]);
+ const __m128i z = _mm_alignr_epi8(y, x, 1);
+ const __m128i res = filter_block(x, z, hfilter_vec);
+ _mm_storeu_si128((__m128i *)&b[j], res);
+ }
+
+ src += src_stride;
+ b += w;
+ }
+ }
+
+ // Vertical filter
+ if (yoffset == 0) {
+ // The data is already in 'dst', so no need to filter
+ } else if (yoffset == 4) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 16) {
+ __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
+ __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
+ _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu8(x, y));
+ }
+ dst += w;
+ }
+ } else {
+ const uint8_t *vfilter = bilinear_filters_2t[yoffset];
+ const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8));
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 16) {
+ const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
+ const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
+ const __m128i res = filter_block(x, y, vfilter_vec);
+ _mm_storeu_si128((__m128i *)&dst[j], res);
+ }
+
+ dst += w;
+ }
+ }
+}
+
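+// The two filter taps always come from bilinear_filters_2t[offset], whose
+// entries sum to 1 << FILTER_BITS (128), so a filtered pixel is simply
+//   out = ROUND_POWER_OF_TWO(f0 * p0 + f1 * p1, FILTER_BITS);
+// offset 0 degenerates to a copy (f0 = 128, f1 = 0) and offset 4 to an
+// average (f0 = f1 = 64), which is why those cases bypass filter_block()
+// and use plain loads or _mm_avg_epu8() instead.
+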
+static INLINE __m128i filter_block_2rows(const __m128i *a0, const __m128i *b0,
+ const __m128i *a1, const __m128i *b1,
+ const __m128i *filter) {
+ __m128i v0 = _mm_unpacklo_epi8(*a0, *b0);
+ v0 = _mm_maddubs_epi16(v0, *filter);
+ v0 = xx_roundn_epu16(v0, FILTER_BITS);
+
+ __m128i v1 = _mm_unpacklo_epi8(*a1, *b1);
+ v1 = _mm_maddubs_epi16(v1, *filter);
+ v1 = xx_roundn_epu16(v1, FILTER_BITS);
+
+ return _mm_packus_epi16(v0, v1);
+}
+
+static void bilinear_filter8xh(const uint8_t *src, int src_stride, int xoffset,
+ int yoffset, uint8_t *dst, int h) {
+ int i;
+ // Horizontal filter
+ if (xoffset == 0) {
+ uint8_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ __m128i x = _mm_loadl_epi64((__m128i *)src);
+ _mm_storel_epi64((__m128i *)b, x);
+ src += src_stride;
+ b += 8;
+ }
+ } else if (xoffset == 4) {
+ uint8_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ __m128i x = _mm_loadu_si128((__m128i *)src);
+ __m128i z = _mm_srli_si128(x, 1);
+ _mm_storel_epi64((__m128i *)b, _mm_avg_epu8(x, z));
+ src += src_stride;
+ b += 8;
+ }
+ } else {
+ uint8_t *b = dst;
+ const uint8_t *hfilter = bilinear_filters_2t[xoffset];
+ const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8));
+ for (i = 0; i < h; i += 2) {
+ const __m128i x0 = _mm_loadu_si128((__m128i *)src);
+ const __m128i z0 = _mm_srli_si128(x0, 1);
+ const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]);
+ const __m128i z1 = _mm_srli_si128(x1, 1);
+ const __m128i res = filter_block_2rows(&x0, &z0, &x1, &z1, &hfilter_vec);
+ _mm_storeu_si128((__m128i *)b, res);
+
+ src += src_stride * 2;
+ b += 16;
+ }
+ // Handle i = h separately
+ const __m128i x0 = _mm_loadu_si128((__m128i *)src);
+ const __m128i z0 = _mm_srli_si128(x0, 1);
+
+ __m128i v0 = _mm_unpacklo_epi8(x0, z0);
+ v0 = _mm_maddubs_epi16(v0, hfilter_vec);
+ v0 = xx_roundn_epu16(v0, FILTER_BITS);
+
+ _mm_storel_epi64((__m128i *)b, _mm_packus_epi16(v0, v0));
+ }
+
+ // Vertical filter
+ if (yoffset == 0) {
+ // The data is already in 'dst', so no need to filter
+ } else if (yoffset == 4) {
+ for (i = 0; i < h; ++i) {
+ __m128i x = _mm_loadl_epi64((__m128i *)dst);
+ __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]);
+ _mm_storel_epi64((__m128i *)dst, _mm_avg_epu8(x, y));
+ dst += 8;
+ }
+ } else {
+ const uint8_t *vfilter = bilinear_filters_2t[yoffset];
+ const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8));
+ for (i = 0; i < h; i += 2) {
+ const __m128i x = _mm_loadl_epi64((__m128i *)dst);
+ const __m128i y = _mm_loadl_epi64((__m128i *)&dst[8]);
+ const __m128i z = _mm_loadl_epi64((__m128i *)&dst[16]);
+ const __m128i res = filter_block_2rows(&x, &y, &y, &z, &vfilter_vec);
+ _mm_storeu_si128((__m128i *)dst, res);
+
+ dst += 16;
+ }
+ }
+}
+
+static void bilinear_filter4xh(const uint8_t *src, int src_stride, int xoffset,
+ int yoffset, uint8_t *dst, int h) {
+ int i;
+ // Horizontal filter
+ if (xoffset == 0) {
+ uint8_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ __m128i x = xx_loadl_32((__m128i *)src);
+ xx_storel_32(b, x);
+ src += src_stride;
+ b += 4;
+ }
+ } else if (xoffset == 4) {
+ uint8_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ __m128i x = _mm_loadl_epi64((__m128i *)src);
+ __m128i z = _mm_srli_si128(x, 1);
+ xx_storel_32(b, _mm_avg_epu8(x, z));
+ src += src_stride;
+ b += 4;
+ }
+ } else {
+ uint8_t *b = dst;
+ const uint8_t *hfilter = bilinear_filters_2t[xoffset];
+ const __m128i hfilter_vec = _mm_set1_epi16(hfilter[0] | (hfilter[1] << 8));
+ for (i = 0; i < h; i += 4) {
+ const __m128i x0 = _mm_loadl_epi64((__m128i *)src);
+ const __m128i z0 = _mm_srli_si128(x0, 1);
+ const __m128i x1 = _mm_loadl_epi64((__m128i *)&src[src_stride]);
+ const __m128i z1 = _mm_srli_si128(x1, 1);
+ const __m128i x2 = _mm_loadl_epi64((__m128i *)&src[src_stride * 2]);
+ const __m128i z2 = _mm_srli_si128(x2, 1);
+ const __m128i x3 = _mm_loadl_epi64((__m128i *)&src[src_stride * 3]);
+ const __m128i z3 = _mm_srli_si128(x3, 1);
+
+ const __m128i a0 = _mm_unpacklo_epi32(x0, x1);
+ const __m128i b0 = _mm_unpacklo_epi32(z0, z1);
+ const __m128i a1 = _mm_unpacklo_epi32(x2, x3);
+ const __m128i b1 = _mm_unpacklo_epi32(z2, z3);
+ const __m128i res = filter_block_2rows(&a0, &b0, &a1, &b1, &hfilter_vec);
+ _mm_storeu_si128((__m128i *)b, res);
+
+ src += src_stride * 4;
+ b += 16;
+ }
+ // Handle i = h separately
+ const __m128i x = _mm_loadl_epi64((__m128i *)src);
+ const __m128i z = _mm_srli_si128(x, 1);
+
+ __m128i v0 = _mm_unpacklo_epi8(x, z);
+ v0 = _mm_maddubs_epi16(v0, hfilter_vec);
+ v0 = xx_roundn_epu16(v0, FILTER_BITS);
+
+ xx_storel_32(b, _mm_packus_epi16(v0, v0));
+ }
+
+ // Vertical filter
+ if (yoffset == 0) {
+ // The data is already in 'dst', so no need to filter
+ } else if (yoffset == 4) {
+ for (i = 0; i < h; ++i) {
+ __m128i x = xx_loadl_32((__m128i *)dst);
+ __m128i y = xx_loadl_32((__m128i *)&dst[4]);
+ xx_storel_32(dst, _mm_avg_epu8(x, y));
+ dst += 4;
+ }
+ } else {
+ const uint8_t *vfilter = bilinear_filters_2t[yoffset];
+ const __m128i vfilter_vec = _mm_set1_epi16(vfilter[0] | (vfilter[1] << 8));
+ for (i = 0; i < h; i += 4) {
+ const __m128i a = xx_loadl_32((__m128i *)dst);
+ const __m128i b = xx_loadl_32((__m128i *)&dst[4]);
+ const __m128i c = xx_loadl_32((__m128i *)&dst[8]);
+ const __m128i d = xx_loadl_32((__m128i *)&dst[12]);
+ const __m128i e = xx_loadl_32((__m128i *)&dst[16]);
+
+ const __m128i a0 = _mm_unpacklo_epi32(a, b);
+ const __m128i b0 = _mm_unpacklo_epi32(b, c);
+ const __m128i a1 = _mm_unpacklo_epi32(c, d);
+ const __m128i b1 = _mm_unpacklo_epi32(d, e);
+ const __m128i res = filter_block_2rows(&a0, &b0, &a1, &b1, &vfilter_vec);
+ _mm_storeu_si128((__m128i *)dst, res);
+
+ dst += 16;
+ }
+ }
+}
+
+static INLINE void accumulate_block(const __m128i *src, const __m128i *a,
+ const __m128i *b, const __m128i *m,
+ __m128i *sum, __m128i *sum_sq) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m128i m_inv = _mm_sub_epi8(mask_max, *m);
+
+ // Calculate 16 predicted pixels.
+ // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
+ // is 64 * 255, so we have plenty of space to add rounding constants.
+ const __m128i data_l = _mm_unpacklo_epi8(*a, *b);
+ const __m128i mask_l = _mm_unpacklo_epi8(*m, m_inv);
+ __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
+ pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i data_r = _mm_unpackhi_epi8(*a, *b);
+ const __m128i mask_r = _mm_unpackhi_epi8(*m, m_inv);
+ __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
+ pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i src_l = _mm_unpacklo_epi8(*src, zero);
+ const __m128i src_r = _mm_unpackhi_epi8(*src, zero);
+ const __m128i diff_l = _mm_sub_epi16(pred_l, src_l);
+ const __m128i diff_r = _mm_sub_epi16(pred_r, src_r);
+
+ // Update partial sums and partial sums of squares
+ *sum =
+ _mm_add_epi32(*sum, _mm_madd_epi16(_mm_add_epi16(diff_l, diff_r), one));
+ *sum_sq =
+ _mm_add_epi32(*sum_sq, _mm_add_epi32(_mm_madd_epi16(diff_l, diff_l),
+ _mm_madd_epi16(diff_r, diff_r)));
+}
+
+static void masked_variance(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, int a_stride,
+ const uint8_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride, int width,
+ int height, unsigned int *sse, int *sum_) {
+ int x, y;
+ __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x += 16) {
+ const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
+ const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
+ const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
+ const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
+ accumulate_block(&src, &a, &b, &m, &sum, &sum_sq);
+ }
+
+ src_ptr += src_stride;
+ a_ptr += a_stride;
+ b_ptr += b_stride;
+ m_ptr += m_stride;
+ }
+ // Reduce down to a single sum and sum of squares
+ sum = _mm_hadd_epi32(sum, sum_sq);
+ sum = _mm_hadd_epi32(sum, sum);
+ *sum_ = _mm_cvtsi128_si32(sum);
+ *sse = (unsigned int)_mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
+}
+
+static void masked_variance8xh(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, const uint8_t *b_ptr,
+ const uint8_t *m_ptr, int m_stride, int height,
+ unsigned int *sse, int *sum_) {
+ int y;
+ __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
+
+ for (y = 0; y < height; y += 2) {
+ __m128i src = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)src_ptr),
+ _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
+ const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
+ const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
+ const __m128i m =
+ _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr),
+ _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride]));
+ accumulate_block(&src, &a, &b, &m, &sum, &sum_sq);
+
+ src_ptr += src_stride * 2;
+ a_ptr += 16;
+ b_ptr += 16;
+ m_ptr += m_stride * 2;
+ }
+ // Reduce down to a single sum and sum of squares
+ sum = _mm_hadd_epi32(sum, sum_sq);
+ sum = _mm_hadd_epi32(sum, sum);
+ *sum_ = _mm_cvtsi128_si32(sum);
+ *sse = (unsigned int)_mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
+}
+
+static void masked_variance4xh(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *a_ptr, const uint8_t *b_ptr,
+ const uint8_t *m_ptr, int m_stride, int height,
+ unsigned int *sse, int *sum_) {
+ int y;
+ __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
+
+ for (y = 0; y < height; y += 4) {
+ // Load four rows at a time
+ __m128i src = _mm_setr_epi32(*(int *)src_ptr, *(int *)&src_ptr[src_stride],
+ *(int *)&src_ptr[src_stride * 2],
+ *(int *)&src_ptr[src_stride * 3]);
+ const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
+ const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
+ const __m128i m = _mm_setr_epi32(*(int *)m_ptr, *(int *)&m_ptr[m_stride],
+ *(int *)&m_ptr[m_stride * 2],
+ *(int *)&m_ptr[m_stride * 3]);
+ accumulate_block(&src, &a, &b, &m, &sum, &sum_sq);
+
+ src_ptr += src_stride * 4;
+ a_ptr += 16;
+ b_ptr += 16;
+ m_ptr += m_stride * 4;
+ }
+ // Reduce down to a single sum and sum of squares
+ sum = _mm_hadd_epi32(sum, sum_sq);
+ sum = _mm_hadd_epi32(sum, sum);
+ *sum_ = _mm_cvtsi128_si32(sum);
+ *sse = (unsigned int)_mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+// For width a multiple of 8
+static void highbd_bilinear_filter(const uint16_t *src, int src_stride,
+ int xoffset, int yoffset, uint16_t *dst,
+ int w, int h);
+
+static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride,
+ int xoffset, int yoffset, uint16_t *dst,
+ int h);
+
+// For width a multiple of 8
+static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride,
+ const uint16_t *a_ptr, int a_stride,
+ const uint16_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int width, int height, uint64_t *sse,
+ int *sum_);
+
+static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride,
+ const uint16_t *a_ptr,
+ const uint16_t *b_ptr,
+ const uint8_t *m_ptr, int m_stride,
+ int height, int *sse, int *sum_);
+
+#define HIGHBD_MASK_SUBPIX_VAR_SSSE3(W, H) \
+ unsigned int aom_highbd_8_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \
+ const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
+ uint64_t sse64; \
+ int sum; \
+ uint16_t temp[(H + 1) * W]; \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \
+ \
+ highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \
+ \
+ if (!invert_mask) \
+ highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \
+ msk_stride, W, H, &sse64, &sum); \
+ else \
+ highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \
+ msk_stride, W, H, &sse64, &sum); \
+ *sse = (uint32_t)sse64; \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \
+ } \
+ unsigned int aom_highbd_10_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \
+ const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
+ uint64_t sse64; \
+ int sum; \
+ int64_t var; \
+ uint16_t temp[(H + 1) * W]; \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \
+ \
+ highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \
+ \
+ if (!invert_mask) \
+ highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \
+ msk_stride, W, H, &sse64, &sum); \
+ else \
+ highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \
+ msk_stride, W, H, &sse64, &sum); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 4); \
+ sum = ROUND_POWER_OF_TWO(sum, 2); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ unsigned int aom_highbd_12_masked_sub_pixel_variance##W##x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \
+ const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
+ uint64_t sse64; \
+ int sum; \
+ int64_t var; \
+ uint16_t temp[(H + 1) * W]; \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \
+ \
+ highbd_bilinear_filter(src, src_stride, xoffset, yoffset, temp, W, H); \
+ \
+ if (!invert_mask) \
+ highbd_masked_variance(ref, ref_stride, temp, W, second_pred, W, msk, \
+ msk_stride, W, H, &sse64, &sum); \
+ else \
+ highbd_masked_variance(ref, ref_stride, second_pred, W, temp, W, msk, \
+ msk_stride, W, H, &sse64, &sum); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 8); \
+ sum = ROUND_POWER_OF_TWO(sum, 4); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+#define HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(H) \
+ unsigned int aom_highbd_8_masked_sub_pixel_variance4x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \
+ const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
+ int sse_; \
+ int sum; \
+ uint16_t temp[(H + 1) * 4]; \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \
+ \
+ highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \
+ \
+ if (!invert_mask) \
+ highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \
+ msk_stride, H, &sse_, &sum); \
+ else \
+ highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \
+ msk_stride, H, &sse_, &sum); \
+ *sse = (uint32_t)sse_; \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (4 * H)); \
+ } \
+ unsigned int aom_highbd_10_masked_sub_pixel_variance4x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \
+ const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
+ int sse_; \
+ int sum; \
+ int64_t var; \
+ uint16_t temp[(H + 1) * 4]; \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \
+ \
+ highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \
+ \
+ if (!invert_mask) \
+ highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \
+ msk_stride, H, &sse_, &sum); \
+ else \
+ highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \
+ msk_stride, H, &sse_, &sum); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 4); \
+ sum = ROUND_POWER_OF_TWO(sum, 2); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (4 * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ unsigned int aom_highbd_12_masked_sub_pixel_variance4x##H##_ssse3( \
+ const uint8_t *src8, int src_stride, int xoffset, int yoffset, \
+ const uint8_t *ref8, int ref_stride, const uint8_t *second_pred8, \
+ const uint8_t *msk, int msk_stride, int invert_mask, uint32_t *sse) { \
+ int sse_; \
+ int sum; \
+ int64_t var; \
+ uint16_t temp[(H + 1) * 4]; \
+ const uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+ const uint16_t *second_pred = CONVERT_TO_SHORTPTR(second_pred8); \
+ \
+ highbd_bilinear_filter4xh(src, src_stride, xoffset, yoffset, temp, H); \
+ \
+ if (!invert_mask) \
+ highbd_masked_variance4xh(ref, ref_stride, temp, second_pred, msk, \
+ msk_stride, H, &sse_, &sum); \
+ else \
+ highbd_masked_variance4xh(ref, ref_stride, second_pred, temp, msk, \
+ msk_stride, H, &sse_, &sum); \
+ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_, 8); \
+ sum = ROUND_POWER_OF_TWO(sum, 4); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (4 * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
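+// The 10-bit and 12-bit wrappers above rescale their statistics to an
+// 8-bit-equivalent range before applying the variance formula: squared
+// errors grow by 2^4 (10-bit) or 2^8 (12-bit) and plain sums by 2^2 or 2^4,
+// hence ROUND_POWER_OF_TWO(sse, 4) with ROUND_POWER_OF_TWO(sum, 2) for
+// 10-bit, and the (8, 4) pair for 12-bit.
+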
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 128)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(128, 64)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 128)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 64)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 32)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 64)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 32)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 16)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 32)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 16)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 8)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 16)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 8)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 4)
+HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(8)
+HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(4)
+HIGHBD_MASK_SUBPIX_VAR4XH_SSSE3(16)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 4)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(8, 32)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(32, 8)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(16, 64)
+HIGHBD_MASK_SUBPIX_VAR_SSSE3(64, 16)
+
+static INLINE __m128i highbd_filter_block(const __m128i a, const __m128i b,
+ const __m128i filter) {
+ __m128i v0 = _mm_unpacklo_epi16(a, b);
+ v0 = _mm_madd_epi16(v0, filter);
+ v0 = xx_roundn_epu32(v0, FILTER_BITS);
+
+ __m128i v1 = _mm_unpackhi_epi16(a, b);
+ v1 = _mm_madd_epi16(v1, filter);
+ v1 = xx_roundn_epu32(v1, FILTER_BITS);
+
+ return _mm_packs_epi32(v0, v1);
+}
+
+static void highbd_bilinear_filter(const uint16_t *src, int src_stride,
+ int xoffset, int yoffset, uint16_t *dst,
+ int w, int h) {
+ int i, j;
+ // Horizontal filter
+ if (xoffset == 0) {
+ uint16_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ for (j = 0; j < w; j += 8) {
+ __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
+ _mm_storeu_si128((__m128i *)&b[j], x);
+ }
+ src += src_stride;
+ b += w;
+ }
+ } else if (xoffset == 4) {
+ uint16_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ for (j = 0; j < w; j += 8) {
+ __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
+ __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]);
+ __m128i z = _mm_alignr_epi8(y, x, 2);
+ _mm_storeu_si128((__m128i *)&b[j], _mm_avg_epu16(x, z));
+ }
+ src += src_stride;
+ b += w;
+ }
+ } else {
+ uint16_t *b = dst;
+ const uint8_t *hfilter = bilinear_filters_2t[xoffset];
+ const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16));
+ for (i = 0; i < h + 1; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i x = _mm_loadu_si128((__m128i *)&src[j]);
+ const __m128i y = _mm_loadu_si128((__m128i *)&src[j + 8]);
+ const __m128i z = _mm_alignr_epi8(y, x, 2);
+ const __m128i res = highbd_filter_block(x, z, hfilter_vec);
+ _mm_storeu_si128((__m128i *)&b[j], res);
+ }
+
+ src += src_stride;
+ b += w;
+ }
+ }
+
+ // Vertical filter
+ if (yoffset == 0) {
+ // The data is already in 'dst', so no need to filter
+ } else if (yoffset == 4) {
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
+ __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
+ _mm_storeu_si128((__m128i *)&dst[j], _mm_avg_epu16(x, y));
+ }
+ dst += w;
+ }
+ } else {
+ const uint8_t *vfilter = bilinear_filters_2t[yoffset];
+ const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16));
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i x = _mm_loadu_si128((__m128i *)&dst[j]);
+ const __m128i y = _mm_loadu_si128((__m128i *)&dst[j + w]);
+ const __m128i res = highbd_filter_block(x, y, vfilter_vec);
+ _mm_storeu_si128((__m128i *)&dst[j], res);
+ }
+
+ dst += w;
+ }
+ }
+}
+
+static INLINE __m128i highbd_filter_block_2rows(const __m128i *a0,
+ const __m128i *b0,
+ const __m128i *a1,
+ const __m128i *b1,
+ const __m128i *filter) {
+ __m128i v0 = _mm_unpacklo_epi16(*a0, *b0);
+ v0 = _mm_madd_epi16(v0, *filter);
+ v0 = xx_roundn_epu32(v0, FILTER_BITS);
+
+ __m128i v1 = _mm_unpacklo_epi16(*a1, *b1);
+ v1 = _mm_madd_epi16(v1, *filter);
+ v1 = xx_roundn_epu32(v1, FILTER_BITS);
+
+ return _mm_packs_epi32(v0, v1);
+}
+
+static void highbd_bilinear_filter4xh(const uint16_t *src, int src_stride,
+ int xoffset, int yoffset, uint16_t *dst,
+ int h) {
+ int i;
+ // Horizontal filter
+ if (xoffset == 0) {
+ uint16_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ __m128i x = _mm_loadl_epi64((__m128i *)src);
+ _mm_storel_epi64((__m128i *)b, x);
+ src += src_stride;
+ b += 4;
+ }
+ } else if (xoffset == 4) {
+ uint16_t *b = dst;
+ for (i = 0; i < h + 1; ++i) {
+ __m128i x = _mm_loadu_si128((__m128i *)src);
+ __m128i z = _mm_srli_si128(x, 2);
+ _mm_storel_epi64((__m128i *)b, _mm_avg_epu16(x, z));
+ src += src_stride;
+ b += 4;
+ }
+ } else {
+ uint16_t *b = dst;
+ const uint8_t *hfilter = bilinear_filters_2t[xoffset];
+ const __m128i hfilter_vec = _mm_set1_epi32(hfilter[0] | (hfilter[1] << 16));
+ for (i = 0; i < h; i += 2) {
+ const __m128i x0 = _mm_loadu_si128((__m128i *)src);
+ const __m128i z0 = _mm_srli_si128(x0, 2);
+ const __m128i x1 = _mm_loadu_si128((__m128i *)&src[src_stride]);
+ const __m128i z1 = _mm_srli_si128(x1, 2);
+ const __m128i res =
+ highbd_filter_block_2rows(&x0, &z0, &x1, &z1, &hfilter_vec);
+ _mm_storeu_si128((__m128i *)b, res);
+
+ src += src_stride * 2;
+ b += 8;
+ }
+ // Process i = h separately
+ __m128i x = _mm_loadu_si128((__m128i *)src);
+ __m128i z = _mm_srli_si128(x, 2);
+
+ __m128i v0 = _mm_unpacklo_epi16(x, z);
+ v0 = _mm_madd_epi16(v0, hfilter_vec);
+ v0 = xx_roundn_epu32(v0, FILTER_BITS);
+
+ _mm_storel_epi64((__m128i *)b, _mm_packs_epi32(v0, v0));
+ }
+
+ // Vertical filter
+ if (yoffset == 0) {
+ // The data is already in 'dst', so no need to filter
+ } else if (yoffset == 4) {
+ for (i = 0; i < h; ++i) {
+ __m128i x = _mm_loadl_epi64((__m128i *)dst);
+ __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]);
+ _mm_storel_epi64((__m128i *)dst, _mm_avg_epu16(x, y));
+ dst += 4;
+ }
+ } else {
+ const uint8_t *vfilter = bilinear_filters_2t[yoffset];
+ const __m128i vfilter_vec = _mm_set1_epi32(vfilter[0] | (vfilter[1] << 16));
+ for (i = 0; i < h; i += 2) {
+ const __m128i x = _mm_loadl_epi64((__m128i *)dst);
+ const __m128i y = _mm_loadl_epi64((__m128i *)&dst[4]);
+ const __m128i z = _mm_loadl_epi64((__m128i *)&dst[8]);
+ const __m128i res =
+ highbd_filter_block_2rows(&x, &y, &y, &z, &vfilter_vec);
+ _mm_storeu_si128((__m128i *)dst, res);
+
+ dst += 8;
+ }
+ }
+}
+
+static void highbd_masked_variance(const uint16_t *src_ptr, int src_stride,
+ const uint16_t *a_ptr, int a_stride,
+ const uint16_t *b_ptr, int b_stride,
+ const uint8_t *m_ptr, int m_stride,
+ int width, int height, uint64_t *sse,
+ int *sum_) {
+ int x, y;
+ // Note on bit widths:
+ // The maximum value of 'sum' is (2^12 - 1) * 128 * 128 =~ 2^26,
+ // so this can be kept as four 32-bit values.
+ // But the maximum value of 'sum_sq' is (2^12 - 1)^2 * 128 * 128 =~ 2^38,
+ // so this must be stored as two 64-bit values.
+ __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
+ const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m128i round_const =
+ _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m128i zero = _mm_setzero_si128();
+
+ for (y = 0; y < height; y++) {
+ for (x = 0; x < width; x += 8) {
+ const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
+ const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
+ const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
+ const __m128i m =
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&m_ptr[x]), zero);
+ const __m128i m_inv = _mm_sub_epi16(mask_max, m);
+
+ // Calculate 8 predicted pixels.
+ const __m128i data_l = _mm_unpacklo_epi16(a, b);
+ const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
+ __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
+ pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i data_r = _mm_unpackhi_epi16(a, b);
+ const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
+ __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
+ pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i src_l = _mm_unpacklo_epi16(src, zero);
+ const __m128i src_r = _mm_unpackhi_epi16(src, zero);
+ __m128i diff_l = _mm_sub_epi32(pred_l, src_l);
+ __m128i diff_r = _mm_sub_epi32(pred_r, src_r);
+
+ // Update partial sums and partial sums of squares
+ sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r));
+ // A trick: Now each entry of diff_l and diff_r is stored in a 32-bit
+ // field, but the range of values is only [-(2^12 - 1), 2^12 - 1].
+ // So we can re-pack into 16-bit fields and use _mm_madd_epi16
+ // to calculate the squares and partially sum them.
+ const __m128i tmp = _mm_packs_epi32(diff_l, diff_r);
+ const __m128i prod = _mm_madd_epi16(tmp, tmp);
+ // Then we want to sign-extend to 64 bits and accumulate
+ const __m128i sign = _mm_srai_epi32(prod, 31);
+ const __m128i tmp_0 = _mm_unpacklo_epi32(prod, sign);
+ const __m128i tmp_1 = _mm_unpackhi_epi32(prod, sign);
+ sum_sq = _mm_add_epi64(sum_sq, _mm_add_epi64(tmp_0, tmp_1));
+ }
+
+ src_ptr += src_stride;
+ a_ptr += a_stride;
+ b_ptr += b_stride;
+ m_ptr += m_stride;
+ }
+ // Reduce down to a single sum and sum of squares
+ sum = _mm_hadd_epi32(sum, zero);
+ sum = _mm_hadd_epi32(sum, zero);
+ *sum_ = _mm_cvtsi128_si32(sum);
+ sum_sq = _mm_add_epi64(sum_sq, _mm_srli_si128(sum_sq, 8));
+ _mm_storel_epi64((__m128i *)sse, sum_sq);
+}
+
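+// Working through the bit widths used above: |diff| <= 2^12 - 1, so each
+// square is below 2^24 and each pairwise _mm_madd_epi16() result is below
+// 2^25, well inside an int32. Only the running total (roughly 2^38 for a
+// 128x128 block, as noted at the top of the function) forces 'sum_sq' to
+// be widened to two 64-bit lanes.
+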
+static void highbd_masked_variance4xh(const uint16_t *src_ptr, int src_stride,
+ const uint16_t *a_ptr,
+ const uint16_t *b_ptr,
+ const uint8_t *m_ptr, int m_stride,
+ int height, int *sse, int *sum_) {
+ int y;
+  // Note: For this function, h <= 16 (4x16 is the tallest 4-wide block).
+ // So the maximum value of sum is (2^12 - 1) * 4 * 16 =~ 2^18
+ // and the maximum value of sum_sq is (2^12 - 1)^2 * 4 * 16 =~ 2^30.
+ // So we can safely pack sum_sq into 32-bit fields, which is slightly more
+ // convenient.
+ __m128i sum = _mm_setzero_si128(), sum_sq = _mm_setzero_si128();
+ const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m128i round_const =
+ _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m128i zero = _mm_setzero_si128();
+
+ for (y = 0; y < height; y += 2) {
+ __m128i src = _mm_unpacklo_epi64(
+ _mm_loadl_epi64((const __m128i *)src_ptr),
+ _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
+ const __m128i a = _mm_loadu_si128((const __m128i *)a_ptr);
+ const __m128i b = _mm_loadu_si128((const __m128i *)b_ptr);
+ const __m128i m = _mm_unpacklo_epi8(
+ _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)m_ptr),
+ _mm_cvtsi32_si128(*(const int *)&m_ptr[m_stride])),
+ zero);
+ const __m128i m_inv = _mm_sub_epi16(mask_max, m);
+
+ const __m128i data_l = _mm_unpacklo_epi16(a, b);
+ const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
+ __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
+ pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i data_r = _mm_unpackhi_epi16(a, b);
+ const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
+ __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
+ pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i src_l = _mm_unpacklo_epi16(src, zero);
+ const __m128i src_r = _mm_unpackhi_epi16(src, zero);
+ __m128i diff_l = _mm_sub_epi32(pred_l, src_l);
+ __m128i diff_r = _mm_sub_epi32(pred_r, src_r);
+
+ // Update partial sums and partial sums of squares
+ sum = _mm_add_epi32(sum, _mm_add_epi32(diff_l, diff_r));
+ const __m128i tmp = _mm_packs_epi32(diff_l, diff_r);
+ const __m128i prod = _mm_madd_epi16(tmp, tmp);
+ sum_sq = _mm_add_epi32(sum_sq, prod);
+
+ src_ptr += src_stride * 2;
+ a_ptr += 8;
+ b_ptr += 8;
+ m_ptr += m_stride * 2;
+ }
+ // Reduce down to a single sum and sum of squares
+ sum = _mm_hadd_epi32(sum, sum_sq);
+ sum = _mm_hadd_epi32(sum, zero);
+ *sum_ = _mm_cvtsi128_si32(sum);
+ *sse = (unsigned int)_mm_cvtsi128_si32(_mm_srli_si128(sum, 4));
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
+
+void aom_comp_mask_pred_ssse3(uint8_t *comp_pred, const uint8_t *pred,
+ int width, int height, const uint8_t *ref,
+ int ref_stride, const uint8_t *mask,
+ int mask_stride, int invert_mask) {
+ const uint8_t *src0 = invert_mask ? pred : ref;
+ const uint8_t *src1 = invert_mask ? ref : pred;
+ const int stride0 = invert_mask ? width : ref_stride;
+ const int stride1 = invert_mask ? ref_stride : width;
+ assert(height % 2 == 0);
+ int i = 0;
+ if (width == 8) {
+ comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1,
+ mask, mask_stride);
+ } else if (width == 16) {
+ do {
+ comp_mask_pred_16_ssse3(src0, src1, mask, comp_pred);
+ comp_mask_pred_16_ssse3(src0 + stride0, src1 + stride1,
+ mask + mask_stride, comp_pred + width);
+ comp_pred += (width << 1);
+ src0 += (stride0 << 1);
+ src1 += (stride1 << 1);
+ mask += (mask_stride << 1);
+ i += 2;
+ } while (i < height);
+ } else {
+ do {
+ for (int x = 0; x < width; x += 32) {
+ comp_mask_pred_16_ssse3(src0 + x, src1 + x, mask + x, comp_pred);
+ comp_mask_pred_16_ssse3(src0 + x + 16, src1 + x + 16, mask + x + 16,
+ comp_pred + 16);
+ comp_pred += 32;
+ }
+ src0 += (stride0);
+ src1 += (stride1);
+ mask += (mask_stride);
+ i += 1;
+ } while (i < height);
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h
new file mode 100644
index 0000000000..4faa098ace
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/masked_variance_intrin_ssse3.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
+#define AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
+
+#include <stdlib.h>
+#include <string.h>
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+
+static INLINE void comp_mask_pred_16_ssse3(const uint8_t *src0,
+ const uint8_t *src1,
+ const uint8_t *mask, uint8_t *dst) {
+ const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i round_offset =
+ _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+
+ const __m128i sA0 = _mm_lddqu_si128((const __m128i *)(src0));
+ const __m128i sA1 = _mm_lddqu_si128((const __m128i *)(src1));
+ const __m128i aA = _mm_load_si128((const __m128i *)(mask));
+
+ const __m128i maA = _mm_sub_epi8(alpha_max, aA);
+
+ const __m128i ssAL = _mm_unpacklo_epi8(sA0, sA1);
+ const __m128i aaAL = _mm_unpacklo_epi8(aA, maA);
+ const __m128i ssAH = _mm_unpackhi_epi8(sA0, sA1);
+ const __m128i aaAH = _mm_unpackhi_epi8(aA, maA);
+
+ const __m128i blendAL = _mm_maddubs_epi16(ssAL, aaAL);
+ const __m128i blendAH = _mm_maddubs_epi16(ssAH, aaAH);
+
+ const __m128i roundAL = _mm_mulhrs_epi16(blendAL, round_offset);
+ const __m128i roundAH = _mm_mulhrs_epi16(blendAH, round_offset);
+ _mm_store_si128((__m128i *)dst, _mm_packus_epi16(roundAL, roundAH));
+}
+
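+// The _mm_mulhrs_epi16() step above doubles as the blend rounding: mulhrs
+// computes (x * y + (1 << 14)) >> 15, so multiplying by
+// 1 << (15 - AOM_BLEND_A64_ROUND_BITS) reduces to
+//   (x + (1 << (AOM_BLEND_A64_ROUND_BITS - 1))) >> AOM_BLEND_A64_ROUND_BITS,
+// i.e. ROUND_POWER_OF_TWO(x, AOM_BLEND_A64_ROUND_BITS), the same rounding
+// used by the scalar AOM_BLEND_A64() macro.
+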
+static INLINE void comp_mask_pred_8_ssse3(uint8_t *comp_pred, int height,
+ const uint8_t *src0, int stride0,
+ const uint8_t *src1, int stride1,
+ const uint8_t *mask,
+ int mask_stride) {
+ int i = 0;
+ const __m128i alpha_max = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const __m128i round_offset =
+ _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS));
+ do {
+ // odd line A
+ const __m128i sA0 = _mm_loadl_epi64((const __m128i *)(src0));
+ const __m128i sA1 = _mm_loadl_epi64((const __m128i *)(src1));
+ const __m128i aA = _mm_loadl_epi64((const __m128i *)(mask));
+ // even line B
+ const __m128i sB0 = _mm_loadl_epi64((const __m128i *)(src0 + stride0));
+ const __m128i sB1 = _mm_loadl_epi64((const __m128i *)(src1 + stride1));
+ const __m128i a = _mm_castps_si128(_mm_loadh_pi(
+ _mm_castsi128_ps(aA), (const __m64 *)(mask + mask_stride)));
+
+ const __m128i ssA = _mm_unpacklo_epi8(sA0, sA1);
+ const __m128i ssB = _mm_unpacklo_epi8(sB0, sB1);
+
+ const __m128i ma = _mm_sub_epi8(alpha_max, a);
+ const __m128i aaA = _mm_unpacklo_epi8(a, ma);
+ const __m128i aaB = _mm_unpackhi_epi8(a, ma);
+
+ const __m128i blendA = _mm_maddubs_epi16(ssA, aaA);
+ const __m128i blendB = _mm_maddubs_epi16(ssB, aaB);
+ const __m128i roundA = _mm_mulhrs_epi16(blendA, round_offset);
+ const __m128i roundB = _mm_mulhrs_epi16(blendB, round_offset);
+ const __m128i round = _mm_packus_epi16(roundA, roundB);
+ // comp_pred's stride == width == 8
+ _mm_store_si128((__m128i *)(comp_pred), round);
+ comp_pred += (8 << 1);
+ src0 += (stride0 << 1);
+ src1 += (stride1 << 1);
+ mask += (mask_stride << 1);
+ i += 2;
+ } while (i < height);
+}
+
+#endif // AOM_AOM_DSP_X86_MASKED_VARIANCE_INTRIN_SSSE3_H_
diff --git a/third_party/aom/aom_dsp/x86/mem_sse2.h b/third_party/aom/aom_dsp/x86/mem_sse2.h
new file mode 100644
index 0000000000..085a572cb1
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/mem_sse2.h
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_MEM_SSE2_H_
+#define AOM_AOM_DSP_X86_MEM_SSE2_H_
+
+#include <emmintrin.h> // SSE2
+#include <string.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
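+ // memcpy()-based loads let the compiler emit unaligned reads without
+ // violating alignment or strict-aliasing rules.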
+static INLINE int16_t loadu_int16(const void *src) {
+ int16_t v;
+ memcpy(&v, src, sizeof(v));
+ return v;
+}
+
+static INLINE int32_t loadu_int32(const void *src) {
+ int32_t v;
+ memcpy(&v, src, sizeof(v));
+ return v;
+}
+
+static INLINE int64_t loadu_int64(const void *src) {
+ int64_t v;
+ memcpy(&v, src, sizeof(v));
+ return v;
+}
+
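+ // The two helpers below move 64 bits to/from the high half of an xmm
+ // register, complementing _mm_storel_epi64()/_mm_loadl_epi64().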
+static INLINE void _mm_storeh_epi64(__m128i *const d, const __m128i s) {
+ _mm_storeh_pi((__m64 *)d, _mm_castsi128_ps(s));
+}
+
+static INLINE __m128i loadh_epi64(const void *const src, const __m128i s) {
+ return _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
+}
+
+static INLINE __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src,
+ const int byte_stride) {
+ return _mm_setr_epi32(loadu_int32((int8_t *)src + 0 * byte_stride),
+ loadu_int32((int8_t *)src + 1 * byte_stride),
+ loadu_int32((int8_t *)src + 2 * byte_stride),
+ loadu_int32((int8_t *)src + 3 * byte_stride));
+}
+
+static INLINE __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src,
+ const int byte_stride) {
+ __m128i dst;
+ dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride));
+ dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst);
+ return dst;
+}
+
+static INLINE void store_8bit_8x4_from_16x2(const __m128i *const s,
+ uint8_t *const d,
+ const ptrdiff_t stride) {
+ _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]);
+ _mm_storeh_epi64((__m128i *)(d + 1 * stride), s[0]);
+ _mm_storel_epi64((__m128i *)(d + 2 * stride), s[1]);
+ _mm_storeh_epi64((__m128i *)(d + 3 * stride), s[1]);
+}
+
+static INLINE void store_8bit_4x4(const __m128i *const s, uint8_t *const d,
+ const ptrdiff_t stride) {
+ *(int *)(d + 0 * stride) = _mm_cvtsi128_si32(s[0]);
+ *(int *)(d + 1 * stride) = _mm_cvtsi128_si32(s[1]);
+ *(int *)(d + 2 * stride) = _mm_cvtsi128_si32(s[2]);
+ *(int *)(d + 3 * stride) = _mm_cvtsi128_si32(s[3]);
+}
+
+static INLINE void store_8bit_4x4_sse2(const __m128i s, uint8_t *const d,
+ const ptrdiff_t stride) {
+ __m128i ss[4];
+
+ ss[0] = s;
+ ss[1] = _mm_srli_si128(s, 4);
+ ss[2] = _mm_srli_si128(s, 8);
+ ss[3] = _mm_srli_si128(s, 12);
+ store_8bit_4x4(ss, d, stride);
+}
+
+static INLINE void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride,
+ __m128i *const d) {
+ d[0] = _mm_cvtsi32_si128(*(const int *)(s + 0 * stride));
+ d[1] = _mm_cvtsi32_si128(*(const int *)(s + 1 * stride));
+ d[2] = _mm_cvtsi32_si128(*(const int *)(s + 2 * stride));
+ d[3] = _mm_cvtsi32_si128(*(const int *)(s + 3 * stride));
+}
+
+static INLINE void load_8bit_4x8(const uint8_t *const s, const ptrdiff_t stride,
+ __m128i *const d) {
+ load_8bit_4x4(s + 0 * stride, stride, &d[0]);
+ load_8bit_4x4(s + 4 * stride, stride, &d[4]);
+}
+
+static INLINE void load_8bit_8x4(const uint8_t *const s, const ptrdiff_t stride,
+ __m128i *const d) {
+ d[0] = _mm_loadl_epi64((const __m128i *)(s + 0 * stride));
+ d[1] = _mm_loadl_epi64((const __m128i *)(s + 1 * stride));
+ d[2] = _mm_loadl_epi64((const __m128i *)(s + 2 * stride));
+ d[3] = _mm_loadl_epi64((const __m128i *)(s + 3 * stride));
+}
+
+static INLINE void loadu_8bit_16x4(const uint8_t *const s,
+ const ptrdiff_t stride, __m128i *const d) {
+ d[0] = _mm_loadu_si128((const __m128i *)(s + 0 * stride));
+ d[1] = _mm_loadu_si128((const __m128i *)(s + 1 * stride));
+ d[2] = _mm_loadu_si128((const __m128i *)(s + 2 * stride));
+ d[3] = _mm_loadu_si128((const __m128i *)(s + 3 * stride));
+}
+
+static INLINE void load_8bit_8x8(const uint8_t *const s, const ptrdiff_t stride,
+ __m128i *const d) {
+ load_8bit_8x4(s + 0 * stride, stride, &d[0]);
+ load_8bit_8x4(s + 4 * stride, stride, &d[4]);
+}
+
+static INLINE void load_8bit_16x8(const uint8_t *const s,
+ const ptrdiff_t stride, __m128i *const d) {
+ d[0] = _mm_load_si128((const __m128i *)(s + 0 * stride));
+ d[1] = _mm_load_si128((const __m128i *)(s + 1 * stride));
+ d[2] = _mm_load_si128((const __m128i *)(s + 2 * stride));
+ d[3] = _mm_load_si128((const __m128i *)(s + 3 * stride));
+ d[4] = _mm_load_si128((const __m128i *)(s + 4 * stride));
+ d[5] = _mm_load_si128((const __m128i *)(s + 5 * stride));
+ d[6] = _mm_load_si128((const __m128i *)(s + 6 * stride));
+ d[7] = _mm_load_si128((const __m128i *)(s + 7 * stride));
+}
+
+static INLINE void loadu_8bit_16x8(const uint8_t *const s,
+ const ptrdiff_t stride, __m128i *const d) {
+ loadu_8bit_16x4(s + 0 * stride, stride, &d[0]);
+ loadu_8bit_16x4(s + 4 * stride, stride, &d[4]);
+}
+
+static INLINE void store_8bit_8x8(const __m128i *const s, uint8_t *const d,
+ const ptrdiff_t stride) {
+ _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]);
+ _mm_storel_epi64((__m128i *)(d + 1 * stride), s[1]);
+ _mm_storel_epi64((__m128i *)(d + 2 * stride), s[2]);
+ _mm_storel_epi64((__m128i *)(d + 3 * stride), s[3]);
+ _mm_storel_epi64((__m128i *)(d + 4 * stride), s[4]);
+ _mm_storel_epi64((__m128i *)(d + 5 * stride), s[5]);
+ _mm_storel_epi64((__m128i *)(d + 6 * stride), s[6]);
+ _mm_storel_epi64((__m128i *)(d + 7 * stride), s[7]);
+}
+
+static INLINE void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d,
+ const ptrdiff_t stride) {
+ _mm_storeu_si128((__m128i *)(d + 0 * stride), s[0]);
+ _mm_storeu_si128((__m128i *)(d + 1 * stride), s[1]);
+ _mm_storeu_si128((__m128i *)(d + 2 * stride), s[2]);
+ _mm_storeu_si128((__m128i *)(d + 3 * stride), s[3]);
+}
+
+#endif // AOM_AOM_DSP_X86_MEM_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h
new file mode 100644
index 0000000000..210f466b6f
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_intrinsic_sse4.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
+#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
+
+#include <smmintrin.h>
+
+#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
+
+static INLINE void obmc_variance_w4(const uint8_t *pre, const int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ unsigned int *const sse, int *const sum,
+ const int h) {
+ const int pre_step = pre_stride - 4;
+ int n = 0;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_d = _mm_setzero_si128();
+
+ assert(IS_POWER_OF_TWO(h));
+
+ do {
+ const __m128i v_p_b = _mm_cvtsi32_si128(*(const int *)(pre + n));
+ const __m128i v_m_d = _mm_load_si128((const __m128i *)(mask + n));
+ const __m128i v_w_d = _mm_load_si128((const __m128i *)(wsrc + n));
+
+ const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+
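+ // wsrc and mask are built at a 1 << 12 scale (AOM_BLEND_A64_MAX_ALPHA
+ // squared), so the weighted difference is rounded back down by 12 bits.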
+ const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+ const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
+ const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);
+
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
+ v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+ n += 4;
+
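+ // n is always a multiple of 4 here, so this advances pre to the next row
+ // on every iteration.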
+ if (n % 4 == 0) pre += pre_step;
+ } while (n < 4 * h);
+
+ *sum = xx_hsum_epi32_si32(v_sum_d);
+ *sse = xx_hsum_epi32_si32(v_sse_d);
+}
+
+#endif // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSE4_H_
diff --git a/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
new file mode 100644
index 0000000000..27398ffd62
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_intrinsic_ssse3.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
+#define AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+static INLINE int32_t xx_hsum_epi32_si32(__m128i v_d) {
+ v_d = _mm_hadd_epi32(v_d, v_d);
+ v_d = _mm_hadd_epi32(v_d, v_d);
+ return _mm_cvtsi128_si32(v_d);
+}
+
+static INLINE int64_t xx_hsum_epi64_si64(__m128i v_q) {
+ v_q = _mm_add_epi64(v_q, _mm_srli_si128(v_q, 8));
+#if AOM_ARCH_X86_64
+ return _mm_cvtsi128_si64(v_q);
+#else
+ {
+ int64_t tmp;
+ _mm_storel_epi64((__m128i *)&tmp, v_q);
+ return tmp;
+ }
+#endif
+}
+
+static INLINE int64_t xx_hsum_epi32_si64(__m128i v_d) {
+ const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128());
+ const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d);
+ const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d);
+ return xx_hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q));
+}
+
+// This is equivalent to ROUND_POWER_OF_TWO_SIGNED(v_val_d, bits)
+static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) {
+ const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31);
+ const __m128i v_tmp_d =
+ _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d), v_sign_d);
+ return _mm_srai_epi32(v_tmp_d, bits);
+}
+
+#endif // AOM_AOM_DSP_X86_OBMC_INTRINSIC_SSSE3_H_
diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c b/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c
new file mode 100644
index 0000000000..9d1b7d4968
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_sad_avx2.c
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
+#include "aom_dsp/x86/synonyms.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+static INLINE unsigned int obmc_sad_w4_avx2(const uint8_t *pre,
+ const int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ const int height) {
+ int n = 0;
+ __m256i v_sad_d = _mm256_setzero_si256();
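+ // (1 << 12) >> 1 is the rounding bias: adding it and shifting right by 12
+ // implements ROUND_POWER_OF_TWO() on the 12-bit-scaled absolute difference.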
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+
+ do {
+ const __m128i v_p_b_0 = xx_loadl_32(pre);
+ const __m128i v_p_b_1 = xx_loadl_32(pre + pre_stride);
+ const __m128i v_p_b = _mm_unpacklo_epi32(v_p_b_0, v_p_b_1);
+ const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n));
+ const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
+
+ const __m256i v_p_d = _mm256_cvtepu8_epi32(v_p_b);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d);
+
+ const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d);
+ const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d);
+
+ // Rounded absolute difference
+ const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d);
+ const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12);
+
+ v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d);
+
+ n += 8;
+ pre += pre_stride << 1;
+ } while (n < 8 * (height >> 1));
+
+ __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
+ __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
+ v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
+ return xx_hsum_epi32_si32(v_sad_d_0);
+}
+
+static INLINE unsigned int obmc_sad_w8n_avx2(
+ const uint8_t *pre, const int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, const int width, const int height) {
+ const int pre_step = pre_stride - width;
+ int n = 0;
+ __m256i v_sad_d = _mm256_setzero_si256();
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+ assert(width >= 8);
+ assert(IS_POWER_OF_TWO(width));
+
+ do {
+ const __m128i v_p0_b = xx_loadl_64(pre + n);
+ const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n));
+ const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
+
+ const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p0_b);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
+
+ const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
+ const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d);
+
+ // Rounded absolute difference
+ const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d);
+ const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12);
+
+ v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d);
+
+ n += 8;
+
+ if ((n & (width - 1)) == 0) pre += pre_step;
+ } while (n < width * height);
+
+ __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
+ __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
+ v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
+ return xx_hsum_epi32_si32(v_sad_d_0);
+}
+
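+ // Instantiate aom_obmc_sad<w>x<h>_avx2() for every block size: 4-wide
+ // blocks use the dedicated kernel, all wider blocks the generic one.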
+#define OBMCSADWXH(w, h) \
+ unsigned int aom_obmc_sad##w##x##h##_avx2( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *msk) { \
+ if (w == 4) { \
+ return obmc_sad_w4_avx2(pre, pre_stride, wsrc, msk, h); \
+ } else { \
+ return obmc_sad_w8n_avx2(pre, pre_stride, wsrc, msk, w, h); \
+ } \
+ }
+
+OBMCSADWXH(128, 128)
+OBMCSADWXH(128, 64)
+OBMCSADWXH(64, 128)
+OBMCSADWXH(64, 64)
+OBMCSADWXH(64, 32)
+OBMCSADWXH(32, 64)
+OBMCSADWXH(32, 32)
+OBMCSADWXH(32, 16)
+OBMCSADWXH(16, 32)
+OBMCSADWXH(16, 16)
+OBMCSADWXH(16, 8)
+OBMCSADWXH(8, 16)
+OBMCSADWXH(8, 8)
+OBMCSADWXH(8, 4)
+OBMCSADWXH(4, 8)
+OBMCSADWXH(4, 4)
+OBMCSADWXH(4, 16)
+OBMCSADWXH(16, 4)
+OBMCSADWXH(8, 32)
+OBMCSADWXH(32, 8)
+OBMCSADWXH(16, 64)
+OBMCSADWXH(64, 16)
+
+////////////////////////////////////////////////////////////////////////////////
+// High bit-depth
+////////////////////////////////////////////////////////////////////////////////
+
+static INLINE unsigned int hbd_obmc_sad_w4_avx2(const uint8_t *pre8,
+ const int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ const int height) {
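+ // The high bit-depth API passes 16-bit pixel buffers through uint8_t
+ // pointers; CONVERT_TO_SHORTPTR() recovers the uint16_t view.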
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ int n = 0;
+ __m256i v_sad_d = _mm256_setzero_si256();
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+ do {
+ const __m128i v_p_w_0 = xx_loadl_64(pre);
+ const __m128i v_p_w_1 = xx_loadl_64(pre + pre_stride);
+ const __m128i v_p_w = _mm_unpacklo_epi64(v_p_w_0, v_p_w_1);
+ const __m256i v_m_d = _mm256_lddqu_si256((__m256i *)(mask + n));
+ const __m256i v_w_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
+
+ const __m256i v_p_d = _mm256_cvtepu16_epi32(v_p_w);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m256i v_pm_d = _mm256_madd_epi16(v_p_d, v_m_d);
+
+ const __m256i v_diff_d = _mm256_sub_epi32(v_w_d, v_pm_d);
+ const __m256i v_absdiff_d = _mm256_abs_epi32(v_diff_d);
+
+ // Rounded absolute difference
+ const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff_d, v_bias_d);
+ const __m256i v_rad_d = _mm256_srli_epi32(v_tmp_d, 12);
+
+ v_sad_d = _mm256_add_epi32(v_sad_d, v_rad_d);
+
+ n += 8;
+
+ pre += pre_stride << 1;
+ } while (n < 8 * (height >> 1));
+
+ __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
+ __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
+ v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
+ return xx_hsum_epi32_si32(v_sad_d_0);
+}
+
+static INLINE unsigned int hbd_obmc_sad_w8n_avx2(
+ const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, const int width, const int height) {
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ const int pre_step = pre_stride - width;
+ int n = 0;
+ __m256i v_sad_d = _mm256_setzero_si256();
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+
+ assert(width >= 8);
+ assert(IS_POWER_OF_TWO(width));
+
+ do {
+ const __m128i v_p0_w = _mm_lddqu_si128((__m128i *)(pre + n));
+ const __m256i v_m0_d = _mm256_lddqu_si256((__m256i *)(mask + n));
+ const __m256i v_w0_d = _mm256_lddqu_si256((__m256i *)(wsrc + n));
+
+ const __m256i v_p0_d = _mm256_cvtepu16_epi32(v_p0_w);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
+
+ const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
+ const __m256i v_absdiff0_d = _mm256_abs_epi32(v_diff0_d);
+
+ // Rounded absolute difference
+ const __m256i v_tmp_d = _mm256_add_epi32(v_absdiff0_d, v_bias_d);
+ const __m256i v_rad0_d = _mm256_srli_epi32(v_tmp_d, 12);
+
+ v_sad_d = _mm256_add_epi32(v_sad_d, v_rad0_d);
+
+ n += 8;
+
+ if (n % width == 0) pre += pre_step;
+ } while (n < width * height);
+
+ __m128i v_sad_d_0 = _mm256_castsi256_si128(v_sad_d);
+ __m128i v_sad_d_1 = _mm256_extracti128_si256(v_sad_d, 1);
+ v_sad_d_0 = _mm_add_epi32(v_sad_d_0, v_sad_d_1);
+ return xx_hsum_epi32_si32(v_sad_d_0);
+}
+
+#define HBD_OBMCSADWXH(w, h) \
+ unsigned int aom_highbd_obmc_sad##w##x##h##_avx2( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask) { \
+ if (w == 4) { \
+ return hbd_obmc_sad_w4_avx2(pre, pre_stride, wsrc, mask, h); \
+ } else { \
+ return hbd_obmc_sad_w8n_avx2(pre, pre_stride, wsrc, mask, w, h); \
+ } \
+ }
+
+HBD_OBMCSADWXH(128, 128)
+HBD_OBMCSADWXH(128, 64)
+HBD_OBMCSADWXH(64, 128)
+HBD_OBMCSADWXH(64, 64)
+HBD_OBMCSADWXH(64, 32)
+HBD_OBMCSADWXH(32, 64)
+HBD_OBMCSADWXH(32, 32)
+HBD_OBMCSADWXH(32, 16)
+HBD_OBMCSADWXH(16, 32)
+HBD_OBMCSADWXH(16, 16)
+HBD_OBMCSADWXH(16, 8)
+HBD_OBMCSADWXH(8, 16)
+HBD_OBMCSADWXH(8, 8)
+HBD_OBMCSADWXH(8, 4)
+HBD_OBMCSADWXH(4, 8)
+HBD_OBMCSADWXH(4, 4)
+HBD_OBMCSADWXH(4, 16)
+HBD_OBMCSADWXH(16, 4)
+HBD_OBMCSADWXH(8, 32)
+HBD_OBMCSADWXH(32, 8)
+HBD_OBMCSADWXH(16, 64)
+HBD_OBMCSADWXH(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c
new file mode 100644
index 0000000000..542572c761
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_sad_sse4.c
@@ -0,0 +1,269 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/x86/obmc_intrinsic_ssse3.h"
+#include "aom_dsp/x86/synonyms.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+static AOM_FORCE_INLINE unsigned int obmc_sad_w4(const uint8_t *pre,
+ const int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ const int height) {
+ const int pre_step = pre_stride - 4;
+ int n = 0;
+ __m128i v_sad_d = _mm_setzero_si128();
+
+ do {
+ const __m128i v_p_b = xx_loadl_32(pre + n);
+ const __m128i v_m_d = xx_load_128(mask + n);
+ const __m128i v_w_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+
+ const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+ const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
+
+ // Rounded absolute difference
+ const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);
+
+ v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
+
+ n += 4;
+
+ if (n % 4 == 0) pre += pre_step;
+ } while (n < 4 * height);
+
+ return xx_hsum_epi32_si32(v_sad_d);
+}
+
+static AOM_FORCE_INLINE unsigned int obmc_sad_w8n(
+ const uint8_t *pre, const int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, const int width, const int height) {
+ const int pre_step = pre_stride - width;
+ int n = 0;
+ __m128i v_sad_d = _mm_setzero_si128();
+
+ assert(width >= 8);
+ assert(IS_POWER_OF_TWO(width));
+
+ do {
+ const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
+ const __m128i v_m1_d = xx_load_128(mask + n + 4);
+ const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+ const __m128i v_p0_b = xx_loadl_32(pre + n);
+ const __m128i v_m0_d = xx_load_128(mask + n);
+ const __m128i v_w0_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
+ const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
+ const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
+
+ const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
+ const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
+ const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
+ const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
+
+ // Rounded absolute difference
+ const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
+ const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);
+
+ v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
+ v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);
+
+ n += 8;
+
+ if (n % width == 0) pre += pre_step;
+ } while (n < width * height);
+
+ return xx_hsum_epi32_si32(v_sad_d);
+}
+
+#define OBMCSADWXH(w, h) \
+ unsigned int aom_obmc_sad##w##x##h##_sse4_1( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *msk) { \
+ if (w == 4) { \
+ return obmc_sad_w4(pre, pre_stride, wsrc, msk, h); \
+ } else { \
+ return obmc_sad_w8n(pre, pre_stride, wsrc, msk, w, h); \
+ } \
+ }
+
+OBMCSADWXH(128, 128)
+OBMCSADWXH(128, 64)
+OBMCSADWXH(64, 128)
+OBMCSADWXH(64, 64)
+OBMCSADWXH(64, 32)
+OBMCSADWXH(32, 64)
+OBMCSADWXH(32, 32)
+OBMCSADWXH(32, 16)
+OBMCSADWXH(16, 32)
+OBMCSADWXH(16, 16)
+OBMCSADWXH(16, 8)
+OBMCSADWXH(8, 16)
+OBMCSADWXH(8, 8)
+OBMCSADWXH(8, 4)
+OBMCSADWXH(4, 8)
+OBMCSADWXH(4, 4)
+OBMCSADWXH(4, 16)
+OBMCSADWXH(16, 4)
+OBMCSADWXH(8, 32)
+OBMCSADWXH(32, 8)
+OBMCSADWXH(16, 64)
+OBMCSADWXH(64, 16)
+
+////////////////////////////////////////////////////////////////////////////////
+// High bit-depth
+////////////////////////////////////////////////////////////////////////////////
+
+static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8,
+ const int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask,
+ const int height) {
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ const int pre_step = pre_stride - 4;
+ int n = 0;
+ __m128i v_sad_d = _mm_setzero_si128();
+
+ do {
+ const __m128i v_p_w = xx_loadl_64(pre + n);
+ const __m128i v_m_d = xx_load_128(mask + n);
+ const __m128i v_w_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+
+ const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+ const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
+
+ // Rounded absolute difference
+ const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);
+
+ v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
+
+ n += 4;
+
+ if (n % 4 == 0) pre += pre_step;
+ } while (n < 4 * height);
+
+ return xx_hsum_epi32_si32(v_sad_d);
+}
+
+static AOM_FORCE_INLINE unsigned int hbd_obmc_sad_w8n(
+ const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, const int width, const int height) {
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ const int pre_step = pre_stride - width;
+ int n = 0;
+ __m128i v_sad_d = _mm_setzero_si128();
+
+ assert(width >= 8);
+ assert(IS_POWER_OF_TWO(width));
+
+ do {
+ const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
+ const __m128i v_m1_d = xx_load_128(mask + n + 4);
+ const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+ const __m128i v_p0_w = xx_loadl_64(pre + n);
+ const __m128i v_m0_d = xx_load_128(mask + n);
+ const __m128i v_w0_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
+ const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
+ const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
+
+ const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
+ const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
+ const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
+ const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
+
+ // Rounded absolute difference
+ const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
+ const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);
+
+ v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
+ v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);
+
+ n += 8;
+
+ if (n % width == 0) pre += pre_step;
+ } while (n < width * height);
+
+ return xx_hsum_epi32_si32(v_sad_d);
+}
+
+#define HBD_OBMCSADWXH(w, h) \
+ unsigned int aom_highbd_obmc_sad##w##x##h##_sse4_1( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask) { \
+ if (w == 4) { \
+ return hbd_obmc_sad_w4(pre, pre_stride, wsrc, mask, h); \
+ } else { \
+ return hbd_obmc_sad_w8n(pre, pre_stride, wsrc, mask, w, h); \
+ } \
+ }
+
+HBD_OBMCSADWXH(128, 128)
+HBD_OBMCSADWXH(128, 64)
+HBD_OBMCSADWXH(64, 128)
+HBD_OBMCSADWXH(64, 64)
+HBD_OBMCSADWXH(64, 32)
+HBD_OBMCSADWXH(32, 64)
+HBD_OBMCSADWXH(32, 32)
+HBD_OBMCSADWXH(32, 16)
+HBD_OBMCSADWXH(16, 32)
+HBD_OBMCSADWXH(16, 16)
+HBD_OBMCSADWXH(16, 8)
+HBD_OBMCSADWXH(8, 16)
+HBD_OBMCSADWXH(8, 8)
+HBD_OBMCSADWXH(8, 4)
+HBD_OBMCSADWXH(4, 8)
+HBD_OBMCSADWXH(4, 4)
+HBD_OBMCSADWXH(4, 16)
+HBD_OBMCSADWXH(16, 4)
+HBD_OBMCSADWXH(8, 32)
+HBD_OBMCSADWXH(32, 8)
+HBD_OBMCSADWXH(16, 64)
+HBD_OBMCSADWXH(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c b/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c
new file mode 100644
index 0000000000..c23d8c4eb0
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_variance_avx2.c
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ unsigned int *const sse, int *const sum,
+ const int w, const int h) {
+ int n = 0, width, height = h;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_d = _mm_setzero_si128();
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+ __m128i v_d;
+ const uint8_t *pre_temp;
+ assert(w >= 8);
+ assert(IS_POWER_OF_TWO(w));
+ assert(IS_POWER_OF_TWO(h));
+ do {
+ width = w;
+ pre_temp = pre;
+ do {
+ const __m128i v_p_b = _mm_loadl_epi64((const __m128i *)pre_temp);
+ const __m256i v_m_d = _mm256_loadu_si256((__m256i const *)(mask + n));
+ const __m256i v_w_d = _mm256_loadu_si256((__m256i const *)(wsrc + n));
+ const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m256i v_pm_d = _mm256_madd_epi16(v_p0_d, v_m_d);
+ const __m256i v_diff0_d = _mm256_sub_epi32(v_w_d, v_pm_d);
+
+ const __m256i v_sign_d = _mm256_srai_epi32(v_diff0_d, 31);
+ const __m256i v_tmp_d =
+ _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign_d);
+ const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp_d, 12);
+ const __m128i v_rdiff_d = _mm256_castsi256_si128(v_rdiff0_d);
+ const __m128i v_rdiff1_d = _mm256_extracti128_si256(v_rdiff0_d, 1);
+
+ const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff_d, v_rdiff1_d);
+ const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
+
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
+ v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+ pre_temp += 8;
+ n += 8;
+ width -= 8;
+ } while (width > 0);
+ pre += pre_stride;
+ height -= 1;
+ } while (height > 0);
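+ // Two horizontal adds leave the reduction of v_sum_d in lane 0 and the
+ // reduction of v_sse_d in lane 1.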
+ v_d = _mm_hadd_epi32(v_sum_d, v_sse_d);
+ v_d = _mm_hadd_epi32(v_d, v_d);
+ *sum = _mm_cvtsi128_si32(v_d);
+ *sse = (unsigned int)_mm_cvtsi128_si32(_mm_srli_si128(v_d, 4));
+}
+
+static INLINE void obmc_variance_w16n(const uint8_t *pre, const int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ unsigned int *const sse, int *const sum,
+ const int w, const int h) {
+ int n = 0, width, height = h;
+ __m256i v_d;
+ __m128i res0;
+ const uint8_t *pre_temp;
+ const __m256i v_bias_d = _mm256_set1_epi32((1 << 12) >> 1);
+ __m256i v_sum_d = _mm256_setzero_si256();
+ __m256i v_sse_d = _mm256_setzero_si256();
+
+ assert(w >= 16);
+ assert(IS_POWER_OF_TWO(w));
+ assert(IS_POWER_OF_TWO(h));
+ do {
+ width = w;
+ pre_temp = pre;
+ do {
+ const __m128i v_p_b = _mm_loadu_si128((__m128i *)pre_temp);
+ const __m256i v_m0_d = _mm256_loadu_si256((__m256i const *)(mask + n));
+ const __m256i v_w0_d = _mm256_loadu_si256((__m256i const *)(wsrc + n));
+ const __m256i v_m1_d =
+ _mm256_loadu_si256((__m256i const *)(mask + n + 8));
+ const __m256i v_w1_d =
+ _mm256_loadu_si256((__m256i const *)(wsrc + n + 8));
+
+ const __m256i v_p0_d = _mm256_cvtepu8_epi32(v_p_b);
+ const __m256i v_p1_d = _mm256_cvtepu8_epi32(_mm_srli_si128(v_p_b, 8));
+
+ const __m256i v_pm0_d = _mm256_madd_epi16(v_p0_d, v_m0_d);
+ const __m256i v_pm1_d = _mm256_madd_epi16(v_p1_d, v_m1_d);
+
+ const __m256i v_diff0_d = _mm256_sub_epi32(v_w0_d, v_pm0_d);
+ const __m256i v_diff1_d = _mm256_sub_epi32(v_w1_d, v_pm1_d);
+
+ const __m256i v_sign0_d = _mm256_srai_epi32(v_diff0_d, 31);
+ const __m256i v_sign1_d = _mm256_srai_epi32(v_diff1_d, 31);
+
+ const __m256i v_tmp0_d =
+ _mm256_add_epi32(_mm256_add_epi32(v_diff0_d, v_bias_d), v_sign0_d);
+ const __m256i v_tmp1_d =
+ _mm256_add_epi32(_mm256_add_epi32(v_diff1_d, v_bias_d), v_sign1_d);
+
+ const __m256i v_rdiff0_d = _mm256_srai_epi32(v_tmp0_d, 12);
+ const __m256i v_rdiff2_d = _mm256_srai_epi32(v_tmp1_d, 12);
+
+ const __m256i v_rdiff1_d = _mm256_add_epi32(v_rdiff0_d, v_rdiff2_d);
+ const __m256i v_rdiff01_w = _mm256_packs_epi32(v_rdiff0_d, v_rdiff2_d);
+ const __m256i v_sqrdiff_d = _mm256_madd_epi16(v_rdiff01_w, v_rdiff01_w);
+
+ v_sum_d = _mm256_add_epi32(v_sum_d, v_rdiff1_d);
+ v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff_d);
+
+ pre_temp += 16;
+ n += 16;
+ width -= 16;
+ } while (width > 0);
+ pre += pre_stride;
+ height -= 1;
+ } while (height > 0);
+
+ v_d = _mm256_hadd_epi32(v_sum_d, v_sse_d);
+ v_d = _mm256_hadd_epi32(v_d, v_d);
+ res0 = _mm256_castsi256_si128(v_d);
+ res0 = _mm_add_epi32(res0, _mm256_extractf128_si256(v_d, 1));
+ *sum = _mm_cvtsi128_si32(res0);
+ *sse = (unsigned int)_mm_cvtsi128_si32(_mm_srli_si128(res0, 4));
+}
+
+#define OBMCVARWXH(W, H) \
+ unsigned int aom_obmc_variance##W##x##H##_avx2( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ if (W == 4) { \
+ obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H); \
+ } else if (W == 8) { \
+ obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
+ } else { \
+ obmc_variance_w16n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
+ } \
+ \
+ return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
+ }
+
+OBMCVARWXH(128, 128)
+OBMCVARWXH(128, 64)
+OBMCVARWXH(64, 128)
+OBMCVARWXH(64, 64)
+OBMCVARWXH(64, 32)
+OBMCVARWXH(32, 64)
+OBMCVARWXH(32, 32)
+OBMCVARWXH(32, 16)
+OBMCVARWXH(16, 32)
+OBMCVARWXH(16, 16)
+OBMCVARWXH(16, 8)
+OBMCVARWXH(8, 16)
+OBMCVARWXH(8, 8)
+OBMCVARWXH(8, 4)
+OBMCVARWXH(4, 8)
+OBMCVARWXH(4, 4)
+OBMCVARWXH(4, 16)
+OBMCVARWXH(16, 4)
+OBMCVARWXH(8, 32)
+OBMCVARWXH(32, 8)
+OBMCVARWXH(16, 64)
+OBMCVARWXH(64, 16)
diff --git a/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
new file mode 100644
index 0000000000..89b050eb20
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/obmc_variance_sse4.c
@@ -0,0 +1,382 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+
+#include "aom_dsp/aom_dsp_common.h"
+#include "aom_dsp/aom_filter.h"
+#include "aom_dsp/x86/obmc_intrinsic_sse4.h"
+#include "aom_dsp/x86/synonyms.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
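+ // Bilinear filter passes implemented elsewhere in the library's SSSE3
+ // variance code; reused by the sub-pixel wrappers below.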
+void aom_var_filter_block2d_bil_first_pass_ssse3(
+ const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
+ unsigned int pixel_step, unsigned int output_height,
+ unsigned int output_width, const uint8_t *filter);
+
+void aom_var_filter_block2d_bil_second_pass_ssse3(
+ const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
+ unsigned int pixel_step, unsigned int output_height,
+ unsigned int output_width, const uint8_t *filter);
+
+static INLINE void obmc_variance_w8n(const uint8_t *pre, const int pre_stride,
+ const int32_t *wsrc, const int32_t *mask,
+ unsigned int *const sse, int *const sum,
+ const int w, const int h) {
+ const int pre_step = pre_stride - w;
+ int n = 0;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_d = _mm_setzero_si128();
+
+ assert(w >= 8);
+ assert(IS_POWER_OF_TWO(w));
+ assert(IS_POWER_OF_TWO(h));
+
+ do {
+ const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
+ const __m128i v_m1_d = xx_load_128(mask + n + 4);
+ const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+ const __m128i v_p0_b = xx_loadl_32(pre + n);
+ const __m128i v_m0_d = xx_load_128(mask + n);
+ const __m128i v_w0_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
+ const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
+ const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
+
+ const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
+ const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
+
+ const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
+ const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
+ const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
+ const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
+
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
+ v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+ n += 8;
+
+ if (n % w == 0) pre += pre_step;
+ } while (n < w * h);
+
+ *sum = xx_hsum_epi32_si32(v_sum_d);
+ *sse = xx_hsum_epi32_si32(v_sse_d);
+}
+
+#define OBMCVARWXH(W, H) \
+ unsigned int aom_obmc_variance##W##x##H##_sse4_1( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ if (W == 4) { \
+ obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H); \
+ } else { \
+ obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H); \
+ } \
+ return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
+ }
+
+OBMCVARWXH(128, 128)
+OBMCVARWXH(128, 64)
+OBMCVARWXH(64, 128)
+OBMCVARWXH(64, 64)
+OBMCVARWXH(64, 32)
+OBMCVARWXH(32, 64)
+OBMCVARWXH(32, 32)
+OBMCVARWXH(32, 16)
+OBMCVARWXH(16, 32)
+OBMCVARWXH(16, 16)
+OBMCVARWXH(16, 8)
+OBMCVARWXH(8, 16)
+OBMCVARWXH(8, 8)
+OBMCVARWXH(8, 4)
+OBMCVARWXH(4, 8)
+OBMCVARWXH(4, 4)
+OBMCVARWXH(4, 16)
+OBMCVARWXH(16, 4)
+OBMCVARWXH(8, 32)
+OBMCVARWXH(32, 8)
+OBMCVARWXH(16, 64)
+OBMCVARWXH(64, 16)
+
+#include "config/aom_dsp_rtcd.h"
+
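+ // Sub-pixel variants: bilinearly filter the predictor (H + 1 intermediate
+ // rows for the vertical pass) into temp2, then reuse the integer-pel OBMC
+ // variance above on the filtered block.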
+#define OBMC_SUBPIX_VAR(W, H) \
+ uint32_t aom_obmc_sub_pixel_variance##W##x##H##_sse4_1( \
+ const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \
+ const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \
+ uint16_t fdata3[(H + 1) * W]; \
+ uint8_t temp2[H * W]; \
+ \
+ aom_var_filter_block2d_bil_first_pass_ssse3( \
+ pre, fdata3, pre_stride, 1, H + 1, W, bilinear_filters_2t[xoffset]); \
+ aom_var_filter_block2d_bil_second_pass_ssse3( \
+ fdata3, temp2, W, W, H, W, bilinear_filters_2t[yoffset]); \
+ \
+ return aom_obmc_variance##W##x##H##_sse4_1(temp2, W, wsrc, mask, sse); \
+ }
+
+OBMC_SUBPIX_VAR(128, 128)
+OBMC_SUBPIX_VAR(128, 64)
+OBMC_SUBPIX_VAR(64, 128)
+OBMC_SUBPIX_VAR(64, 64)
+OBMC_SUBPIX_VAR(64, 32)
+OBMC_SUBPIX_VAR(32, 64)
+OBMC_SUBPIX_VAR(32, 32)
+OBMC_SUBPIX_VAR(32, 16)
+OBMC_SUBPIX_VAR(16, 32)
+OBMC_SUBPIX_VAR(16, 16)
+OBMC_SUBPIX_VAR(16, 8)
+OBMC_SUBPIX_VAR(8, 16)
+OBMC_SUBPIX_VAR(8, 8)
+OBMC_SUBPIX_VAR(8, 4)
+OBMC_SUBPIX_VAR(4, 8)
+OBMC_SUBPIX_VAR(4, 4)
+OBMC_SUBPIX_VAR(4, 16)
+OBMC_SUBPIX_VAR(16, 4)
+OBMC_SUBPIX_VAR(8, 32)
+OBMC_SUBPIX_VAR(32, 8)
+OBMC_SUBPIX_VAR(16, 64)
+OBMC_SUBPIX_VAR(64, 16)
+
+////////////////////////////////////////////////////////////////////////////////
+// High bit-depth
+////////////////////////////////////////////////////////////////////////////////
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void hbd_obmc_variance_w4(
+ const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int h) {
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ const int pre_step = pre_stride - 4;
+ int n = 0;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_d = _mm_setzero_si128();
+
+ assert(IS_POWER_OF_TWO(h));
+
+ do {
+ const __m128i v_p_w = xx_loadl_64(pre + n);
+ const __m128i v_m_d = xx_load_128(mask + n);
+ const __m128i v_w_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+
+ const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+ const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
+ const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);
+
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
+ v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+ n += 4;
+
+ if (n % 4 == 0) pre += pre_step;
+ } while (n < 4 * h);
+
+ *sum = xx_hsum_epi32_si32(v_sum_d);
+ *sse = xx_hsum_epi32_si32(v_sse_d);
+}
+
+static INLINE void hbd_obmc_variance_w8n(
+ const uint8_t *pre8, const int pre_stride, const int32_t *wsrc,
+ const int32_t *mask, uint64_t *const sse, int64_t *const sum, const int w,
+ const int h) {
+ const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+ const int pre_step = pre_stride - w;
+ int n = 0;
+ __m128i v_sum_d = _mm_setzero_si128();
+ __m128i v_sse_d = _mm_setzero_si128();
+
+ assert(w >= 8);
+ assert(IS_POWER_OF_TWO(w));
+ assert(IS_POWER_OF_TWO(h));
+
+ do {
+ const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
+ const __m128i v_m1_d = xx_load_128(mask + n + 4);
+ const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+ const __m128i v_p0_w = xx_loadl_64(pre + n);
+ const __m128i v_m0_d = xx_load_128(mask + n);
+ const __m128i v_w0_d = xx_load_128(wsrc + n);
+
+ const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
+ const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);
+
+ // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+ // boundaries. We use pmaddwd, as it has lower latency on Haswell
+ // than pmulld but produces the same result with these inputs.
+ const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
+ const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
+
+ const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
+ const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
+
+ const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
+ const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
+ const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
+ const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
+
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
+ v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
+ v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+ n += 8;
+
+ if (n % w == 0) pre += pre_step;
+ } while (n < w * h);
+
+ *sum += xx_hsum_epi32_si64(v_sum_d);
+ *sse += xx_hsum_epi32_si64(v_sse_d);
+}
+
+static INLINE void highbd_8_obmc_variance(const uint8_t *pre8, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int w, int h,
+ unsigned int *sse, int *sum) {
+ int64_t sum64 = 0;
+ uint64_t sse64 = 0;
+ if (w == 4) {
+ hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
+ } else {
+ hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
+ }
+ *sum = (int)sum64;
+ *sse = (unsigned int)sse64;
+}
+
+static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int w, int h,
+ unsigned int *sse, int *sum) {
+ int64_t sum64 = 0;
+ uint64_t sse64 = 0;
+ if (w == 4) {
+ hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
+ } else if (w < 128 || h < 128) {
+ hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
+ } else {
+ assert(w == 128 && h == 128);
+
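+ // Process 128x128 in 64-row slices so the 32-bit lane accumulators in
+ // hbd_obmc_variance_w8n() cannot overflow at 10-bit depth.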
+ do {
+ hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w,
+ 64);
+ pre8 += 64 * pre_stride;
+ wsrc += 64 * w;
+ mask += 64 * w;
+ h -= 64;
+ } while (h > 0);
+ }
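+ // Normalize 10-bit statistics back to an 8-bit basis: two extra bits per
+ // sample, four per squared term.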
+ *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
+}
+
+static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
+ const int32_t *wsrc,
+ const int32_t *mask, int w, int h,
+ unsigned int *sse, int *sum) {
+ int64_t sum64 = 0;
+ uint64_t sse64 = 0;
+ int max_pel_allowed_per_ovf = 512;
+ if (w == 4) {
+ hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
+ } else if (w * h <= max_pel_allowed_per_ovf) {
+ hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
+ } else {
+ int h_per_ovf = max_pel_allowed_per_ovf / w;
+
+ assert(max_pel_allowed_per_ovf % w == 0);
+ do {
+ hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w,
+ h_per_ovf);
+ pre8 += h_per_ovf * pre_stride;
+ wsrc += h_per_ovf * w;
+ mask += h_per_ovf * w;
+ h -= h_per_ovf;
+ } while (h > 0);
+ }
+ *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
+ *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
+}
+
+#define HBD_OBMCVARWXH(W, H) \
+ unsigned int aom_highbd_8_obmc_variance##W##x##H##_sse4_1( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ highbd_8_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \
+ } \
+ \
+ unsigned int aom_highbd_10_obmc_variance##W##x##H##_sse4_1( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ } \
+ \
+ unsigned int aom_highbd_12_obmc_variance##W##x##H##_sse4_1( \
+ const uint8_t *pre, int pre_stride, const int32_t *wsrc, \
+ const int32_t *mask, unsigned int *sse) { \
+ int sum; \
+ int64_t var; \
+ highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum); \
+ var = (int64_t)(*sse) - (((int64_t)sum * sum) / (W * H)); \
+ return (var >= 0) ? (uint32_t)var : 0; \
+ }
+
+HBD_OBMCVARWXH(128, 128)
+HBD_OBMCVARWXH(128, 64)
+HBD_OBMCVARWXH(64, 128)
+HBD_OBMCVARWXH(64, 64)
+HBD_OBMCVARWXH(64, 32)
+HBD_OBMCVARWXH(32, 64)
+HBD_OBMCVARWXH(32, 32)
+HBD_OBMCVARWXH(32, 16)
+HBD_OBMCVARWXH(16, 32)
+HBD_OBMCVARWXH(16, 16)
+HBD_OBMCVARWXH(16, 8)
+HBD_OBMCVARWXH(8, 16)
+HBD_OBMCVARWXH(8, 8)
+HBD_OBMCVARWXH(8, 4)
+HBD_OBMCVARWXH(4, 8)
+HBD_OBMCVARWXH(4, 4)
+HBD_OBMCVARWXH(4, 16)
+HBD_OBMCVARWXH(16, 4)
+HBD_OBMCVARWXH(8, 32)
+HBD_OBMCVARWXH(32, 8)
+HBD_OBMCVARWXH(16, 64)
+HBD_OBMCVARWXH(64, 16)
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/quantize_avx2.c b/third_party/aom/aom_dsp/x86/quantize_avx2.c
new file mode 100644
index 0000000000..b808d46778
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/quantize_avx2.c
@@ -0,0 +1,274 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+static INLINE void load_b_values_avx2(const int16_t *zbin_ptr, __m256i *zbin,
+ const int16_t *round_ptr, __m256i *round,
+ const int16_t *quant_ptr, __m256i *quant,
+ const int16_t *dequant_ptr,
+ __m256i *dequant,
+ const int16_t *shift_ptr, __m256i *shift,
+ int log_scale) {
+ *zbin = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)zbin_ptr));
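+ // 0x54 replicates the AC half of the table (qword order {0, 1, 1, 1}):
+ // with aom's 8-entry quantizer tables, lane 0 holds the DC value and the
+ // remaining 15 lanes hold the AC value, matching a 16-coefficient group.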
+ *zbin = _mm256_permute4x64_epi64(*zbin, 0x54);
+ if (log_scale > 0) {
+ const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1)));
+ *zbin = _mm256_add_epi16(*zbin, rnd);
+ *zbin = _mm256_srai_epi16(*zbin, log_scale);
+ }
+ // Subtracting 1 here eliminates a _mm256_cmpeq_epi16() instruction when
+ // calculating the zbin mask. (See quantize_b_logscale0_16() and
+ // quantize_b_logscale_16().)
+ *zbin = _mm256_sub_epi16(*zbin, _mm256_set1_epi16(1));
+
+ *round = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)round_ptr));
+ *round = _mm256_permute4x64_epi64(*round, 0x54);
+ if (log_scale > 0) {
+ const __m256i rnd = _mm256_set1_epi16((int16_t)(1 << (log_scale - 1)));
+ *round = _mm256_add_epi16(*round, rnd);
+ *round = _mm256_srai_epi16(*round, log_scale);
+ }
+
+ *quant = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)quant_ptr));
+ *quant = _mm256_permute4x64_epi64(*quant, 0x54);
+ *dequant =
+ _mm256_castsi128_si256(_mm_load_si128((const __m128i *)dequant_ptr));
+ *dequant = _mm256_permute4x64_epi64(*dequant, 0x54);
+ *shift = _mm256_castsi128_si256(_mm_load_si128((const __m128i *)shift_ptr));
+ *shift = _mm256_permute4x64_epi64(*shift, 0x54);
+}
+
+static INLINE __m256i load_coefficients_avx2(const tran_low_t *coeff_ptr) {
+ const __m256i coeff1 = _mm256_load_si256((__m256i *)coeff_ptr);
+ const __m256i coeff2 = _mm256_load_si256((__m256i *)(coeff_ptr + 8));
+ return _mm256_packs_epi32(coeff1, coeff2);
+}
+
+static INLINE void store_coefficients_avx2(__m256i coeff_vals,
+ tran_low_t *coeff_ptr) {
+ __m256i coeff_sign = _mm256_srai_epi16(coeff_vals, 15);
+ __m256i coeff_vals_lo = _mm256_unpacklo_epi16(coeff_vals, coeff_sign);
+ __m256i coeff_vals_hi = _mm256_unpackhi_epi16(coeff_vals, coeff_sign);
+ _mm256_store_si256((__m256i *)coeff_ptr, coeff_vals_lo);
+ _mm256_store_si256((__m256i *)(coeff_ptr + 8), coeff_vals_hi);
+}
+
+static AOM_FORCE_INLINE __m256i quantize_b_logscale0_16(
+ const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, __m256i *v_quant, __m256i *v_dequant,
+ __m256i *v_round, __m256i *v_zbin, __m256i *v_quant_shift) {
+ const __m256i v_coeff = load_coefficients_avx2(coeff_ptr);
+ const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff);
+ const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin);
+
+ if (_mm256_movemask_epi8(v_zbin_mask) == 0) {
+ _mm256_store_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256());
+ _mm256_store_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256());
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256());
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256());
+ return _mm256_setzero_si256();
+ }
+
+ // tmp = v_zbin_mask ? (int64_t)abs_coeff + log_scaled_round : 0
+ const __m256i v_tmp_rnd =
+ _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask);
+ // tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+ // quant_shift_ptr[rc != 0]) >>
+ // (16 - log_scale + AOM_QM_BITS));
+ const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant);
+ const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd);
+ const __m256i v_tmp32 = _mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift);
+ const __m256i v_nz_mask = _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256());
+ const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff);
+ const __m256i v_dqcoeff = _mm256_mullo_epi16(v_qcoeff, *v_dequant);
+ store_coefficients_avx2(v_qcoeff, qcoeff_ptr);
+ store_coefficients_avx2(v_dqcoeff, dqcoeff_ptr);
+ return v_nz_mask;
+}
+
+static INLINE __m256i get_max_lane_eob(const int16_t *iscan, __m256i v_eobmax,
+ __m256i v_mask) {
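+ // v_mask lanes are -1 for nonzero coefficients, so subtracting the mask
+ // adds 1 to iscan (eob is the last scan index plus one) and the AND clears
+ // lanes that quantized to zero. The 0xD8 permute reorders iscan to the
+ // 16-bit lane order produced by the packs in load_coefficients_avx2().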
+ const __m256i v_iscan = _mm256_loadu_si256((const __m256i *)iscan);
+ const __m256i v_iscan_perm = _mm256_permute4x64_epi64(v_iscan, 0xD8);
+ const __m256i v_iscan_plus1 = _mm256_sub_epi16(v_iscan_perm, v_mask);
+ const __m256i v_nz_iscan = _mm256_and_si256(v_iscan_plus1, v_mask);
+ return _mm256_max_epi16(v_eobmax, v_nz_iscan);
+}
+
+static INLINE int16_t accumulate_eob256(__m256i eob256) {
+ const __m128i eob_lo = _mm256_castsi256_si128(eob256);
+ const __m128i eob_hi = _mm256_extractf128_si256(eob256, 1);
+ __m128i eob = _mm_max_epi16(eob_lo, eob_hi);
+ __m128i eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ return _mm_extract_epi16(eob, 1);
+}
+
+void aom_quantize_b_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan,
+ const int16_t *iscan) {
+ (void)scan;
+ __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift;
+ __m256i v_eobmax = _mm256_setzero_si256();
+
+ load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr,
+ &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr,
+ &v_quant_shift, 0);
+
+ // Do DC and first 15 AC.
+ __m256i v_nz_mask =
+ quantize_b_logscale0_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant,
+ &v_dequant, &v_round, &v_zbin, &v_quant_shift);
+
+ v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask);
+
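+ // Only the AC entries are needed from here on; duplicate the high qwords
+ // so every lane of each table register holds the AC value.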
+ v_round = _mm256_unpackhi_epi64(v_round, v_round);
+ v_quant = _mm256_unpackhi_epi64(v_quant, v_quant);
+ v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant);
+ v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift);
+ v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin);
+
+ for (intptr_t count = n_coeffs - 16; count > 0; count -= 16) {
+ coeff_ptr += 16;
+ qcoeff_ptr += 16;
+ dqcoeff_ptr += 16;
+ iscan += 16;
+ v_nz_mask =
+ quantize_b_logscale0_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant,
+ &v_dequant, &v_round, &v_zbin, &v_quant_shift);
+
+ v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask);
+ }
+
+ *eob_ptr = accumulate_eob256(v_eobmax);
+}
+
+static AOM_FORCE_INLINE __m256i quantize_b_logscale_16(
+ const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, __m256i *v_quant, __m256i *v_dequant,
+ __m256i *v_round, __m256i *v_zbin, __m256i *v_quant_shift, int log_scale) {
+ const __m256i v_coeff = load_coefficients_avx2(coeff_ptr);
+ const __m256i v_abs_coeff = _mm256_abs_epi16(v_coeff);
+ const __m256i v_zbin_mask = _mm256_cmpgt_epi16(v_abs_coeff, *v_zbin);
+
+ if (_mm256_movemask_epi8(v_zbin_mask) == 0) {
+ _mm256_store_si256((__m256i *)qcoeff_ptr, _mm256_setzero_si256());
+ _mm256_store_si256((__m256i *)dqcoeff_ptr, _mm256_setzero_si256());
+ _mm256_store_si256((__m256i *)(qcoeff_ptr + 8), _mm256_setzero_si256());
+ _mm256_store_si256((__m256i *)(dqcoeff_ptr + 8), _mm256_setzero_si256());
+ return _mm256_setzero_si256();
+ }
+
+ // tmp = v_zbin_mask ? (int64_t)abs_coeff + log_scaled_round : 0
+ const __m256i v_tmp_rnd =
+ _mm256_and_si256(_mm256_adds_epi16(v_abs_coeff, *v_round), v_zbin_mask);
+ // tmp32 = (int)(((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) *
+ // quant_shift_ptr[rc != 0]) >>
+ // (16 - log_scale + AOM_QM_BITS));
+ const __m256i v_tmp32_a = _mm256_mulhi_epi16(v_tmp_rnd, *v_quant);
+ const __m256i v_tmp32_b = _mm256_add_epi16(v_tmp32_a, v_tmp_rnd);
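+ // Emulate a 32-bit product within 16-bit lanes: combine the high and low
+ // halves of tmp * quant_shift to form (tmp * quant_shift) >> (16 - log_scale).
+ // The dequant path below does the same for (qcoeff * dequant) >> log_scale.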
+ const __m256i v_tmp32_hi = _mm256_slli_epi16(
+ _mm256_mulhi_epi16(v_tmp32_b, *v_quant_shift), log_scale);
+ const __m256i v_tmp32_lo = _mm256_srli_epi16(
+ _mm256_mullo_epi16(v_tmp32_b, *v_quant_shift), 16 - log_scale);
+ const __m256i v_tmp32 = _mm256_or_si256(v_tmp32_hi, v_tmp32_lo);
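+ // dqcoeff = (qcoeff * dequant) >> log_scale, kept in 16 bits and
+ // reassembled from the mulhi/mullo halves of the 32-bit product.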
+ const __m256i v_dqcoeff_hi = _mm256_slli_epi16(
+ _mm256_mulhi_epi16(v_tmp32, *v_dequant), 16 - log_scale);
+ const __m256i v_dqcoeff_lo =
+ _mm256_srli_epi16(_mm256_mullo_epi16(v_tmp32, *v_dequant), log_scale);
+ const __m256i v_dqcoeff =
+ _mm256_sign_epi16(_mm256_or_si256(v_dqcoeff_hi, v_dqcoeff_lo), v_coeff);
+ const __m256i v_qcoeff = _mm256_sign_epi16(v_tmp32, v_coeff);
+ const __m256i v_nz_mask = _mm256_cmpgt_epi16(v_tmp32, _mm256_setzero_si256());
+ store_coefficients_avx2(v_qcoeff, qcoeff_ptr);
+ store_coefficients_avx2(v_dqcoeff, dqcoeff_ptr);
+ return v_nz_mask;
+}
+
+static AOM_FORCE_INLINE void quantize_b_no_qmatrix_avx2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *iscan, int log_scale) {
+ __m256i v_zbin, v_round, v_quant, v_dequant, v_quant_shift;
+ __m256i v_eobmax = _mm256_setzero_si256();
+
+ load_b_values_avx2(zbin_ptr, &v_zbin, round_ptr, &v_round, quant_ptr,
+ &v_quant, dequant_ptr, &v_dequant, quant_shift_ptr,
+ &v_quant_shift, log_scale);
+
+ // Do DC and first 15 AC.
+ __m256i v_nz_mask = quantize_b_logscale_16(
+ coeff_ptr, qcoeff_ptr, dqcoeff_ptr, &v_quant, &v_dequant, &v_round,
+ &v_zbin, &v_quant_shift, log_scale);
+
+ v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask);
+
+ v_round = _mm256_unpackhi_epi64(v_round, v_round);
+ v_quant = _mm256_unpackhi_epi64(v_quant, v_quant);
+ v_dequant = _mm256_unpackhi_epi64(v_dequant, v_dequant);
+ v_quant_shift = _mm256_unpackhi_epi64(v_quant_shift, v_quant_shift);
+ v_zbin = _mm256_unpackhi_epi64(v_zbin, v_zbin);
+
+ for (intptr_t count = n_coeffs - 16; count > 0; count -= 16) {
+ coeff_ptr += 16;
+ qcoeff_ptr += 16;
+ dqcoeff_ptr += 16;
+ iscan += 16;
+ v_nz_mask = quantize_b_logscale_16(coeff_ptr, qcoeff_ptr, dqcoeff_ptr,
+ &v_quant, &v_dequant, &v_round, &v_zbin,
+ &v_quant_shift, log_scale);
+
+ v_eobmax = get_max_lane_eob(iscan, v_eobmax, v_nz_mask);
+ }
+
+ *eob_ptr = accumulate_eob256(v_eobmax);
+}
+
+void aom_quantize_b_32x32_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ (void)scan;
+ quantize_b_no_qmatrix_avx2(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, iscan, 1);
+}
+
+void aom_quantize_b_64x64_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ (void)scan;
+ quantize_b_no_qmatrix_avx2(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
+ quant_ptr, quant_shift_ptr, qcoeff_ptr,
+ dqcoeff_ptr, dequant_ptr, eob_ptr, iscan, 2);
+}
diff --git a/third_party/aom/aom_dsp/x86/quantize_sse2.c b/third_party/aom/aom_dsp/x86/quantize_sse2.c
new file mode 100644
index 0000000000..ebef1fbac2
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/quantize_sse2.c
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr, const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan_ptr,
+ const int16_t *iscan_ptr) {
+ const __m128i zero = _mm_setzero_si128();
+ int index = 16;
+
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i eob, eob0;
+
+ (void)scan_ptr;
+
+ // Setup global values.
+ load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+ dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_coefficients(coeff_ptr);
+ coeff1 = load_coefficients(coeff_ptr + 8);
+
+ // Poor man's abs().
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ // Reinsert signs
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr);
+ store_coefficients(qcoeff1, qcoeff_ptr + 8);
+
+ coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+ store_coefficients(coeff0, dqcoeff_ptr);
+ store_coefficients(coeff1, dqcoeff_ptr + 8);
+
+ eob =
+ scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr, 0, zero);
+
+ // AC only loop.
+ while (index < n_coeffs) {
+ coeff0 = load_coefficients(coeff_ptr + index);
+ coeff1 = load_coefficients(coeff_ptr + index + 8);
+
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr + index);
+ store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
+
+ coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+ coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+ store_coefficients(coeff0, dqcoeff_ptr + index);
+ store_coefficients(coeff1, dqcoeff_ptr + index + 8);
+
+ eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan_ptr,
+ index, zero);
+ eob = _mm_max_epi16(eob, eob0);
+
+ index += 16;
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
diff --git a/third_party/aom/aom_dsp/x86/quantize_ssse3.c b/third_party/aom/aom_dsp/x86/quantize_ssse3.c
new file mode 100644
index 0000000000..25980a055a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/quantize_ssse3.c
@@ -0,0 +1,192 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <tmmintrin.h>
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/quantize_x86.h"
+
+static INLINE void calculate_qcoeff_64x64(__m128i *coeff, const __m128i round,
+ const __m128i quant,
+ const __m128i *shift) {
+ __m128i tmp, qcoeff, tmp1;
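+ // Same multiply-and-shift as calculate_qcoeff(), but for log_scale == 2 the
+ // final product is shifted right by 14 instead of 16, assembled from the
+ // mullo/mulhi halves.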
+ qcoeff = _mm_adds_epi16(*coeff, round);
+ tmp = _mm_mulhi_epi16(qcoeff, quant);
+ qcoeff = _mm_add_epi16(tmp, qcoeff);
+ tmp = _mm_mullo_epi16(qcoeff, *shift);
+ tmp = _mm_srli_epi16(tmp, 14);
+ tmp1 = _mm_mulhi_epi16(qcoeff, *shift);
+ tmp1 = _mm_slli_epi16(tmp1, 2);
+ *coeff = _mm_or_si128(tmp, tmp1);
+}
+
+static INLINE void calculate_dqcoeff_and_store_64x64(const __m128i qcoeff,
+ const __m128i dequant,
+ const __m128i zero,
+ tran_low_t *dqcoeff) {
+ // Un-sign so the shift below rounds toward zero, matching the C code.
+ const __m128i coeff = _mm_abs_epi16(qcoeff);
+
+ const __m128i sign_0 = _mm_unpacklo_epi16(zero, qcoeff);
+ const __m128i sign_1 = _mm_unpackhi_epi16(zero, qcoeff);
+
+ const __m128i low = _mm_mullo_epi16(coeff, dequant);
+ const __m128i high = _mm_mulhi_epi16(coeff, dequant);
+ __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high);
+ __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high);
+
+ // "Divide" by 4.
+ dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, 2);
+ dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, 2);
+
+ dqcoeff32_0 = _mm_sign_epi32(dqcoeff32_0, sign_0);
+ dqcoeff32_1 = _mm_sign_epi32(dqcoeff32_1, sign_1);
+
+ _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0);
+ _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1);
+}
+
+void aom_quantize_b_64x64_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ const int16_t *zbin_ptr,
+ const int16_t *round_ptr,
+ const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+ const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i two = _mm_set1_epi16(2);
+ int index;
+
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1, all_zero;
+ __m128i eob = zero, eob0;
+
+ (void)scan;
+ (void)n_coeffs;
+
+ // Setup global values.
+ zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ round = _mm_load_si128((const __m128i *)round_ptr);
+ quant = _mm_load_si128((const __m128i *)quant_ptr);
+ dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ shift = _mm_load_si128((const __m128i *)quant_shift_ptr);
+
+ // Shift with rounding.
+ zbin = _mm_add_epi16(zbin, two);
+ round = _mm_add_epi16(round, two);
+ zbin = _mm_srli_epi16(zbin, 2);
+ round = _mm_srli_epi16(round, 2);
+ zbin = _mm_sub_epi16(zbin, one);
+ // Do DC and first 15 AC.
+ coeff0 = load_coefficients(coeff_ptr);
+ coeff1 = load_coefficients(coeff_ptr + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ zbin = _mm_unpackhi_epi64(zbin, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ calculate_qcoeff_64x64(&qcoeff0, round, quant, &shift);
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ calculate_qcoeff_64x64(&qcoeff1, round, quant, &shift);
+
+ // Reinsert signs.
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ // Mask out zbin threshold coeffs.
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr);
+ store_coefficients(qcoeff1, qcoeff_ptr + 8);
+
+ calculate_dqcoeff_and_store_64x64(qcoeff0, dequant, zero, dqcoeff_ptr);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ calculate_dqcoeff_and_store_64x64(qcoeff1, dequant, zero, dqcoeff_ptr + 8);
+
+ eob =
+ scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ }
+
+ // AC only loop.
+ for (index = 16; index < 1024; index += 16) {
+ coeff0 = load_coefficients(coeff_ptr + index);
+ coeff1 = load_coefficients(coeff_ptr + index + 8);
+
+ qcoeff0 = _mm_abs_epi16(coeff0);
+ qcoeff1 = _mm_abs_epi16(coeff1);
+
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
+ continue;
+ }
+ calculate_qcoeff_64x64(&qcoeff0, round, quant, &shift);
+ calculate_qcoeff_64x64(&qcoeff1, round, quant, &shift);
+
+ qcoeff0 = _mm_sign_epi16(qcoeff0, coeff0);
+ qcoeff1 = _mm_sign_epi16(qcoeff1, coeff1);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr + index);
+ store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
+
+ calculate_dqcoeff_and_store_64x64(qcoeff0, dequant, zero,
+ dqcoeff_ptr + index);
+ calculate_dqcoeff_and_store_64x64(qcoeff1, dequant, zero,
+ dqcoeff_ptr + 8 + index);
+
+ eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
+ zero);
+ eob = _mm_max_epi16(eob, eob0);
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+}
diff --git a/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
new file mode 100644
index 0000000000..fa616a6f1a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/quantize_ssse3_x86_64.asm
@@ -0,0 +1,302 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_1: times 8 dw 1
+
+SECTION .text
+
+%macro QUANTIZE_FN 2
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \
+ shift, qcoeff, dqcoeff, dequant, \
+ eob, scan, iscan
+
+ ; actual quantize loop - setup pointers, rounders, etc.
+ movifnidn coeffq, coeffmp
+ movifnidn ncoeffq, ncoeffmp
+ movifnidn zbinq, zbinmp
+ movifnidn roundq, roundmp
+ movifnidn quantq, quantmp
+ movifnidn dequantq, dequantmp
+ mova m0, [zbinq] ; m0 = zbin
+ mova m1, [roundq] ; m1 = round
+ mova m2, [quantq] ; m2 = quant
+%ifidn %1, b_32x32
+ pcmpeqw m5, m5
+ psrlw m5, 15
+ paddw m0, m5
+ paddw m1, m5
+ psrlw m0, 1 ; m0 = (m0 + 1) / 2
+ psrlw m1, 1 ; m1 = (m1 + 1) / 2
+%endif
+ mova m3, [dequantq] ; m3 = dequant
+ mov r2, shiftmp
+ psubw m0, [GLOBAL(pw_1)]
+ mova m4, [r2] ; m4 = shift
+ mov r3, qcoeffmp
+ mov r4, dqcoeffmp
+ mov r5, iscanmp
+ pxor m5, m5 ; m5 = dedicated zero
+ DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob
+ lea coeffq, [ coeffq+ncoeffq*4]
+ lea qcoeffq, [ qcoeffq+ncoeffq*4]
+ lea dqcoeffq, [dqcoeffq+ncoeffq*4]
+ lea iscanq, [ iscanq+ncoeffq*2]
+ neg ncoeffq
+
+ ; get DC and first 15 AC coeffs
+ ; coeffs are stored as 32-bit numbers but 16-bit numbers are required
+ mova m9, [ coeffq+ncoeffq*4+ 0]
+ packssdw m9, [ coeffq+ncoeffq*4+16]
+ mova m10, [ coeffq+ncoeffq*4+32]
+ packssdw m10, [ coeffq+ncoeffq*4+48]
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+ pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
+ punpckhqdq m0, m0
+ pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
+ paddsw m6, m1 ; m6 += round
+ punpckhqdq m1, m1
+ paddsw m11, m1 ; m11 += round
+ pmulhw m8, m6, m2 ; m8 = m6*q>>16
+ punpckhqdq m2, m2
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ paddw m8, m6 ; m8 += m6
+ paddw m13, m11 ; m13 += m11
+ %ifidn %1, b_32x32
+ pmullw m5, m8, m4 ; store the lower 16 bits of m8*qsh
+ %endif
+ pmulhw m8, m4 ; m8 = m8*qsh>>16
+ %ifidn %1, b_32x32
+ psllw m8, 1
+ psrlw m5, 15
+ por m8, m5
+ %endif
+ punpckhqdq m4, m4
+ %ifidn %1, b_32x32
+ pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh
+ %endif
+ pmulhw m13, m4 ; m13 = m13*qsh>>16
+ %ifidn %1, b_32x32
+ psllw m13, 1
+ psrlw m5, 15
+ por m13, m5
+ pxor m5, m5 ; reset m5 to zero register
+ %endif
+ psignw m8, m9 ; m8 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ pand m8, m7
+ pand m13, m12
+
+ ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+ mova m11, m8
+ mova m6, m8
+ pcmpgtw m5, m8
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [qcoeffq+ncoeffq*4+ 0], m11
+ mova [qcoeffq+ncoeffq*4+16], m6
+ pxor m5, m5
+ mova m11, m13
+ mova m6, m13
+ pcmpgtw m5, m13
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [qcoeffq+ncoeffq*4+32], m11
+ mova [qcoeffq+ncoeffq*4+48], m6
+ pxor m5, m5 ; reset m5 to zero register
+
+%ifidn %1, b_32x32
+ pabsw m8, m8
+ pabsw m13, m13
+%endif
+ pmullw m8, m3 ; dqc[i] = qc[i] * q
+ punpckhqdq m3, m3
+ pmullw m13, m3 ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+ psrlw m8, 1
+ psrlw m13, 1
+ psignw m8, m9
+ psignw m13, m10
+%endif
+ ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+ mova m11, m8
+ mova m6, m8
+ pcmpgtw m5, m8
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [dqcoeffq+ncoeffq*4+ 0], m11
+ mova [dqcoeffq+ncoeffq*4+16], m6
+ pxor m5, m5
+ mova m11, m13
+ mova m6, m13
+ pcmpgtw m5, m13
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [dqcoeffq+ncoeffq*4+32], m11
+ mova [dqcoeffq+ncoeffq*4+48], m6
+ pxor m5, m5 ; reset m5 to zero register
+ pcmpeqw m8, m5 ; m8 = c[i] == 0
+ pcmpeqw m13, m5 ; m13 = c[i] == 0
+ mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
+ mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i]
+ psubw m6, m7 ; m6 = scan[i] + 1
+ psubw m11, m12 ; m11 = scan[i] + 1
+ pandn m8, m6 ; m8 = max(eob)
+ pandn m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m13
+ add ncoeffq, mmsize
+ jz .accumulate_eob
+
+.ac_only_loop:
+ ; pack coeff from 32bit to 16bit array
+ mova m9, [ coeffq+ncoeffq*4+ 0]
+ packssdw m9, [ coeffq+ncoeffq*4+16]
+ mova m10, [ coeffq+ncoeffq*4+32]
+ packssdw m10, [ coeffq+ncoeffq*4+48]
+
+ pabsw m6, m9 ; m6 = abs(m9)
+ pabsw m11, m10 ; m11 = abs(m10)
+ pcmpgtw m7, m6, m0 ; m7 = c[i] >= zbin
+ pcmpgtw m12, m11, m0 ; m12 = c[i] >= zbin
+%ifidn %1, b_32x32
+ pmovmskb r6d, m7
+ pmovmskb r2d, m12
+ or r6, r2
+ jz .skip_iter
+%endif
+ paddsw m6, m1 ; m6 += round
+ paddsw m11, m1 ; m11 += round
+ pmulhw m14, m6, m2 ; m14 = m6*q>>16
+ pmulhw m13, m11, m2 ; m13 = m11*q>>16
+ paddw m14, m6 ; m14 += m6
+ paddw m13, m11 ; m13 += m11
+ %ifidn %1, b_32x32
+ pmullw m5, m14, m4 ; store the lower 16 bits of m14*qsh
+ %endif
+ pmulhw m14, m4 ; m14 = m14*qsh>>16
+ %ifidn %1, b_32x32
+ psllw m14, 1
+ psrlw m5, 15
+ por m14, m5
+ pmullw m5, m13, m4 ; store the lower 16 bits of m13*qsh
+ %endif
+ pmulhw m13, m4 ; m13 = m13*qsh>>16
+ %ifidn %1, b_32x32
+ psllw m13, 1
+ psrlw m5, 15
+ por m13, m5
+ pxor m5, m5 ; reset m5 to zero register
+ %endif
+ psignw m14, m9 ; m14 = reinsert sign
+ psignw m13, m10 ; m13 = reinsert sign
+ pand m14, m7
+ pand m13, m12
+
+ ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
+ pxor m11, m11
+ mova m11, m14
+ mova m6, m14
+ pcmpgtw m5, m14
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [qcoeffq+ncoeffq*4+ 0], m11
+ mova [qcoeffq+ncoeffq*4+16], m6
+ pxor m5, m5
+ mova m11, m13
+ mova m6, m13
+ pcmpgtw m5, m13
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [qcoeffq+ncoeffq*4+32], m11
+ mova [qcoeffq+ncoeffq*4+48], m6
+ pxor m5, m5 ; reset m5 to zero register
+
+%ifidn %1, b_32x32
+ pabsw m14, m14
+ pabsw m13, m13
+%endif
+ pmullw m14, m3 ; dqc[i] = qc[i] * q
+ pmullw m13, m3 ; dqc[i] = qc[i] * q
+%ifidn %1, b_32x32
+ psrlw m14, 1
+ psrlw m13, 1
+ psignw m14, m9
+ psignw m13, m10
+%endif
+
+ ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
+ mova m11, m14
+ mova m6, m14
+ pcmpgtw m5, m14
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [dqcoeffq+ncoeffq*4+ 0], m11
+ mova [dqcoeffq+ncoeffq*4+16], m6
+ pxor m5, m5
+ mova m11, m13
+ mova m6, m13
+ pcmpgtw m5, m13
+ punpcklwd m11, m5
+ punpckhwd m6, m5
+ mova [dqcoeffq+ncoeffq*4+32], m11
+ mova [dqcoeffq+ncoeffq*4+48], m6
+ pxor m5, m5
+
+ pcmpeqw m14, m5 ; m14 = c[i] == 0
+ pcmpeqw m13, m5 ; m13 = c[i] == 0
+ mova m6, [ iscanq+ncoeffq*2+ 0] ; m6 = scan[i]
+ mova m11, [ iscanq+ncoeffq*2+16] ; m11 = scan[i]
+ psubw m6, m7 ; m6 = scan[i] + 1
+ psubw m11, m12 ; m11 = scan[i] + 1
+ pandn m14, m6 ; m14 = max(eob)
+ pandn m13, m11 ; m13 = max(eob)
+ pmaxsw m8, m14
+ pmaxsw m8, m13
+ add ncoeffq, mmsize
+ jl .ac_only_loop
+
+%ifidn %1, b_32x32
+ jmp .accumulate_eob
+.skip_iter:
+ mova [qcoeffq+ncoeffq*4+ 0], m5
+ mova [qcoeffq+ncoeffq*4+16], m5
+ mova [qcoeffq+ncoeffq*4+32], m5
+ mova [qcoeffq+ncoeffq*4+48], m5
+ mova [dqcoeffq+ncoeffq*4+ 0], m5
+ mova [dqcoeffq+ncoeffq*4+16], m5
+ mova [dqcoeffq+ncoeffq*4+32], m5
+ mova [dqcoeffq+ncoeffq*4+48], m5
+ add ncoeffq, mmsize
+ jl .ac_only_loop
+%endif
+
+.accumulate_eob:
+ ; horizontally accumulate/max eobs and write into [eob] memory pointer
+ mov r2, eobmp
+ pshufd m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0xe
+ pmaxsw m8, m7
+ pshuflw m7, m8, 0x1
+ pmaxsw m8, m7
+ pextrw r6, m8, 0
+ mov [r2], r6
+ RET
+%endmacro
+
+INIT_XMM ssse3
+QUANTIZE_FN b, 9
+QUANTIZE_FN b_32x32, 9
diff --git a/third_party/aom/aom_dsp/x86/quantize_x86.h b/third_party/aom/aom_dsp/x86/quantize_x86.h
new file mode 100644
index 0000000000..5b040a278a
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/quantize_x86.h
@@ -0,0 +1,202 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "aom/aom_integer.h"
+
+static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin,
+ const int16_t *round_ptr, __m128i *round,
+ const int16_t *quant_ptr, __m128i *quant,
+ const int16_t *dequant_ptr, __m128i *dequant,
+ const int16_t *shift_ptr, __m128i *shift) {
+ *zbin = _mm_load_si128((const __m128i *)zbin_ptr);
+ *round = _mm_load_si128((const __m128i *)round_ptr);
+ *quant = _mm_load_si128((const __m128i *)quant_ptr);
+ *zbin = _mm_sub_epi16(*zbin, _mm_set1_epi16(1));
+ *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ *shift = _mm_load_si128((const __m128i *)shift_ptr);
+}
+
+// With ssse3 and later abs() and sign() are preferred.
+static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) {
+ a = _mm_xor_si128(a, sign);
+ return _mm_sub_epi16(a, sign);
+}
+
+static INLINE __m128i invert_sign_32_sse2(__m128i a, __m128i sign) {
+ a = _mm_xor_si128(a, sign);
+ return _mm_sub_epi32(a, sign);
+}
+
+static INLINE void calculate_qcoeff(__m128i *coeff, const __m128i round,
+ const __m128i quant, const __m128i shift) {
+ __m128i tmp, qcoeff;
+ qcoeff = _mm_adds_epi16(*coeff, round);
+ tmp = _mm_mulhi_epi16(qcoeff, quant);
+ qcoeff = _mm_add_epi16(tmp, qcoeff);
+ *coeff = _mm_mulhi_epi16(qcoeff, shift);
+}
+
+static INLINE void calculate_qcoeff_log_scale(__m128i *coeff,
+ const __m128i round,
+ const __m128i quant,
+ const __m128i *shift,
+ const int *log_scale) {
+ __m128i tmp, tmp1, qcoeff;
+ qcoeff = _mm_adds_epi16(*coeff, round);
+ tmp = _mm_mulhi_epi16(qcoeff, quant);
+ qcoeff = _mm_add_epi16(tmp, qcoeff);
+ tmp = _mm_mullo_epi16(qcoeff, *shift);
+ tmp = _mm_srli_epi16(tmp, (16 - *log_scale));
+ tmp1 = _mm_mulhi_epi16(qcoeff, *shift);
+ tmp1 = _mm_slli_epi16(tmp1, *log_scale);
+ *coeff = _mm_or_si128(tmp, tmp1);
+}
+
+static INLINE __m128i calculate_dqcoeff(__m128i qcoeff, __m128i dequant) {
+ return _mm_mullo_epi16(qcoeff, dequant);
+}
+
+static INLINE void calculate_dqcoeff_and_store_log_scale(__m128i qcoeff,
+ __m128i dequant,
+ const __m128i zero,
+ tran_low_t *dqcoeff,
+ const int *log_scale) {
+ // calculate abs
+ __m128i coeff_sign = _mm_srai_epi16(qcoeff, 15);
+ __m128i coeff = invert_sign_sse2(qcoeff, coeff_sign);
+
+ const __m128i sign_0 = _mm_unpacklo_epi16(coeff_sign, zero);
+ const __m128i sign_1 = _mm_unpackhi_epi16(coeff_sign, zero);
+
+ const __m128i low = _mm_mullo_epi16(coeff, dequant);
+ const __m128i high = _mm_mulhi_epi16(coeff, dequant);
+ __m128i dqcoeff32_0 = _mm_unpacklo_epi16(low, high);
+ __m128i dqcoeff32_1 = _mm_unpackhi_epi16(low, high);
+
+ dqcoeff32_0 = _mm_srli_epi32(dqcoeff32_0, *log_scale);
+ dqcoeff32_1 = _mm_srli_epi32(dqcoeff32_1, *log_scale);
+
+ dqcoeff32_0 = invert_sign_32_sse2(dqcoeff32_0, sign_0);
+ dqcoeff32_1 = invert_sign_32_sse2(dqcoeff32_1, sign_1);
+
+ _mm_store_si128((__m128i *)(dqcoeff), dqcoeff32_0);
+ _mm_store_si128((__m128i *)(dqcoeff + 4), dqcoeff32_1);
+}
+
+// Scan 16 values for the eob position using the indices in scan_ptr. The -1
+// masks from the zbin comparison are used to add 1 to the index in 'scan'.
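+// For example, a nonzero quantized coefficient at iscan value 5 contributes 6
+// to the running maximum, so the result is already a count rather than an
+// index.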
+static INLINE __m128i scan_for_eob(__m128i *coeff0, __m128i *coeff1,
+ const __m128i zbin_mask0,
+ const __m128i zbin_mask1,
+ const int16_t *scan_ptr, const int index,
+ const __m128i zero) {
+ const __m128i zero_coeff0 = _mm_cmpeq_epi16(*coeff0, zero);
+ const __m128i zero_coeff1 = _mm_cmpeq_epi16(*coeff1, zero);
+ __m128i scan0 = _mm_load_si128((const __m128i *)(scan_ptr + index));
+ __m128i scan1 = _mm_load_si128((const __m128i *)(scan_ptr + index + 8));
+ __m128i eob0, eob1;
+ // Add one to convert from indices to counts
+ scan0 = _mm_sub_epi16(scan0, zbin_mask0);
+ scan1 = _mm_sub_epi16(scan1, zbin_mask1);
+ eob0 = _mm_andnot_si128(zero_coeff0, scan0);
+ eob1 = _mm_andnot_si128(zero_coeff1, scan1);
+ return _mm_max_epi16(eob0, eob1);
+}
+
+static INLINE int16_t accumulate_eob(__m128i eob) {
+ __m128i eob_shuffled;
+ eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
+ eob = _mm_max_epi16(eob, eob_shuffled);
+ return _mm_extract_epi16(eob, 1);
+}
+
+static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
+ assert(sizeof(tran_low_t) == 4);
+ const __m128i coeff1 = _mm_load_si128((__m128i *)(coeff_ptr));
+ const __m128i coeff2 = _mm_load_si128((__m128i *)(coeff_ptr + 4));
+ return _mm_packs_epi32(coeff1, coeff2);
+}
+
+static INLINE void store_coefficients(__m128i coeff_vals,
+ tran_low_t *coeff_ptr) {
+ assert(sizeof(tran_low_t) == 4);
+
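+ // Multiplying by 1 and combining the mullo/mulhi halves sign-extends each
+ // 16-bit value to 32 bits without needing SSE4's pmovsxwd.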
+ __m128i one = _mm_set1_epi16(1);
+ __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
+ __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
+ __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
+ __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
+ _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);
+ _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
+}
+
+static INLINE void update_mask1(__m128i *cmp_mask0, __m128i *cmp_mask1,
+ const int16_t *iscan_ptr, int *is_found,
+ __m128i *mask) {
+ __m128i all_zero;
+ __m128i temp_mask = _mm_setzero_si128();
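+ // Track the largest iscan value among positions whose comparison mask is
+ // set.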
+ all_zero = _mm_or_si128(*cmp_mask0, *cmp_mask1);
+ if (_mm_movemask_epi8(all_zero)) {
+ __m128i iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr));
+ __m128i mask0 = _mm_and_si128(*cmp_mask0, iscan0);
+ __m128i iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + 8));
+ __m128i mask1 = _mm_and_si128(*cmp_mask1, iscan1);
+ temp_mask = _mm_max_epi16(mask0, mask1);
+ *is_found = 1;
+ }
+ *mask = _mm_max_epi16(temp_mask, *mask);
+}
+
+static INLINE void update_mask0(__m128i *qcoeff0, __m128i *qcoeff1,
+ __m128i *threshold, const int16_t *iscan_ptr,
+ int *is_found, __m128i *mask) {
+ __m128i zero = _mm_setzero_si128();
+ __m128i coeff[4], cmp_mask0, cmp_mask1, cmp_mask2, cmp_mask3;
+
+ coeff[0] = _mm_unpacklo_epi16(*qcoeff0, zero);
+ coeff[1] = _mm_unpackhi_epi16(*qcoeff0, zero);
+ coeff[2] = _mm_unpacklo_epi16(*qcoeff1, zero);
+ coeff[3] = _mm_unpackhi_epi16(*qcoeff1, zero);
+
+ coeff[0] = _mm_slli_epi32(coeff[0], AOM_QM_BITS);
+ cmp_mask0 = _mm_cmpgt_epi32(coeff[0], threshold[0]);
+ coeff[1] = _mm_slli_epi32(coeff[1], AOM_QM_BITS);
+ cmp_mask1 = _mm_cmpgt_epi32(coeff[1], threshold[1]);
+ coeff[2] = _mm_slli_epi32(coeff[2], AOM_QM_BITS);
+ cmp_mask2 = _mm_cmpgt_epi32(coeff[2], threshold[1]);
+ coeff[3] = _mm_slli_epi32(coeff[3], AOM_QM_BITS);
+ cmp_mask3 = _mm_cmpgt_epi32(coeff[3], threshold[1]);
+
+ cmp_mask0 = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+ cmp_mask1 = _mm_packs_epi32(cmp_mask2, cmp_mask3);
+
+ update_mask1(&cmp_mask0, &cmp_mask1, iscan_ptr, is_found, mask);
+}
+
+static INLINE int calculate_non_zero_count(__m128i mask) {
+ __m128i mask0, mask1;
+ int non_zero_count = 0;
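+ // Horizontal max of the iscan-based mask; adding 1 turns the last non-zero
+ // scan index into a count.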
+ mask0 = _mm_unpackhi_epi64(mask, mask);
+ mask1 = _mm_max_epi16(mask0, mask);
+ mask0 = _mm_shuffle_epi32(mask1, 1);
+ mask0 = _mm_max_epi16(mask0, mask1);
+ mask1 = _mm_srli_epi32(mask0, 16);
+ mask0 = _mm_max_epi16(mask0, mask1);
+ non_zero_count = _mm_extract_epi16(mask0, 0) + 1;
+
+ return non_zero_count;
+}
diff --git a/third_party/aom/aom_dsp/x86/sad4d_avx2.c b/third_party/aom/aom_dsp/x86/sad4d_avx2.c
new file mode 100644
index 0000000000..0fea6ddfd3
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sad4d_avx2.c
@@ -0,0 +1,326 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h> // AVX2
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+static AOM_FORCE_INLINE void aggregate_and_store_sum(uint32_t res[4],
+ const __m256i *sum_ref0,
+ const __m256i *sum_ref1,
+ const __m256i *sum_ref2,
+ const __m256i *sum_ref3) {
+ // In each sum_ref<i>, every 64-bit lane holds a partial SAD in its low 4
+ // bytes; the upper 4 bytes are zero.
+ // Merge sum_ref0 with sum_ref1, and sum_ref2 with sum_ref3.
+ // 0, 0, 1, 1
+ __m256i sum_ref01 = _mm256_castps_si256(_mm256_shuffle_ps(
+ _mm256_castsi256_ps(*sum_ref0), _mm256_castsi256_ps(*sum_ref1),
+ _MM_SHUFFLE(2, 0, 2, 0)));
+ // 2, 2, 3, 3
+ __m256i sum_ref23 = _mm256_castps_si256(_mm256_shuffle_ps(
+ _mm256_castsi256_ps(*sum_ref2), _mm256_castsi256_ps(*sum_ref3),
+ _MM_SHUFFLE(2, 0, 2, 0)));
+
+ // sum adjacent 32 bit integers
+ __m256i sum_ref0123 = _mm256_hadd_epi32(sum_ref01, sum_ref23);
+
+ // add the low 128 bit to the high 128 bit
+ __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(sum_ref0123),
+ _mm256_extractf128_si256(sum_ref0123, 1));
+
+ _mm_storeu_si128((__m128i *)(res), sum);
+}
+
+static AOM_FORCE_INLINE void aom_sadMxNx4d_avx2(
+ int M, int N, const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride, uint32_t res[4]) {
+ __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
+ __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
+ int i, j;
+ const uint8_t *ref0, *ref1, *ref2, *ref3;
+
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ ref3 = ref[3];
+ sum_ref0 = _mm256_setzero_si256();
+ sum_ref2 = _mm256_setzero_si256();
+ sum_ref1 = _mm256_setzero_si256();
+ sum_ref3 = _mm256_setzero_si256();
+
+ for (i = 0; i < N; i++) {
+ for (j = 0; j < M; j += 32) {
+ // load src and all refs
+ src_reg = _mm256_loadu_si256((const __m256i *)(src + j));
+ ref0_reg = _mm256_loadu_si256((const __m256i *)(ref0 + j));
+ ref1_reg = _mm256_loadu_si256((const __m256i *)(ref1 + j));
+ ref2_reg = _mm256_loadu_si256((const __m256i *)(ref2 + j));
+ ref3_reg = _mm256_loadu_si256((const __m256i *)(ref3 + j));
+
+ // sum of the absolute differences between every ref-i to src
+ ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
+ ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
+ ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
+ ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
+ // sum every ref-i
+ sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
+ sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
+ sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
+ sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
+ }
+ src += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
+ ref3 += ref_stride;
+ }
+
+ aggregate_and_store_sum(res, &sum_ref0, &sum_ref1, &sum_ref2, &sum_ref3);
+}
+
+static AOM_FORCE_INLINE void aom_sadMxNx3d_avx2(
+ int M, int N, const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride, uint32_t res[4]) {
+ __m256i src_reg, ref0_reg, ref1_reg, ref2_reg;
+ __m256i sum_ref0, sum_ref1, sum_ref2;
+ int i, j;
+ const uint8_t *ref0, *ref1, *ref2;
+ const __m256i zero = _mm256_setzero_si256();
+
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ sum_ref0 = _mm256_setzero_si256();
+ sum_ref2 = _mm256_setzero_si256();
+ sum_ref1 = _mm256_setzero_si256();
+
+ for (i = 0; i < N; i++) {
+ for (j = 0; j < M; j += 32) {
+ // load src and all refs
+ src_reg = _mm256_loadu_si256((const __m256i *)(src + j));
+ ref0_reg = _mm256_loadu_si256((const __m256i *)(ref0 + j));
+ ref1_reg = _mm256_loadu_si256((const __m256i *)(ref1 + j));
+ ref2_reg = _mm256_loadu_si256((const __m256i *)(ref2 + j));
+
+ // sum of the absolute differences between every ref-i to src
+ ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
+ ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
+ ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
+ // sum every ref-i
+ sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
+ sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
+ sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
+ }
+ src += src_stride;
+ ref0 += ref_stride;
+ ref1 += ref_stride;
+ ref2 += ref_stride;
+ }
+ aggregate_and_store_sum(res, &sum_ref0, &sum_ref1, &sum_ref2, &zero);
+}
+
+#define SADMXN_AVX2(m, n) \
+ void aom_sad##m##x##n##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], int ref_stride, \
+ uint32_t res[4]) { \
+ aom_sadMxNx4d_avx2(m, n, src, src_stride, ref, ref_stride, res); \
+ } \
+ void aom_sad##m##x##n##x3d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], int ref_stride, \
+ uint32_t res[4]) { \
+ aom_sadMxNx3d_avx2(m, n, src, src_stride, ref, ref_stride, res); \
+ }
+
+SADMXN_AVX2(32, 8)
+SADMXN_AVX2(32, 16)
+SADMXN_AVX2(32, 32)
+SADMXN_AVX2(32, 64)
+
+SADMXN_AVX2(64, 16)
+SADMXN_AVX2(64, 32)
+SADMXN_AVX2(64, 64)
+SADMXN_AVX2(64, 128)
+
+SADMXN_AVX2(128, 64)
+SADMXN_AVX2(128, 128)
+
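+// The skip variants approximate the full SAD by sampling every other row
+// (doubled strides, half the height) and doubling the result.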
+#define SAD_SKIP_MXN_AVX2(m, n) \
+ void aom_sad_skip_##m##x##n##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], \
+ int ref_stride, uint32_t res[4]) { \
+ aom_sadMxNx4d_avx2(m, ((n) >> 1), src, 2 * src_stride, ref, \
+ 2 * ref_stride, res); \
+ res[0] <<= 1; \
+ res[1] <<= 1; \
+ res[2] <<= 1; \
+ res[3] <<= 1; \
+ }
+
+SAD_SKIP_MXN_AVX2(32, 8)
+SAD_SKIP_MXN_AVX2(32, 16)
+SAD_SKIP_MXN_AVX2(32, 32)
+SAD_SKIP_MXN_AVX2(32, 64)
+
+SAD_SKIP_MXN_AVX2(64, 16)
+SAD_SKIP_MXN_AVX2(64, 32)
+SAD_SKIP_MXN_AVX2(64, 64)
+SAD_SKIP_MXN_AVX2(64, 128)
+
+SAD_SKIP_MXN_AVX2(128, 64)
+SAD_SKIP_MXN_AVX2(128, 128)
+
+static AOM_FORCE_INLINE void aom_sad16xNx3d_avx2(int N, const uint8_t *src,
+ int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride,
+ uint32_t res[4]) {
+ __m256i src_reg, ref0_reg, ref1_reg, ref2_reg;
+ __m256i sum_ref0, sum_ref1, sum_ref2;
+ const uint8_t *ref0, *ref1, *ref2;
+ const __m256i zero = _mm256_setzero_si256();
+ assert(N % 2 == 0);
+
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ sum_ref0 = _mm256_setzero_si256();
+ sum_ref2 = _mm256_setzero_si256();
+ sum_ref1 = _mm256_setzero_si256();
+
+ for (int i = 0; i < N; i += 2) {
+ // load src and all refs
+ src_reg = yy_loadu2_128(src + src_stride, src);
+ ref0_reg = yy_loadu2_128(ref0 + ref_stride, ref0);
+ ref1_reg = yy_loadu2_128(ref1 + ref_stride, ref1);
+ ref2_reg = yy_loadu2_128(ref2 + ref_stride, ref2);
+
+ // sum of the absolute differences between every ref-i to src
+ ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
+ ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
+ ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
+
+ // sum every ref-i
+ sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
+ sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
+ sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
+
+ src += 2 * src_stride;
+ ref0 += 2 * ref_stride;
+ ref1 += 2 * ref_stride;
+ ref2 += 2 * ref_stride;
+ }
+
+ aggregate_and_store_sum(res, &sum_ref0, &sum_ref1, &sum_ref2, &zero);
+}
+
+static AOM_FORCE_INLINE void aom_sad16xNx4d_avx2(int N, const uint8_t *src,
+ int src_stride,
+ const uint8_t *const ref[4],
+ int ref_stride,
+ uint32_t res[4]) {
+ __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
+ __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
+ const uint8_t *ref0, *ref1, *ref2, *ref3;
+ assert(N % 2 == 0);
+
+ ref0 = ref[0];
+ ref1 = ref[1];
+ ref2 = ref[2];
+ ref3 = ref[3];
+
+ sum_ref0 = _mm256_setzero_si256();
+ sum_ref2 = _mm256_setzero_si256();
+ sum_ref1 = _mm256_setzero_si256();
+ sum_ref3 = _mm256_setzero_si256();
+
+ for (int i = 0; i < N; i += 2) {
+ // load src and all refs
+ src_reg = yy_loadu2_128(src + src_stride, src);
+ ref0_reg = yy_loadu2_128(ref0 + ref_stride, ref0);
+ ref1_reg = yy_loadu2_128(ref1 + ref_stride, ref1);
+ ref2_reg = yy_loadu2_128(ref2 + ref_stride, ref2);
+ ref3_reg = yy_loadu2_128(ref3 + ref_stride, ref3);
+
+ // sum of the absolute differences between every ref-i to src
+ ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
+ ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
+ ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
+ ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
+
+ // sum every ref-i
+ sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
+ sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
+ sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
+ sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
+
+ src += 2 * src_stride;
+ ref0 += 2 * ref_stride;
+ ref1 += 2 * ref_stride;
+ ref2 += 2 * ref_stride;
+ ref3 += 2 * ref_stride;
+ }
+
+ aggregate_and_store_sum(res, &sum_ref0, &sum_ref1, &sum_ref2, &sum_ref3);
+}
+
+#define SAD16XNX3_AVX2(n) \
+ void aom_sad16x##n##x3d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], int ref_stride, \
+ uint32_t res[4]) { \
+ aom_sad16xNx3d_avx2(n, src, src_stride, ref, ref_stride, res); \
+ }
+#define SAD16XNX4_AVX2(n) \
+ void aom_sad16x##n##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], int ref_stride, \
+ uint32_t res[4]) { \
+ aom_sad16xNx4d_avx2(n, src, src_stride, ref, ref_stride, res); \
+ }
+
+SAD16XNX4_AVX2(32)
+SAD16XNX4_AVX2(16)
+SAD16XNX4_AVX2(8)
+
+SAD16XNX3_AVX2(32)
+SAD16XNX3_AVX2(16)
+SAD16XNX3_AVX2(8)
+
+#if !CONFIG_REALTIME_ONLY
+SAD16XNX3_AVX2(64)
+SAD16XNX3_AVX2(4)
+
+SAD16XNX4_AVX2(64)
+SAD16XNX4_AVX2(4)
+
+#endif // !CONFIG_REALTIME_ONLY
+
+#define SAD_SKIP_16XN_AVX2(n) \
+ void aom_sad_skip_16x##n##x4d_avx2(const uint8_t *src, int src_stride, \
+ const uint8_t *const ref[4], \
+ int ref_stride, uint32_t res[4]) { \
+ aom_sad16xNx4d_avx2(((n) >> 1), src, 2 * src_stride, ref, 2 * ref_stride, \
+ res); \
+ res[0] <<= 1; \
+ res[1] <<= 1; \
+ res[2] <<= 1; \
+ res[3] <<= 1; \
+ }
+
+SAD_SKIP_16XN_AVX2(32)
+SAD_SKIP_16XN_AVX2(16)
+SAD_SKIP_16XN_AVX2(8)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_SKIP_16XN_AVX2(64)
+SAD_SKIP_16XN_AVX2(4)
+#endif // !CONFIG_REALTIME_ONLY
diff --git a/third_party/aom/aom_dsp/x86/sad4d_sse2.asm b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm
new file mode 100644
index 0000000000..6edad99516
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sad4d_sse2.asm
@@ -0,0 +1,437 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; 'spill_src_stride' has a large effect on how the code works.
+;
+; When 'spill_src_stride' is false, 'src_strideq' resides in a register,
+; so the addressing form [srcq + src_strideq + offset] is allowed. We can
+; use that form to access src memory directly and do not need to update
+; 'srcq' on every line; 'srcq' is only advanced every two lines with a
+; compact LEA instruction like [srcq+src_strideq*2].
+;
+; When 'spill_src_stride' is true, 'src_strideq' resides in memory, so the
+; form above cannot be used and 'srcq' has to be updated at every line
+; break. Since each macro function processes two parts (first, second)
+; together, the second part may sit on the next line, which means we may
+; also need to add one 'src_strideq' to 'srcq' before processing the
+; second part.
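+;
+; For example, with the stride in a register the second line is read with
+;     movd m1, [srcq + src_strideq]
+; whereas with the stride spilled to memory HANDLE_SECOND_OFFSET first does
+;     add srcq, src_strideq
+; and the second line is then read from [srcq] directly.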
+
+%macro HANDLE_SECOND_OFFSET 0
+ %if spill_src_stride
+ %define second_offset 0
+ add srcq, src_strideq
+ %else
+ %define second_offset (src_strideq)
+ %endif
+%endmacro
+
+; This is specifically designed to handle the case where src_strideq is a
+; memory operand; in that case we cannot do the more complex address
+; calculation with LEA and fall back to a simple ADD instruction at the
+; end of each line.
+%macro ADVANCE_END_OF_TWO_LINES 0
+ %if spill_src_stride
+ add srcq, src_strideq
+ %else
+ lea srcq, [srcq+src_strideq*2]
+ %endif
+
+; note: ref_stride is never spilled when processing two lines
+ lea ref1q, [ref1q+ref_strideq*2]
+ lea ref2q, [ref2q+ref_strideq*2]
+ lea ref3q, [ref3q+ref_strideq*2]
+ lea ref4q, [ref4q+ref_strideq*2]
+%endmacro
+
+; PROCESS_4x2x4 first
+%macro PROCESS_4x2x4 1
+ movd m0, [srcq]
+ HANDLE_SECOND_OFFSET
+%if %1 == 1
+ movd m6, [ref1q]
+ movd m4, [ref2q]
+ movd m7, [ref3q]
+ movd m5, [ref4q]
+
+ movd m1, [srcq + second_offset]
+ movd m2, [ref1q+ref_strideq]
+ punpckldq m0, m1
+ punpckldq m6, m2
+ movd m1, [ref2q+ref_strideq]
+ movd m2, [ref3q+ref_strideq]
+ movd m3, [ref4q+ref_strideq]
+ punpckldq m4, m1
+ punpckldq m7, m2
+ punpckldq m5, m3
+ movlhps m0, m0
+ movlhps m6, m4
+ movlhps m7, m5
+ psadbw m6, m0
+ psadbw m7, m0
+%else
+ movd m1, [ref1q]
+ movd m5, [ref1q+ref_strideq]
+ movd m2, [ref2q]
+ movd m4, [ref2q+ref_strideq]
+ punpckldq m1, m5
+ punpckldq m2, m4
+ movd m3, [ref3q]
+ movd m5, [ref3q+ref_strideq]
+ punpckldq m3, m5
+ movd m4, [ref4q]
+ movd m5, [ref4q+ref_strideq]
+ punpckldq m4, m5
+ movd m5, [srcq + second_offset]
+ punpckldq m0, m5
+ movlhps m0, m0
+ movlhps m1, m2
+ movlhps m3, m4
+ psadbw m1, m0
+ psadbw m3, m0
+ paddd m6, m1
+ paddd m7, m3
+%endif
+%endmacro
+
+; PROCESS_8x2x4 first
+%macro PROCESS_8x2x4 1
+ movh m0, [srcq]
+ HANDLE_SECOND_OFFSET
+%if %1 == 1
+ movh m4, [ref1q]
+ movh m5, [ref2q]
+ movh m6, [ref3q]
+ movh m7, [ref4q]
+ movhps m0, [srcq + second_offset]
+ movhps m4, [ref1q+ref_strideq]
+ movhps m5, [ref2q+ref_strideq]
+ movhps m6, [ref3q+ref_strideq]
+ movhps m7, [ref4q+ref_strideq]
+ psadbw m4, m0
+ psadbw m5, m0
+ psadbw m6, m0
+ psadbw m7, m0
+%else
+ movh m1, [ref1q]
+ movh m2, [ref2q]
+ movhps m0, [srcq + second_offset]
+ movhps m1, [ref1q+ref_strideq]
+ movhps m2, [ref2q+ref_strideq]
+ psadbw m1, m0
+ psadbw m2, m0
+ paddd m4, m1
+ paddd m5, m2
+
+ movh m1, [ref3q]
+ movhps m1, [ref3q+ref_strideq]
+ movh m2, [ref4q]
+ movhps m2, [ref4q+ref_strideq]
+ psadbw m1, m0
+ psadbw m2, m0
+ paddd m6, m1
+ paddd m7, m2
+%endif
+%endmacro
+
+; PROCESS_FIRST_MMSIZE
+%macro PROCESS_FIRST_MMSIZE 0
+ mova m0, [srcq]
+ movu m4, [ref1q]
+ movu m5, [ref2q]
+ movu m6, [ref3q]
+ movu m7, [ref4q]
+ psadbw m4, m0
+ psadbw m5, m0
+ psadbw m6, m0
+ psadbw m7, m0
+%endmacro
+
+; PROCESS_16x1x4 offset
+%macro PROCESS_16x1x4 1
+ mova m0, [srcq + %1]
+ movu m1, [ref1q + ref_offsetq + %1]
+ movu m2, [ref2q + ref_offsetq + %1]
+ psadbw m1, m0
+ psadbw m2, m0
+ paddd m4, m1
+ paddd m5, m2
+
+ movu m1, [ref3q + ref_offsetq + %1]
+ movu m2, [ref4q + ref_offsetq + %1]
+ psadbw m1, m0
+ psadbw m2, m0
+ paddd m6, m1
+ paddd m7, m2
+%endmacro
+
+; void aom_sadNxNx4d_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref[4], int ref_stride,
+; uint32_t res[4]);
+; Macro Arguments:
+; 1: Width
+; 2: Height
+; 3: If 0, then normal sad, else skip rows
+%macro SADNXN4D 2-3 0
+
+%define spill_src_stride 0
+%define spill_ref_stride 0
+%define spill_cnt 0
+
+; Whether a shared offset should be used instead of adding strides to
+; each reference array. With this option, only one line will be processed
+; per loop iteration.
+%define use_ref_offset (%1 >= mmsize)
+
+; Remove loops in the 4x4 and 8x4 case
+%define use_loop (use_ref_offset || %2 > 4)
+
+%if %3 == 1 ; skip rows
+%if AOM_ARCH_X86_64
+%if use_ref_offset
+cglobal sad_skip_%1x%2x4d, 5, 10, 8, src, src_stride, ref1, ref_stride, res, \
+ ref2, ref3, ref4, cnt, ref_offset
+%elif use_loop
+cglobal sad_skip_%1x%2x4d, 5, 9, 8, src, src_stride, ref1, ref_stride, res, \
+ ref2, ref3, ref4, cnt
+%else
+cglobal sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, res, \
+ ref2, ref3, ref4
+%endif
+%else
+%if use_ref_offset
+cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, ref_offset, ref1, cnt, ref2, ref3, \
+ ref4
+%define spill_src_stride 1
+%define spill_ref_stride 1
+%elif use_loop
+cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, cnt, ref1, ref_stride, ref2, \
+ ref3, ref4
+%define spill_src_stride 1
+%else
+cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, ref2, \
+ ref3, ref4
+%endif
+%endif
+%else ; normal sad
+%if AOM_ARCH_X86_64
+%if use_ref_offset
+cglobal sad%1x%2x4d, 5, 10, 8, src, src_stride, ref1, ref_stride, res, ref2, \
+ ref3, ref4, cnt, ref_offset
+%elif use_loop
+cglobal sad%1x%2x4d, 5, 9, 8, src, src_stride, ref1, ref_stride, res, ref2, \
+ ref3, ref4, cnt
+%else
+cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, res, ref2, \
+ ref3, ref4
+%endif
+%else
+%if use_ref_offset
+cglobal sad%1x%2x4d, 4, 7, 8, src, ref_offset, ref1, cnt, ref2, ref3, ref4
+ %define spill_src_stride 1
+ %define spill_ref_stride 1
+%elif use_loop
+cglobal sad%1x%2x4d, 4, 7, 8, src, cnt, ref1, ref_stride, ref2, ref3, ref4
+ %define spill_src_stride 1
+%else
+cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, ref2, ref3, \
+ ref4
+%endif
+%endif
+%endif
+
+%if spill_src_stride
+ %define src_strideq r1mp
+ %define src_strided r1mp
+%endif
+%if spill_ref_stride
+ %define ref_strideq r3mp
+ %define ref_strided r3mp
+%endif
+
+%if spill_cnt
+ SUB rsp, 4
+ %define cntd word [rsp]
+%endif
+
+%if %3 == 1
+ sal src_strided, 1
+ sal ref_strided, 1
+%endif
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+
+ mov ref2q, [ref1q+gprsize*1]
+ mov ref3q, [ref1q+gprsize*2]
+ mov ref4q, [ref1q+gprsize*3]
+ mov ref1q, [ref1q+gprsize*0]
+
+; Is the loop for this wxh in another function?
+; If so, we jump into that function for the loop and the return.
+%define external_loop (use_ref_offset && %1 > mmsize && %1 != %2)
+
+%if use_ref_offset
+ PROCESS_FIRST_MMSIZE
+%if %1 > mmsize
+ mov ref_offsetq, 0
+ mov cntd, %2 >> %3
+; Jump part way into the loop for the square version of this width
+%if %3 == 1
+ jmp mangle(private_prefix %+ _sad_skip_%1x%1x4d %+ SUFFIX).midloop
+%else
+ jmp mangle(private_prefix %+ _sad%1x%1x4d %+ SUFFIX).midloop
+%endif
+%else
+ mov ref_offsetq, ref_strideq
+ add srcq, src_strideq
+ mov cntd, (%2 >> %3) - 1
+%endif
+%if external_loop == 0
+.loop:
+; Unrolled horizontal loop
+%assign h_offset 0
+%rep %1/mmsize
+ PROCESS_16x1x4 h_offset
+%if h_offset == 0
+; The first row of the first column is done outside the loop and jumps here
+.midloop:
+%endif
+%assign h_offset h_offset+mmsize
+%endrep
+
+ add srcq, src_strideq
+ add ref_offsetq, ref_strideq
+ sub cntd, 1
+ jnz .loop
+%endif
+%else
+ PROCESS_%1x2x4 1
+ ADVANCE_END_OF_TWO_LINES
+%if use_loop
+ mov cntd, (%2/2 >> %3) - 1
+.loop:
+%endif
+ PROCESS_%1x2x4 0
+%if use_loop
+ ADVANCE_END_OF_TWO_LINES
+ sub cntd, 1
+ jnz .loop
+%endif
+%endif
+
+%if spill_cnt
+; Undo stack allocation for cnt
+ ADD rsp, 4
+%endif
+
+%if external_loop == 0
+%if %3 == 0
+ %define resultq r4
+ %define resultmp r4mp
+%endif
+
+; Undo modifications on parameters on the stack
+%if %3 == 1
+%if spill_src_stride
+ shr src_strided, 1
+%endif
+%if spill_ref_stride
+ shr ref_strided, 1
+%endif
+%endif
+
+%if %1 > 4
+ pslldq m5, 4
+ pslldq m7, 4
+ por m4, m5
+ por m6, m7
+ mova m5, m4
+ mova m7, m6
+ punpcklqdq m4, m6
+ punpckhqdq m5, m7
+ paddd m4, m5
+%if %3 == 1
+ pslld m4, 1
+%endif
+ movifnidn resultq, resultmp
+ movu [resultq], m4
+ RET
+%else
+ pshufd m6, m6, 0x08
+ pshufd m7, m7, 0x08
+%if %3 == 1
+ pslld m6, 1
+ pslld m7, 1
+%endif
+ movifnidn resultq, resultmp
+ movq [resultq+0], m6
+ movq [resultq+8], m7
+ RET
+%endif
+%endif ; external_loop == 0
+%endmacro
+
+INIT_XMM sse2
+SADNXN4D 128, 128
+SADNXN4D 128, 64
+SADNXN4D 64, 128
+SADNXN4D 64, 64
+SADNXN4D 64, 32
+SADNXN4D 32, 64
+SADNXN4D 32, 32
+SADNXN4D 32, 16
+SADNXN4D 16, 32
+SADNXN4D 16, 16
+SADNXN4D 16, 8
+SADNXN4D 8, 16
+SADNXN4D 8, 8
+SADNXN4D 8, 4
+SADNXN4D 4, 8
+SADNXN4D 4, 4
+%if CONFIG_REALTIME_ONLY==0
+SADNXN4D 4, 16
+SADNXN4D 16, 4
+SADNXN4D 8, 32
+SADNXN4D 32, 8
+SADNXN4D 16, 64
+SADNXN4D 64, 16
+%endif
+SADNXN4D 128, 128, 1
+SADNXN4D 128, 64, 1
+SADNXN4D 64, 128, 1
+SADNXN4D 64, 64, 1
+SADNXN4D 64, 32, 1
+SADNXN4D 32, 64, 1
+SADNXN4D 32, 32, 1
+SADNXN4D 32, 16, 1
+SADNXN4D 16, 32, 1
+SADNXN4D 16, 16, 1
+SADNXN4D 16, 8, 1
+SADNXN4D 8, 16, 1
+SADNXN4D 8, 8, 1
+SADNXN4D 4, 8, 1
+%if CONFIG_REALTIME_ONLY==0
+SADNXN4D 4, 16, 1
+SADNXN4D 8, 32, 1
+SADNXN4D 32, 8, 1
+SADNXN4D 16, 64, 1
+SADNXN4D 64, 16, 1
+%endif
+
+; Different assembly is needed when the height gets subsampled to 2
+; SADNXN4D 16, 4, 1
+; SADNXN4D 8, 4, 1
+; SADNXN4D 4, 4, 1
diff --git a/third_party/aom/aom_dsp/x86/sad_avx2.c b/third_party/aom/aom_dsp/x86/sad_avx2.c
new file mode 100644
index 0000000000..24cea76b37
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sad_avx2.c
@@ -0,0 +1,219 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+
+static INLINE unsigned int sad64xh_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ int i;
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
+ __m256i sum_sad = _mm256_setzero_si256();
+ __m256i sum_sad_h;
+ __m128i sum_sad128;
+ for (i = 0; i < h; i++) {
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));
+ sad1_reg =
+ _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
+ sad2_reg = _mm256_sad_epu8(
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));
+ sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
+ ref_ptr += ref_stride;
+ src_ptr += src_stride;
+ }
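+ // psadbw leaves one partial sum per 64-bit lane; fold the upper and lower
+ // qwords and then the two 128-bit halves to get the final 32-bit total.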
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8);
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
+ unsigned int res = (unsigned int)_mm_cvtsi128_si32(sum_sad128);
+ _mm256_zeroupper();
+ return res;
+}
+
+static INLINE unsigned int sad32xh_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ int h) {
+ int i;
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
+ __m256i sum_sad = _mm256_setzero_si256();
+ __m256i sum_sad_h;
+ __m128i sum_sad128;
+ int ref2_stride = ref_stride << 1;
+ int src2_stride = src_stride << 1;
+ int max = h >> 1;
+ for (i = 0; i < max; i++) {
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride));
+ sad1_reg =
+ _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
+ sad2_reg = _mm256_sad_epu8(
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));
+ sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
+ ref_ptr += ref2_stride;
+ src_ptr += src2_stride;
+ }
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8);
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
+ unsigned int res = (unsigned int)_mm_cvtsi128_si32(sum_sad128);
+ _mm256_zeroupper();
+ return res;
+}
+
+#define FSAD64_H(h) \
+ unsigned int aom_sad64x##h##_avx2(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ return sad64xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \
+ }
+
+#define FSADS64_H(h) \
+ unsigned int aom_sad_skip_64x##h##_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return 2 * sad64xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+ h / 2); \
+ }
+
+#define FSAD32_H(h) \
+ unsigned int aom_sad32x##h##_avx2(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *ref_ptr, int ref_stride) { \
+ return sad32xh_avx2(src_ptr, src_stride, ref_ptr, ref_stride, h); \
+ }
+
+#define FSADS32_H(h) \
+ unsigned int aom_sad_skip_32x##h##_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride) { \
+ return 2 * sad32xh_avx2(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2, \
+ h / 2); \
+ }
+
+#define FSAD64 \
+ FSAD64_H(64) \
+ FSAD64_H(32) \
+ FSADS64_H(64) \
+ FSADS64_H(32)
+
+#define FSAD32 \
+ FSAD32_H(64) \
+ FSAD32_H(32) \
+ FSAD32_H(16) \
+ FSADS32_H(64) \
+ FSADS32_H(32) \
+ FSADS32_H(16)
+
+/* clang-format off */
+FSAD64
+FSAD32
+/* clang-format on */
+
+#undef FSAD64
+#undef FSAD32
+#undef FSAD64_H
+#undef FSAD32_H
+
+#define FSADAVG64_H(h) \
+ unsigned int aom_sad64x##h##_avg_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ int i; \
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+ __m256i sum_sad = _mm256_setzero_si256(); \
+ __m256i sum_sad_h; \
+ __m128i sum_sad128; \
+ for (i = 0; i < h; i++) { \
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \
+ ref1_reg = _mm256_avg_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \
+ ref2_reg = _mm256_avg_epu8( \
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
+ sad1_reg = _mm256_sad_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
+ sad2_reg = _mm256_sad_epu8( \
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \
+ sum_sad = \
+ _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+ ref_ptr += ref_stride; \
+ src_ptr += src_stride; \
+ second_pred += 64; \
+ } \
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+ unsigned int res = (unsigned int)_mm_cvtsi128_si32(sum_sad128); \
+ _mm256_zeroupper(); \
+ return res; \
+ }
+
+#define FSADAVG32_H(h) \
+ unsigned int aom_sad32x##h##_avg_avx2( \
+ const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \
+ int ref_stride, const uint8_t *second_pred) { \
+ int i; \
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \
+ __m256i sum_sad = _mm256_setzero_si256(); \
+ __m256i sum_sad_h; \
+ __m128i sum_sad128; \
+ int ref2_stride = ref_stride << 1; \
+ int src2_stride = src_stride << 1; \
+ int max = h >> 1; \
+ for (i = 0; i < max; i++) { \
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \
+ ref1_reg = _mm256_avg_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred)); \
+ ref2_reg = _mm256_avg_epu8( \
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32))); \
+ sad1_reg = _mm256_sad_epu8( \
+ ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr)); \
+ sad2_reg = _mm256_sad_epu8( \
+ ref2_reg, \
+ _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \
+ sum_sad = \
+ _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \
+ ref_ptr += ref2_stride; \
+ src_ptr += src2_stride; \
+ second_pred += 64; \
+ } \
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8); \
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \
+ unsigned int res = (unsigned int)_mm_cvtsi128_si32(sum_sad128); \
+ _mm256_zeroupper(); \
+ return res; \
+ }
+
+#define FSADAVG64 \
+ FSADAVG64_H(64) \
+ FSADAVG64_H(32)
+
+#define FSADAVG32 \
+ FSADAVG32_H(64) \
+ FSADAVG32_H(32) \
+ FSADAVG32_H(16)
+
+/* clang-format off */
+FSADAVG64
+FSADAVG32
+/* clang-format on */
+
+#undef FSADAVG64
+#undef FSADAVG32
+#undef FSADAVG64_H
+#undef FSADAVG32_H
diff --git a/third_party/aom/aom_dsp/x86/sad_impl_avx2.c b/third_party/aom/aom_dsp/x86/sad_impl_avx2.c
new file mode 100644
index 0000000000..c5da6e9ab3
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sad_impl_avx2.c
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+static unsigned int sad32x32(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ __m256i s1, s2, r1, r2;
+ __m256i sum = _mm256_setzero_si256();
+ __m128i sum_i128;
+ int i;
+
+ for (i = 0; i < 16; ++i) {
+ r1 = _mm256_loadu_si256((__m256i const *)ref_ptr);
+ r2 = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride));
+ s1 = _mm256_sad_epu8(r1, _mm256_loadu_si256((__m256i const *)src_ptr));
+ s2 = _mm256_sad_epu8(
+ r2, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));
+ sum = _mm256_add_epi32(sum, _mm256_add_epi32(s1, s2));
+ ref_ptr += ref_stride << 1;
+ src_ptr += src_stride << 1;
+ }
+
+ sum = _mm256_add_epi32(sum, _mm256_srli_si256(sum, 8));
+ sum_i128 = _mm_add_epi32(_mm256_extracti128_si256(sum, 1),
+ _mm256_castsi256_si128(sum));
+ return (unsigned int)_mm_cvtsi128_si32(sum_i128);
+}
+
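+// The larger block sizes in this file are built by tiling: 64-wide sums add a
+// second 32-column call to sad32x32(), and the 64x64 / 128x64 / 128x128 sums
+// below are composed from those halves in turn.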
+static unsigned int sad64x32(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ unsigned int half_width = 32;
+ uint32_t sum = sad32x32(src_ptr, src_stride, ref_ptr, ref_stride);
+ src_ptr += half_width;
+ ref_ptr += half_width;
+ sum += sad32x32(src_ptr, src_stride, ref_ptr, ref_stride);
+ return sum;
+}
+
+static unsigned int sad64x64(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ uint32_t sum = sad64x32(src_ptr, src_stride, ref_ptr, ref_stride);
+ src_ptr += src_stride << 5;
+ ref_ptr += ref_stride << 5;
+ sum += sad64x32(src_ptr, src_stride, ref_ptr, ref_stride);
+ return sum;
+}
+
+unsigned int aom_sad128x64_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ unsigned int half_width = 64;
+ uint32_t sum = sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
+ src_ptr += half_width;
+ ref_ptr += half_width;
+ sum += sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
+ return sum;
+}
+
+unsigned int aom_sad64x128_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ uint32_t sum = sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
+ src_ptr += src_stride << 6;
+ ref_ptr += ref_stride << 6;
+ sum += sad64x64(src_ptr, src_stride, ref_ptr, ref_stride);
+ return sum;
+}
+
+unsigned int aom_sad128x128_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ uint32_t sum = aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride);
+ src_ptr += src_stride << 6;
+ ref_ptr += ref_stride << 6;
+ sum += aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride);
+ return sum;
+}
+
+unsigned int aom_sad_skip_128x64_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ const uint32_t half_width = 64;
+ uint32_t sum = sad64x32(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2);
+ src_ptr += half_width;
+ ref_ptr += half_width;
+ sum += sad64x32(src_ptr, src_stride * 2, ref_ptr, ref_stride * 2);
+ return 2 * sum;
+}
+
+unsigned int aom_sad_skip_64x128_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ const uint32_t sum =
+ sad64x64(src_ptr, 2 * src_stride, ref_ptr, 2 * ref_stride);
+ return 2 * sum;
+}
+
+unsigned int aom_sad_skip_128x128_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride) {
+ const uint32_t sum =
+ aom_sad128x64_avx2(src_ptr, 2 * src_stride, ref_ptr, 2 * ref_stride);
+ return 2 * sum;
+}
+
+static unsigned int sad_w64_avg_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const int h, const uint8_t *second_pred,
+ const int second_pred_stride) {
+ int i;
+ __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
+ __m256i sum_sad = _mm256_setzero_si256();
+ __m256i sum_sad_h;
+ __m128i sum_sad128;
+ for (i = 0; i < h; i++) {
+ ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
+ ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));
+ ref1_reg = _mm256_avg_epu8(
+ ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));
+ ref2_reg = _mm256_avg_epu8(
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32)));
+ sad1_reg =
+ _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
+ sad2_reg = _mm256_sad_epu8(
+ ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));
+ sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
+ ref_ptr += ref_stride;
+ src_ptr += src_stride;
+ second_pred += second_pred_stride;
+ }
+ sum_sad_h = _mm256_srli_si256(sum_sad, 8);
+ sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
+ sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
+ sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
+ return (unsigned int)_mm_cvtsi128_si32(sum_sad128);
+}
+
+unsigned int aom_sad64x128_avg_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred) {
+ uint32_t sum = sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64,
+ second_pred, 64);
+ src_ptr += src_stride << 6;
+ ref_ptr += ref_stride << 6;
+ second_pred += 64 << 6;
+ sum += sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64,
+ second_pred, 64);
+ return sum;
+}
+
+unsigned int aom_sad128x64_avg_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred) {
+ unsigned int half_width = 64;
+ uint32_t sum = sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64,
+ second_pred, 128);
+ src_ptr += half_width;
+ ref_ptr += half_width;
+ second_pred += half_width;
+ sum += sad_w64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride, 64,
+ second_pred, 128);
+ return sum;
+}
+
+unsigned int aom_sad128x128_avg_avx2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ const uint8_t *second_pred) {
+ uint32_t sum = aom_sad128x64_avg_avx2(src_ptr, src_stride, ref_ptr,
+ ref_stride, second_pred);
+ src_ptr += src_stride << 6;
+ ref_ptr += ref_stride << 6;
+ second_pred += 128 << 6;
+ sum += aom_sad128x64_avg_avx2(src_ptr, src_stride, ref_ptr, ref_stride,
+ second_pred);
+ return sum;
+}
diff --git a/third_party/aom/aom_dsp/x86/sad_sse2.asm b/third_party/aom/aom_dsp/x86/sad_sse2.asm
new file mode 100644
index 0000000000..dbe8ca3161
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sad_sse2.asm
@@ -0,0 +1,432 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; Macro Arguments
+; Arg 1: Width
+; Arg 2: Height
+; Arg 3: Number of general purpose registers: 5, or 7 when the stride*3
+;        helper registers are needed
+; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
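+; For example, "SAD64XN 32, 1" below invokes "SAD_FN 64, 32, 5, 1", which
+; emits the prologue for aom_sad64x32_avg_sse2.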
+%macro SAD_FN 4
+%if %4 == 0 ; normal sad
+%if %3 == 5
+cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+
+%elif %4 == 2 ; skip
+%if %3 == 5
+cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
+%else ; %3 == 7
+cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
+ src_stride3, ref_stride3, n_rows
+%endif ; %3 == 5/7
+
+%else
+%if %3 == 5
+cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
+ second_pred, n_rows
+%else ; %3 == 7
+cglobal sad%1x%2_avg, 5, AOM_ARCH_X86_64 + %3, 6, src, src_stride, \
+ ref, ref_stride, \
+ second_pred, \
+ src_stride3, ref_stride3
+%if AOM_ARCH_X86_64
+%define n_rowsd r7d
+%else ; x86-32
+%define n_rowsd dword r0m
+%endif ; x86-32/64
+%endif ; %3 == 5/7
+%endif ; sad/avg/skip
+%if %4 == 2 ; skip rows so double the stride
+lea src_strided, [src_strided*2]
+lea ref_strided, [ref_strided*2]
+%endif ; %4 skip
+ movsxdifnidn src_strideq, src_strided
+ movsxdifnidn ref_strideq, ref_strided
+%if %3 == 7
+ lea src_stride3q, [src_strideq*3]
+ lea ref_stride3q, [ref_strideq*3]
+%endif ; %3 == 7
+%endmacro
+
+; unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD128XN 1-2 0
+ SAD_FN 128, %1, 5, %2
+%if %2 == 2
+ mov n_rowsd, %1/2
+%else
+ mov n_rowsd, %1
+%endif
+ pxor m0, m0
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+%endif
+ psadbw m1, [srcq]
+ psadbw m2, [srcq+16]
+ psadbw m3, [srcq+32]
+ psadbw m4, [srcq+48]
+
+ paddd m1, m2
+ paddd m3, m4
+ paddd m0, m1
+ paddd m0, m3
+
+ movu m1, [refq+64]
+ movu m2, [refq+80]
+ movu m3, [refq+96]
+ movu m4, [refq+112]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*4]
+ pavgb m2, [second_predq+mmsize*5]
+ pavgb m3, [second_predq+mmsize*6]
+ pavgb m4, [second_predq+mmsize*7]
+ lea second_predq, [second_predq+mmsize*8]
+%endif
+ psadbw m1, [srcq+64]
+ psadbw m2, [srcq+80]
+ psadbw m3, [srcq+96]
+ psadbw m4, [srcq+112]
+
+ add refq, ref_strideq
+ add srcq, src_strideq
+
+ paddd m1, m2
+ paddd m3, m4
+ paddd m0, m1
+ paddd m0, m3
+
+ sub n_rowsd, 1
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD128XN 128 ; sad128x128_sse2
+SAD128XN 128, 1 ; sad128x128_avg_sse2
+SAD128XN 128, 2 ; sad128x128_skip_sse2
+SAD128XN 64 ; sad128x64_sse2
+SAD128XN 64, 1 ; sad128x64_avg_sse2
+SAD128XN 64, 2 ; sad128x64_skip_sse2
+
+
+; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD64XN 1-2 0
+ SAD_FN 64, %1, 5, %2
+%if %2 == 2
+ mov n_rowsd, %1/2
+%else
+ mov n_rowsd, %1
+%endif
+ pxor m0, m0
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+32]
+ movu m4, [refq+48]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ psadbw m1, [srcq]
+ psadbw m2, [srcq+16]
+ psadbw m3, [srcq+32]
+ psadbw m4, [srcq+48]
+ paddd m1, m2
+ paddd m3, m4
+ add refq, ref_strideq
+ paddd m0, m1
+ add srcq, src_strideq
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD64XN 128 ; sad64x128_sse2
+SAD64XN 64 ; sad64x64_sse2
+SAD64XN 32 ; sad64x32_sse2
+SAD64XN 16 ; sad64x16_sse2
+SAD64XN 128, 1 ; sad64x128_avg_sse2
+SAD64XN 64, 1 ; sad64x64_avg_sse2
+SAD64XN 32, 1 ; sad64x32_avg_sse2
+SAD64XN 16, 1 ; sad64x16_avg_sse2
+SAD64XN 128, 2 ; sad64x128_skip_sse2
+SAD64XN 64, 2 ; sad64x64_skip_sse2
+SAD64XN 32, 2 ; sad64x32_skip_sse2
+SAD64XN 16, 2 ; sad64x16_skip_sse2
+
+; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD32XN 1-2 0
+ SAD_FN 32, %1, 5, %2
+%if %2 == 2
+ mov n_rowsd, %1/4
+%else
+ mov n_rowsd, %1/2
+%endif
+ pxor m0, m0
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+16]
+ movu m3, [refq+ref_strideq]
+ movu m4, [refq+ref_strideq+16]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ psadbw m1, [srcq]
+ psadbw m2, [srcq+16]
+ psadbw m3, [srcq+src_strideq]
+ psadbw m4, [srcq+src_strideq+16]
+ paddd m1, m2
+ paddd m3, m4
+ lea refq, [refq+ref_strideq*2]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*2]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD32XN 64 ; sad32x64_sse2
+SAD32XN 32 ; sad32x32_sse2
+SAD32XN 16 ; sad32x16_sse2
+SAD32XN 8 ; sad_32x8_sse2
+SAD32XN 64, 1 ; sad32x64_avg_sse2
+SAD32XN 32, 1 ; sad32x32_avg_sse2
+SAD32XN 16, 1 ; sad32x16_avg_sse2
+SAD32XN 8, 1 ; sad_32x8_avg_sse2
+SAD32XN 64, 2 ; sad32x64_skip_sse2
+SAD32XN 32, 2 ; sad32x32_skip_sse2
+SAD32XN 16, 2 ; sad32x16_skip_sse2
+SAD32XN 8, 2 ; sad_32x8_skip_sse2
+
+; unsigned int aom_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD16XN 1-2 0
+ SAD_FN 16, %1, 7, %2
+%if %2 == 2
+ mov n_rowsd, %1/8
+%else
+ mov n_rowsd, %1/4
+%endif
+ pxor m0, m0
+
+.loop:
+ movu m1, [refq]
+ movu m2, [refq+ref_strideq]
+ movu m3, [refq+ref_strideq*2]
+ movu m4, [refq+ref_stride3q]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ pavgb m3, [second_predq+mmsize*2]
+ pavgb m4, [second_predq+mmsize*3]
+ lea second_predq, [second_predq+mmsize*4]
+%endif
+ psadbw m1, [srcq]
+ psadbw m2, [srcq+src_strideq]
+ psadbw m3, [srcq+src_strideq*2]
+ psadbw m4, [srcq+src_stride3q]
+ paddd m1, m2
+ paddd m3, m4
+ lea refq, [refq+ref_strideq*4]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ paddd m0, m3
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD16XN 64 ; sad_16x64_sse2
+SAD16XN 32 ; sad16x32_sse2
+SAD16XN 16 ; sad16x16_sse2
+SAD16XN 8 ; sad16x8_sse2
+SAD16XN 4 ; sad_16x4_sse2
+SAD16XN 64, 1 ; sad_16x64_avg_sse2
+SAD16XN 32, 1 ; sad16x32_avg_sse2
+SAD16XN 16, 1 ; sad16x16_avg_sse2
+SAD16XN 8, 1 ; sad16x8_avg_sse2
+SAD16XN 4, 1 ; sad_16x4_avg_sse2
+SAD16XN 64, 2 ; sad_16x64_skip_sse2
+SAD16XN 32, 2 ; sad16x32_skip_sse2
+SAD16XN 16, 2 ; sad16x16_skip_sse2
+SAD16XN 8, 2 ; sad16x8_skip_sse2
+
+; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD8XN 1-2 0
+ SAD_FN 8, %1, 7, %2
+%if %2 == 2
+ mov n_rowsd, %1/8
+%else
+ mov n_rowsd, %1/4
+%endif
+ pxor m0, m0
+
+.loop:
+ movh m1, [refq]
+ movhps m1, [refq+ref_strideq]
+ movh m2, [refq+ref_strideq*2]
+ movhps m2, [refq+ref_stride3q]
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ pavgb m2, [second_predq+mmsize*1]
+ lea second_predq, [second_predq+mmsize*2]
+%endif
+ movh m3, [srcq]
+ movhps m3, [srcq+src_strideq]
+ movh m4, [srcq+src_strideq*2]
+ movhps m4, [srcq+src_stride3q]
+ psadbw m1, m3
+ psadbw m2, m4
+ lea refq, [refq+ref_strideq*4]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ paddd m0, m2
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD8XN 32 ; sad_8x32_sse2
+SAD8XN 16 ; sad8x16_sse2
+SAD8XN 8 ; sad8x8_sse2
+SAD8XN 4 ; sad8x4_sse2
+SAD8XN 32, 1 ; sad_8x32_avg_sse2
+SAD8XN 16, 1 ; sad8x16_avg_sse2
+SAD8XN 8, 1 ; sad8x8_avg_sse2
+SAD8XN 4, 1 ; sad8x4_avg_sse2
+SAD8XN 32, 2 ; sad_8x32_skip_sse2
+SAD8XN 16, 2 ; sad8x16_skip_sse2
+SAD8XN 8, 2 ; sad8x8_skip_sse2
+
+; unsigned int aom_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
+; uint8_t *ref, int ref_stride);
+%macro SAD4XN 1-2 0
+ SAD_FN 4, %1, 7, %2
+%if %2 == 2
+ mov n_rowsd, %1/8
+%else
+ mov n_rowsd, %1/4
+%endif
+ pxor m0, m0
+
+.loop:
+ movd m1, [refq]
+ movd m2, [refq+ref_strideq]
+ movd m3, [refq+ref_strideq*2]
+ movd m4, [refq+ref_stride3q]
+ punpckldq m1, m2
+ punpckldq m3, m4
+ movlhps m1, m3
+%if %2 == 1
+ pavgb m1, [second_predq+mmsize*0]
+ lea second_predq, [second_predq+mmsize*1]
+%endif
+ movd m2, [srcq]
+ movd m5, [srcq+src_strideq]
+ movd m4, [srcq+src_strideq*2]
+ movd m3, [srcq+src_stride3q]
+ punpckldq m2, m5
+ punpckldq m4, m3
+ movlhps m2, m4
+ psadbw m1, m2
+ lea refq, [refq+ref_strideq*4]
+ paddd m0, m1
+ lea srcq, [srcq+src_strideq*4]
+ dec n_rowsd
+ jg .loop
+
+ movhlps m1, m0
+ paddd m0, m1
+%if %2 == 2 ; we skipped rows, so now we need to double the sad
+ pslld m0, 1
+%endif
+ movd eax, m0
+ RET
+%endmacro
+
+INIT_XMM sse2
+SAD4XN 16 ; sad4x16_sse2
+SAD4XN 8 ; sad4x8_sse2
+SAD4XN 4 ; sad4x4_sse2
+SAD4XN 16, 1 ; sad4x16_avg_sse2
+SAD4XN 8, 1 ; sad4x8_avg_sse2
+SAD4XN 4, 1 ; sad4x4_avg_sse2
+SAD4XN 16, 2 ; sad4x16_skip_sse2
+SAD4XN 8, 2 ; sad4x8_skip_sse2
diff --git a/third_party/aom/aom_dsp/x86/sse_avx2.c b/third_party/aom/aom_dsp/x86/sse_avx2.c
new file mode 100644
index 0000000000..c5a5f5c234
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sse_avx2.c
@@ -0,0 +1,389 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+
+static INLINE void sse_w32_avx2(__m256i *sum, const uint8_t *a,
+ const uint8_t *b) {
+ const __m256i v_a0 = yy_loadu_256(a);
+ const __m256i v_b0 = yy_loadu_256(b);
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i v_a00_w = _mm256_unpacklo_epi8(v_a0, zero);
+ const __m256i v_a01_w = _mm256_unpackhi_epi8(v_a0, zero);
+ const __m256i v_b00_w = _mm256_unpacklo_epi8(v_b0, zero);
+ const __m256i v_b01_w = _mm256_unpackhi_epi8(v_b0, zero);
+ const __m256i v_d00_w = _mm256_sub_epi16(v_a00_w, v_b00_w);
+ const __m256i v_d01_w = _mm256_sub_epi16(v_a01_w, v_b01_w);
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d00_w, v_d00_w));
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d01_w, v_d01_w));
+}
+
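+// Horizontal reduction: zero-extend the eight 32-bit partial sums to 64 bits,
+// fold 256 -> 128 -> 64 bits, and return the total as an int64_t.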
+static INLINE int64_t summary_all_avx2(const __m256i *sum_all) {
+ int64_t sum;
+ __m256i zero = _mm256_setzero_si256();
+ const __m256i sum0_4x64 = _mm256_unpacklo_epi32(*sum_all, zero);
+ const __m256i sum1_4x64 = _mm256_unpackhi_epi32(*sum_all, zero);
+ const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64);
+ const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
+ _mm256_extracti128_si256(sum_4x64, 1));
+ const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+ xx_storel_64(&sum, sum_1x64);
+ return sum;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void summary_32_avx2(const __m256i *sum32, __m256i *sum) {
+ const __m256i sum0_4x64 =
+ _mm256_cvtepu32_epi64(_mm256_castsi256_si128(*sum32));
+ const __m256i sum1_4x64 =
+ _mm256_cvtepu32_epi64(_mm256_extracti128_si256(*sum32, 1));
+ const __m256i sum_4x64 = _mm256_add_epi64(sum0_4x64, sum1_4x64);
+ *sum = _mm256_add_epi64(*sum, sum_4x64);
+}
+
+static INLINE int64_t summary_4x64_avx2(const __m256i sum_4x64) {
+ int64_t sum;
+ const __m128i sum_2x64 = _mm_add_epi64(_mm256_castsi256_si128(sum_4x64),
+ _mm256_extracti128_si256(sum_4x64, 1));
+ const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+
+ xx_storel_64(&sum, sum_1x64);
+ return sum;
+}
+#endif
+
+static INLINE void sse_w4x4_avx2(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, __m256i *sum) {
+ const __m128i v_a0 = xx_loadl_32(a);
+ const __m128i v_a1 = xx_loadl_32(a + a_stride);
+ const __m128i v_a2 = xx_loadl_32(a + a_stride * 2);
+ const __m128i v_a3 = xx_loadl_32(a + a_stride * 3);
+ const __m128i v_b0 = xx_loadl_32(b);
+ const __m128i v_b1 = xx_loadl_32(b + b_stride);
+ const __m128i v_b2 = xx_loadl_32(b + b_stride * 2);
+ const __m128i v_b3 = xx_loadl_32(b + b_stride * 3);
+ const __m128i v_a0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_a0, v_a1),
+ _mm_unpacklo_epi32(v_a2, v_a3));
+ const __m128i v_b0123 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(v_b0, v_b1),
+ _mm_unpacklo_epi32(v_b2, v_b3));
+ const __m256i v_a_w = _mm256_cvtepu8_epi16(v_a0123);
+ const __m256i v_b_w = _mm256_cvtepu8_epi16(v_b0123);
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+static INLINE void sse_w8x2_avx2(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, __m256i *sum) {
+ const __m128i v_a0 = xx_loadl_64(a);
+ const __m128i v_a1 = xx_loadl_64(a + a_stride);
+ const __m128i v_b0 = xx_loadl_64(b);
+ const __m128i v_b1 = xx_loadl_64(b + b_stride);
+ const __m256i v_a_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_a0, v_a1));
+ const __m256i v_b_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(v_b0, v_b1));
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+int64_t aom_sse_avx2(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
+ int32_t y = 0;
+ int64_t sse = 0;
+ __m256i sum = _mm256_setzero_si256();
+ __m256i zero = _mm256_setzero_si256();
+ switch (width) {
+ case 4:
+ do {
+ sse_w4x4_avx2(a, a_stride, b, b_stride, &sum);
+ a += a_stride << 2;
+ b += b_stride << 2;
+ y += 4;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 8:
+ do {
+ sse_w8x2_avx2(a, a_stride, b, b_stride, &sum);
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 16:
+ do {
+ const __m128i v_a0 = xx_loadu_128(a);
+ const __m128i v_a1 = xx_loadu_128(a + a_stride);
+ const __m128i v_b0 = xx_loadu_128(b);
+ const __m128i v_b1 = xx_loadu_128(b + b_stride);
+ const __m256i v_a =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(v_a0), v_a1, 0x01);
+ const __m256i v_b =
+ _mm256_insertf128_si256(_mm256_castsi128_si256(v_b0), v_b1, 0x01);
+ const __m256i v_al = _mm256_unpacklo_epi8(v_a, zero);
+ const __m256i v_au = _mm256_unpackhi_epi8(v_a, zero);
+ const __m256i v_bl = _mm256_unpacklo_epi8(v_b, zero);
+ const __m256i v_bu = _mm256_unpackhi_epi8(v_b, zero);
+ const __m256i v_asub = _mm256_sub_epi16(v_al, v_bl);
+ const __m256i v_bsub = _mm256_sub_epi16(v_au, v_bu);
+ const __m256i temp =
+ _mm256_add_epi32(_mm256_madd_epi16(v_asub, v_asub),
+ _mm256_madd_epi16(v_bsub, v_bsub));
+ sum = _mm256_add_epi32(sum, temp);
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 32:
+ do {
+ sse_w32_avx2(&sum, a, b);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 64:
+ do {
+ sse_w32_avx2(&sum, a, b);
+ sse_w32_avx2(&sum, a + 32, b + 32);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 128:
+ do {
+ sse_w32_avx2(&sum, a, b);
+ sse_w32_avx2(&sum, a + 32, b + 32);
+ sse_w32_avx2(&sum, a + 64, b + 64);
+ sse_w32_avx2(&sum, a + 96, b + 96);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ default:
+ if ((width & 0x07) == 0) {
+ do {
+ int i = 0;
+ do {
+ sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum);
+ i += 8;
+ } while (i < width);
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ } else {
+ do {
+ int i = 0;
+ do {
+ sse_w8x2_avx2(a + i, a_stride, b + i, b_stride, &sum);
+ const uint8_t *a2 = a + i + (a_stride << 1);
+ const uint8_t *b2 = b + i + (b_stride << 1);
+ sse_w8x2_avx2(a2, a_stride, b2, b_stride, &sum);
+ i += 8;
+ } while (i + 4 < width);
+ sse_w4x4_avx2(a + i, a_stride, b + i, b_stride, &sum);
+ a += a_stride << 2;
+ b += b_stride << 2;
+ y += 4;
+ } while (y < height);
+ }
+ sse = summary_all_avx2(&sum);
+ break;
+ }
+
+ return sse;
+}
+
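+/* Reference-only sketch (not part of the upstream file): aom_sse_avx2() above
+ * returns the same value as this plain-C loop, the sum of squared byte
+ * differences over a width x height block. The helper name sse_ref_c is
+ * hypothetical and exists purely for illustration. */
+static INLINE int64_t sse_ref_c(const uint8_t *a, int a_stride,
+                                const uint8_t *b, int b_stride, int width,
+                                int height) {
+  int64_t sse = 0;
+  for (int y = 0; y < height; ++y) {
+    for (int x = 0; x < width; ++x) {
+      const int diff = a[y * a_stride + x] - b[y * b_stride + x];
+      sse += (int64_t)diff * diff;  // accumulate (a - b)^2 in 64 bits
+    }
+  }
+  return sse;
+}
+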
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void highbd_sse_w16_avx2(__m256i *sum, const uint16_t *a,
+ const uint16_t *b) {
+ const __m256i v_a_w = yy_loadu_256(a);
+ const __m256i v_b_w = yy_loadu_256(b);
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+static INLINE void highbd_sse_w4x4_avx2(__m256i *sum, const uint16_t *a,
+ int a_stride, const uint16_t *b,
+ int b_stride) {
+ const __m128i v_a0 = xx_loadl_64(a);
+ const __m128i v_a1 = xx_loadl_64(a + a_stride);
+ const __m128i v_a2 = xx_loadl_64(a + a_stride * 2);
+ const __m128i v_a3 = xx_loadl_64(a + a_stride * 3);
+ const __m128i v_b0 = xx_loadl_64(b);
+ const __m128i v_b1 = xx_loadl_64(b + b_stride);
+ const __m128i v_b2 = xx_loadl_64(b + b_stride * 2);
+ const __m128i v_b3 = xx_loadl_64(b + b_stride * 3);
+ const __m256i v_a_w = yy_set_m128i(_mm_unpacklo_epi64(v_a0, v_a1),
+ _mm_unpacklo_epi64(v_a2, v_a3));
+ const __m256i v_b_w = yy_set_m128i(_mm_unpacklo_epi64(v_b0, v_b1),
+ _mm_unpacklo_epi64(v_b2, v_b3));
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
+static INLINE void highbd_sse_w8x2_avx2(__m256i *sum, const uint16_t *a,
+ int a_stride, const uint16_t *b,
+ int b_stride) {
+ const __m256i v_a_w = yy_loadu2_128(a + a_stride, a);
+ const __m256i v_b_w = yy_loadu2_128(b + b_stride, b);
+ const __m256i v_d_w = _mm256_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm256_add_epi32(*sum, _mm256_madd_epi16(v_d_w, v_d_w));
+}
+
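+// With high-bitdepth input the per-pixel differences can be up to 12 bits, so
+// the squared terms accumulated by _mm256_madd_epi16 would overflow a 32-bit
+// lane on tall blocks. The inner "l < N && l < (height - y)" loops below
+// therefore drain the 32-bit partial sums into 64-bit accumulators every N
+// rows via summary_32_avx2().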
+int64_t aom_highbd_sse_avx2(const uint8_t *a8, int a_stride, const uint8_t *b8,
+ int b_stride, int width, int height) {
+ int32_t y = 0;
+ int64_t sse = 0;
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ __m256i sum = _mm256_setzero_si256();
+ switch (width) {
+ case 4:
+ do {
+ highbd_sse_w4x4_avx2(&sum, a, a_stride, b, b_stride);
+ a += a_stride << 2;
+ b += b_stride << 2;
+ y += 4;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 8:
+ do {
+ highbd_sse_w8x2_avx2(&sum, a, a_stride, b, b_stride);
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 16:
+ do {
+ highbd_sse_w16_avx2(&sum, a, b);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_avx2(&sum);
+ break;
+ case 32:
+ do {
+ int l = 0;
+ __m256i sum32 = _mm256_setzero_si256();
+ do {
+ highbd_sse_w16_avx2(&sum32, a, b);
+ highbd_sse_w16_avx2(&sum32, a + 16, b + 16);
+ a += a_stride;
+ b += b_stride;
+ l += 1;
+ } while (l < 64 && l < (height - y));
+ summary_32_avx2(&sum32, &sum);
+ y += 64;
+ } while (y < height);
+ sse = summary_4x64_avx2(sum);
+ break;
+ case 64:
+ do {
+ int l = 0;
+ __m256i sum32 = _mm256_setzero_si256();
+ do {
+ highbd_sse_w16_avx2(&sum32, a, b);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3);
+ a += a_stride;
+ b += b_stride;
+ l += 1;
+ } while (l < 32 && l < (height - y));
+ summary_32_avx2(&sum32, &sum);
+ y += 32;
+ } while (y < height);
+ sse = summary_4x64_avx2(sum);
+ break;
+ case 128:
+ do {
+ int l = 0;
+ __m256i sum32 = _mm256_setzero_si256();
+ do {
+ highbd_sse_w16_avx2(&sum32, a, b);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 1, b + 16 * 1);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 2, b + 16 * 2);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 3, b + 16 * 3);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 4, b + 16 * 4);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 5, b + 16 * 5);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 6, b + 16 * 6);
+ highbd_sse_w16_avx2(&sum32, a + 16 * 7, b + 16 * 7);
+ a += a_stride;
+ b += b_stride;
+ l += 1;
+ } while (l < 16 && l < (height - y));
+ summary_32_avx2(&sum32, &sum);
+ y += 16;
+ } while (y < height);
+ sse = summary_4x64_avx2(sum);
+ break;
+ default:
+ if (width & 0x7) {
+ do {
+ int i = 0;
+ __m256i sum32 = _mm256_setzero_si256();
+ do {
+ highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride);
+ const uint16_t *a2 = a + i + (a_stride << 1);
+ const uint16_t *b2 = b + i + (b_stride << 1);
+ highbd_sse_w8x2_avx2(&sum32, a2, a_stride, b2, b_stride);
+ i += 8;
+ } while (i + 4 < width);
+ highbd_sse_w4x4_avx2(&sum32, a + i, a_stride, b + i, b_stride);
+ summary_32_avx2(&sum32, &sum);
+ a += a_stride << 2;
+ b += b_stride << 2;
+ y += 4;
+ } while (y < height);
+ } else {
+ do {
+ int l = 0;
+ __m256i sum32 = _mm256_setzero_si256();
+ do {
+ int i = 0;
+ do {
+ highbd_sse_w8x2_avx2(&sum32, a + i, a_stride, b + i, b_stride);
+ i += 8;
+ } while (i < width);
+ a += a_stride << 1;
+ b += b_stride << 1;
+ l += 2;
+ } while (l < 8 && l < (height - y));
+ summary_32_avx2(&sum32, &sum);
+ y += 8;
+ } while (y < height);
+ }
+ sse = summary_4x64_avx2(sum);
+ break;
+ }
+ return sse;
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/sse_sse4.c b/third_party/aom/aom_dsp/x86/sse_sse4.c
new file mode 100644
index 0000000000..7e74554d75
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sse_sse4.c
@@ -0,0 +1,355 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+static INLINE int64_t summary_all_sse4(const __m128i *sum_all) {
+ int64_t sum;
+ const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all);
+ const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8));
+ const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1);
+ const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
+ xx_storel_64(&sum, sum_1x64);
+ return sum;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void summary_32_sse4(const __m128i *sum32, __m128i *sum64) {
+ const __m128i sum0 = _mm_cvtepu32_epi64(*sum32);
+ const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum32, 8));
+ *sum64 = _mm_add_epi64(sum0, *sum64);
+ *sum64 = _mm_add_epi64(sum1, *sum64);
+}
+#endif
+
+static INLINE void sse_w16_sse4_1(__m128i *sum, const uint8_t *a,
+ const uint8_t *b) {
+ const __m128i v_a0 = xx_loadu_128(a);
+ const __m128i v_b0 = xx_loadu_128(b);
+ const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0);
+ const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8));
+ const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0);
+ const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8));
+ const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w);
+ const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w);
+ *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w));
+ *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w));
+}
+
+static INLINE void sse4x2_sse4_1(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride, __m128i *sum) {
+ const __m128i v_a0 = xx_loadl_32(a);
+ const __m128i v_a1 = xx_loadl_32(a + a_stride);
+ const __m128i v_b0 = xx_loadl_32(b);
+ const __m128i v_b1 = xx_loadl_32(b + b_stride);
+ const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1));
+ const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1));
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
+}
+
+static INLINE void sse8_sse4_1(const uint8_t *a, const uint8_t *b,
+ __m128i *sum) {
+ const __m128i v_a0 = xx_loadl_64(a);
+ const __m128i v_b0 = xx_loadl_64(b);
+ const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0);
+ const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0);
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
+}
+
+int64_t aom_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int width, int height) {
+ int y = 0;
+ int64_t sse = 0;
+ __m128i sum = _mm_setzero_si128();
+ switch (width) {
+ case 4:
+ do {
+ sse4x2_sse4_1(a, a_stride, b, b_stride, &sum);
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 8:
+ do {
+ sse8_sse4_1(a, b, &sum);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 16:
+ do {
+ sse_w16_sse4_1(&sum, a, b);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 32:
+ do {
+ sse_w16_sse4_1(&sum, a, b);
+ sse_w16_sse4_1(&sum, a + 16, b + 16);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 64:
+ do {
+ sse_w16_sse4_1(&sum, a, b);
+ sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1);
+ sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2);
+ sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 128:
+ do {
+ sse_w16_sse4_1(&sum, a, b);
+ sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1);
+ sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2);
+ sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3);
+ sse_w16_sse4_1(&sum, a + 16 * 4, b + 16 * 4);
+ sse_w16_sse4_1(&sum, a + 16 * 5, b + 16 * 5);
+ sse_w16_sse4_1(&sum, a + 16 * 6, b + 16 * 6);
+ sse_w16_sse4_1(&sum, a + 16 * 7, b + 16 * 7);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ default:
+ if (width & 0x07) {
+ do {
+ int i = 0;
+ do {
+ sse8_sse4_1(a + i, b + i, &sum);
+ sse8_sse4_1(a + i + a_stride, b + i + b_stride, &sum);
+ i += 8;
+ } while (i + 4 < width);
+ sse4x2_sse4_1(a + i, a_stride, b + i, b_stride, &sum);
+ a += (a_stride << 1);
+ b += (b_stride << 1);
+ y += 2;
+ } while (y < height);
+ } else {
+ do {
+ int i = 0;
+ do {
+ sse8_sse4_1(a + i, b + i, &sum);
+ i += 8;
+ } while (i < width);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ }
+ sse = summary_all_sse4(&sum);
+ break;
+ }
+
+ return sse;
+}
+
+#if CONFIG_AV1_HIGHBITDEPTH
+static INLINE void highbd_sse_w4x2_sse4_1(__m128i *sum, const uint16_t *a,
+ int a_stride, const uint16_t *b,
+ int b_stride) {
+ const __m128i v_a0 = xx_loadl_64(a);
+ const __m128i v_a1 = xx_loadl_64(a + a_stride);
+ const __m128i v_b0 = xx_loadl_64(b);
+ const __m128i v_b1 = xx_loadl_64(b + b_stride);
+ const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1);
+ const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1);
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
+}
+
+static INLINE void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a,
+ const uint16_t *b) {
+ const __m128i v_a_w = xx_loadu_128(a);
+ const __m128i v_b_w = xx_loadu_128(b);
+ const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+ *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w));
+}
+
+int64_t aom_highbd_sse_sse4_1(const uint8_t *a8, int a_stride,
+ const uint8_t *b8, int b_stride, int width,
+ int height) {
+ int32_t y = 0;
+ int64_t sse = 0;
+ uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+ uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+ __m128i sum = _mm_setzero_si128();
+ switch (width) {
+ case 4:
+ do {
+ highbd_sse_w4x2_sse4_1(&sum, a, a_stride, b, b_stride);
+ a += a_stride << 1;
+ b += b_stride << 1;
+ y += 2;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 8:
+ do {
+ highbd_sse_w8_sse4_1(&sum, a, b);
+ a += a_stride;
+ b += b_stride;
+ y += 1;
+ } while (y < height);
+ sse = summary_all_sse4(&sum);
+ break;
+ case 16:
+ do {
+ int l = 0;
+ __m128i sum32 = _mm_setzero_si128();
+ do {
+ highbd_sse_w8_sse4_1(&sum32, a, b);
+ highbd_sse_w8_sse4_1(&sum32, a + 8, b + 8);
+ a += a_stride;
+ b += b_stride;
+ l += 1;
+ } while (l < 64 && l < (height - y));
+ summary_32_sse4(&sum32, &sum);
+ y += 64;
+ } while (y < height);
+ xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
+ break;
+ case 32:
+ do {
+ int l = 0;
+ __m128i sum32 = _mm_setzero_si128();
+ do {
+ highbd_sse_w8_sse4_1(&sum32, a, b);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3);
+ a += a_stride;
+ b += b_stride;
+ l += 1;
+ } while (l < 32 && l < (height - y));
+ summary_32_sse4(&sum32, &sum);
+ y += 32;
+ } while (y < height);
+ xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
+ break;
+ case 64:
+ do {
+ int l = 0;
+ __m128i sum32 = _mm_setzero_si128();
+ do {
+ highbd_sse_w8_sse4_1(&sum32, a, b);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7);
+ a += a_stride;
+ b += b_stride;
+ l += 1;
+ } while (l < 16 && l < (height - y));
+ summary_32_sse4(&sum32, &sum);
+ y += 16;
+ } while (y < height);
+ xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
+ break;
+ case 128:
+ do {
+ int l = 0;
+ __m128i sum32 = _mm_setzero_si128();
+ do {
+ highbd_sse_w8_sse4_1(&sum32, a, b);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 8, b + 8 * 8);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 9, b + 8 * 9);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 10, b + 8 * 10);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 11, b + 8 * 11);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 12, b + 8 * 12);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 13, b + 8 * 13);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 14, b + 8 * 14);
+ highbd_sse_w8_sse4_1(&sum32, a + 8 * 15, b + 8 * 15);
+ a += a_stride;
+ b += b_stride;
+ l += 1;
+ } while (l < 8 && l < (height - y));
+ summary_32_sse4(&sum32, &sum);
+ y += 8;
+ } while (y < height);
+ xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
+ break;
+ default:
+ if (width & 0x7) {
+ do {
+ __m128i sum32 = _mm_setzero_si128();
+ int i = 0;
+ do {
+ highbd_sse_w8_sse4_1(&sum32, a + i, b + i);
+ highbd_sse_w8_sse4_1(&sum32, a + i + a_stride, b + i + b_stride);
+ i += 8;
+ } while (i + 4 < width);
+ highbd_sse_w4x2_sse4_1(&sum32, a + i, a_stride, b + i, b_stride);
+ a += (a_stride << 1);
+ b += (b_stride << 1);
+ y += 2;
+ summary_32_sse4(&sum32, &sum);
+ } while (y < height);
+ } else {
+ do {
+ int l = 0;
+ __m128i sum32 = _mm_setzero_si128();
+ do {
+ int i = 0;
+ do {
+ highbd_sse_w8_sse4_1(&sum32, a + i, b + i);
+ i += 8;
+ } while (i < width);
+ a += a_stride;
+ b += b_stride;
+ l += 1;
+ } while (l < 8 && l < (height - y));
+ summary_32_sse4(&sum32, &sum);
+ y += 8;
+ } while (y < height);
+ }
+ xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8)));
+ break;
+ }
+ return sse;
+}
+#endif // CONFIG_AV1_HIGHBITDEPTH
diff --git a/third_party/aom/aom_dsp/x86/ssim_sse2_x86_64.asm b/third_party/aom/aom_dsp/x86/ssim_sse2_x86_64.asm
new file mode 100644
index 0000000000..49bc655336
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/ssim_sse2_x86_64.asm
@@ -0,0 +1,222 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "aom_ports/x86_abi_support.asm"
+
+; tabulate_ssim - sums sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr
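+; Per source/reference word pair (s, r) this is equivalent to the scalar
+; updates:
+;   sum_s    += s          sum_r    += r
+;   sum_sq_s += s * s      sum_sq_r += r * r
+;   sum_sxr  += s * r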
+%macro TABULATE_SSIM 0
+ paddusw xmm15, xmm3 ; sum_s
+ paddusw xmm14, xmm4 ; sum_r
+ movdqa xmm1, xmm3
+ pmaddwd xmm1, xmm1
+ paddd xmm13, xmm1 ; sum_sq_s
+ movdqa xmm2, xmm4
+ pmaddwd xmm2, xmm2
+ paddd xmm12, xmm2 ; sum_sq_r
+ pmaddwd xmm3, xmm4
+ paddd xmm11, xmm3 ; sum_sxr
+%endmacro
+
+; Sum across the register %1 starting with q words
+%macro SUM_ACROSS_Q 1
+ movdqa xmm2,%1
+ punpckldq %1,xmm0
+ punpckhdq xmm2,xmm0
+ paddq %1,xmm2
+ movdqa xmm2,%1
+ punpcklqdq %1,xmm0
+ punpckhqdq xmm2,xmm0
+ paddq %1,xmm2
+%endmacro
+
+; Sum across the register %1 starting with words
+%macro SUM_ACROSS_W 1
+ movdqa xmm1, %1
+ punpcklwd %1,xmm0
+ punpckhwd xmm1,xmm0
+ paddd %1, xmm1
+ SUM_ACROSS_Q %1
+%endmacro
+
+SECTION .text
+
+;void aom_ssim_parms_16x16_sse2(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp,
+; uint32_t *sum_s,
+; uint32_t *sum_r,
+; uint32_t *sum_sq_s,
+; uint32_t *sum_sq_r,
+; uint32_t *sum_sxr);
+;
+; TODO: Use parameter passing through a structure; we probably don't need the
+; pxors (the calling app will initialize to 0). Everything could easily fit in
+; sse2 without too much hassle, and better estimates could probably be made
+; with psadw or pavgb. At this point this is just meant to be a first pass for
+; calculating all the parms needed for 16x16 ssim so we can play with dssim as
+; distortion in mode selection code.
+globalsym(aom_ssim_parms_16x16_sse2)
+sym(aom_ssim_parms_16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rcx, arg(1) ;sp
+ mov rdi, arg(2) ;r
+ mov rax, arg(3) ;rp
+
+ pxor xmm0, xmm0
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 16 ;row counter
+.NextRow:
+
+ ;grab source and reference pixels
+ movdqu xmm5, [rsi]
+ movdqu xmm6, [rdi]
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpckhbw xmm3, xmm0 ; high_s
+ punpckhbw xmm4, xmm0 ; high_r
+
+ TABULATE_SSIM
+
+ movdqa xmm3, xmm5
+ movdqa xmm4, xmm6
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz .NextRow
+
+ SUM_ACROSS_W xmm15
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movd [rdi], xmm15;
+ mov rdi,arg(5)
+ movd [rdi], xmm14;
+ mov rdi,arg(6)
+ movd [rdi], xmm13;
+ mov rdi,arg(7)
+ movd [rdi], xmm12;
+ mov rdi,arg(8)
+ movd [rdi], xmm11;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void aom_ssim_parms_8x8_sse2(
+;    unsigned char *s,
+;    int sp,
+;    unsigned char *r,
+;    int rp,
+; uint32_t *sum_s,
+; uint32_t *sum_r,
+; uint32_t *sum_sq_s,
+; uint32_t *sum_sq_r,
+; uint32_t *sum_sxr);
+;
+; TODO: Use parameter passing through a structure; we probably don't need the
+; pxors (the calling app will initialize to 0). Everything could easily fit in
+; sse2 without too much hassle, and better estimates could probably be made
+; with psadw or pavgb. At this point this is just meant to be a first pass for
+; calculating all the parms needed for 8x8 ssim so we can play with dssim as
+; distortion in mode selection code.
+globalsym(aom_ssim_parms_8x8_sse2)
+sym(aom_ssim_parms_8x8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 9
+ SAVE_XMM 15
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rsi, arg(0) ;s
+ mov rcx, arg(1) ;sp
+ mov rdi, arg(2) ;r
+ mov rax, arg(3) ;rp
+
+ pxor xmm0, xmm0
+ pxor xmm15,xmm15 ;sum_s
+ pxor xmm14,xmm14 ;sum_r
+ pxor xmm13,xmm13 ;sum_sq_s
+ pxor xmm12,xmm12 ;sum_sq_r
+ pxor xmm11,xmm11 ;sum_sxr
+
+ mov rdx, 8 ;row counter
+.NextRow:
+
+ ;grab source and reference pixels
+ movq xmm3, [rsi]
+ movq xmm4, [rdi]
+ punpcklbw xmm3, xmm0 ; low_s
+ punpcklbw xmm4, xmm0 ; low_r
+
+ TABULATE_SSIM
+
+ add rsi, rcx ; next s row
+ add rdi, rax ; next r row
+
+ dec rdx ; counter
+ jnz .NextRow
+
+ SUM_ACROSS_W xmm15
+ SUM_ACROSS_W xmm14
+ SUM_ACROSS_Q xmm13
+ SUM_ACROSS_Q xmm12
+ SUM_ACROSS_Q xmm11
+
+ mov rdi,arg(4)
+ movd [rdi], xmm15;
+ mov rdi,arg(5)
+ movd [rdi], xmm14;
+ mov rdi,arg(6)
+ movd [rdi], xmm13;
+ mov rdi,arg(7)
+ movd [rdi], xmm12;
+ mov rdi,arg(8)
+ movd [rdi], xmm11;
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
diff --git a/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm b/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm
new file mode 100644
index 0000000000..d1d8373456
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/subpel_variance_sse2.asm
@@ -0,0 +1,1470 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_8: times 8 dw 8
+bilin_filter_m_sse2: times 8 dw 16
+ times 8 dw 0
+ times 8 dw 14
+ times 8 dw 2
+ times 8 dw 12
+ times 8 dw 4
+ times 8 dw 10
+ times 8 dw 6
+ times 16 dw 8
+ times 8 dw 6
+ times 8 dw 10
+ times 8 dw 4
+ times 8 dw 12
+ times 8 dw 2
+ times 8 dw 14
+
+bilin_filter_m_ssse3: times 8 db 16, 0
+ times 8 db 14, 2
+ times 8 db 12, 4
+ times 8 db 10, 6
+ times 16 db 8
+ times 8 db 6, 10
+ times 8 db 4, 12
+ times 8 db 2, 14
+
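+; Each filter-table row above holds the two bilinear taps for one eighth-pel
+; offset k in 0..7, i.e. weights (16 - 2k, 2k). A filtered pixel is
+;   out = (a * (16 - 2k) + b * 2k + 8) >> 4
+; where a and b are the two neighbouring source pixels and 8 (pw_8) is the
+; rounding term.
+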
+SECTION .text
+
+; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
+; int x_offset, int y_offset,
+; const uint8_t *dst, ptrdiff_t dst_stride,
+; int height, unsigned int *sse);
+;
+; This function returns the sum of errors (SE) and stores the sum of squared
+; errors (SSE) in the given pointer.
+
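+; SUM_SSE accumulates, for two word-widened row pairs, the signed differences
+; (src - dst) into the running sum register and their squares (via pmaddwd)
+; into the running sse register; callers typically derive the variance as
+; sse - sum*sum / (W*H).
+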
+%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
+ psubw %3, %4
+ psubw %1, %2
+ paddw %5, %3
+ pmaddwd %3, %3
+ paddw %5, %1
+ pmaddwd %1, %1
+ paddd %6, %3
+ paddd %6, %1
+%endmacro
+
+%macro STORE_AND_RET 1
+%if %1 > 4
+  ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
+  ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
+  ; We have to sign-extend it before adding the words within the register
+  ; and outputting to a dword.
+ pcmpgtw m5, m6 ; mask for 0 > x
+ movhlps m3, m7
+ punpcklwd m4, m6, m5
+ punpckhwd m6, m5 ; sign-extend m6 word->dword
+ paddd m7, m3
+ paddd m6, m4
+ pshufd m3, m7, 0x1
+ movhlps m4, m6
+ paddd m7, m3
+ paddd m6, m4
+ mov r1, ssem ; r1 = unsigned int *sse
+ pshufd m4, m6, 0x1
+ movd [r1], m7 ; store sse
+ paddd m6, m4
+ movd raxd, m6 ; store sum as return value
+%else ; 4xh
+ pshuflw m4, m6, 0xe
+ pshuflw m3, m7, 0xe
+ paddw m6, m4
+ paddd m7, m3
+ pcmpgtw m5, m6 ; mask for 0 > x
+ mov r1, ssem ; r1 = unsigned int *sse
+ punpcklwd m6, m5 ; sign-extend m6 word->dword
+ movd [r1], m7 ; store sse
+ pshuflw m4, m6, 0xe
+ paddd m6, m4
+ movd raxd, m6 ; store sum as return value
+%endif
+ RET
+%endmacro
+
+%macro INC_SRC_BY_SRC_STRIDE 0
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+ add srcq, src_stridemp
+%else
+ add srcq, src_strideq
+%endif
+%endmacro
+
+%macro SUBPEL_VARIANCE 1-2 0 ; W
+%if cpuflag(ssse3)
+%define bilin_filter_m bilin_filter_m_ssse3
+%define filter_idx_shift 4
+%else
+%define bilin_filter_m bilin_filter_m_sse2
+%define filter_idx_shift 5
+%endif
+; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
+; 11, not 13, if the registers are ordered correctly. May make a minor speed
+; difference on Win64
+
+%if AOM_ARCH_X86_64
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+ x_offset, y_offset, dst, dst_stride, \
+ sec, sec_stride, height, sse
+ %define sec_str sec_strideq
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \
+ x_offset, y_offset, dst, dst_stride, \
+ height, sse
+ %endif
+ %define block_height heightd
+ %define bilin_filter sseq
+%else
+ %if CONFIG_PIC=1
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, dst, dst_stride, \
+ sec, sec_stride, height, sse
+ %define block_height dword heightm
+ %define sec_str sec_stridemp
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, dst, dst_stride, \
+ height, sse
+ %define block_height heightd
+ %endif
+
+ ; reuse argument stack space
+ %define g_bilin_filterm x_offsetm
+ %define g_pw_8m y_offsetm
+
+ ;Store bilin_filter and pw_8 location in stack
+ %if GET_GOT_DEFINED == 1
+ GET_GOT eax
+ add esp, 4 ; restore esp
+ %endif
+
+ lea ecx, [GLOBAL(bilin_filter_m)]
+ mov g_bilin_filterm, ecx
+
+ lea ecx, [GLOBAL(pw_8)]
+ mov g_pw_8m, ecx
+
+ LOAD_IF_USED 0, 1 ; load eax, ecx back
+ %else
+ %if %2 == 1 ; avg
+ cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, sec, sec_stride, \
+ height, sse
+ %define block_height dword heightm
+ %define sec_str sec_stridemp
+ %else
+ cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \
+ x_offset, y_offset, dst, dst_stride, \
+ height, sse
+ %define block_height heightd
+ %endif
+ %define bilin_filter bilin_filter_m
+ %endif
+%endif
+
+%if %1 == 4
+ %define movx movd
+%else
+ %define movx movh
+%endif
+
+ ASSERT %1 <= 16 ; m6 overflows if w > 16
+ pxor m6, m6 ; sum
+ pxor m7, m7 ; sse
+ ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
+ ; could perhaps use it for something more productive then
+ pxor m5, m5 ; dedicated zero register
+%if %1 < 16
+ sar block_height, 1
+%if %2 == 1 ; avg
+ shl sec_str, 1
+%endif
+%endif
+
+ ; FIXME(rbultje) replace by jumptable?
+ test x_offsetd, x_offsetd
+ jnz .x_nonzero
+ ; x_offset == 0
+ test y_offsetd, y_offsetd
+ jnz .x_zero_y_nonzero
+
+ ; x_offset == 0 && y_offset == 0
+.x_zero_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ mova m1, [dstq]
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%endif
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+
+%if %2 == 0 ; !avg
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+%if %2 == 1 ; avg
+%if %1 > 4
+ movhps m0, [srcq+src_strideq]
+%else ; 4xh
+ movx m1, [srcq+src_strideq]
+ punpckldq m0, m1
+%endif
+%else ; !avg
+ movx m2, [srcq+src_strideq]
+%endif
+
+ movx m1, [dstq]
+ movx m3, [dstq+dst_strideq]
+
+%if %2 == 1 ; avg
+%if %1 > 4
+ pavgb m0, [secq]
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+%endif
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%if %1 > 4
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; 4xh
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%else ; !avg
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_zero_y_zero_loop
+ STORE_AND_RET %1
+
+.x_zero_y_nonzero:
+ cmp y_offsetd, 4
+ jne .x_zero_y_nonhalf
+
+ ; x_offset == 0 && y_offset == 0.5
+.x_zero_y_half_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+src_strideq]
+ mova m1, [dstq]
+ pavgb m0, m4
+ punpckhbw m3, m1, m5
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+%endif
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m2, [srcq+src_strideq]
+%if %2 == 1 ; avg
+%if %1 > 4
+ movhps m2, [srcq+src_strideq*2]
+%else ; 4xh
+ movx m1, [srcq+src_strideq*2]
+ punpckldq m2, m1
+%endif
+ movx m1, [dstq]
+%if %1 > 4
+ movlhps m0, m2
+%else ; 4xh
+ punpckldq m0, m2
+%endif
+ movx m3, [dstq+dst_strideq]
+ pavgb m0, m2
+ punpcklbw m1, m5
+%if %1 > 4
+ pavgb m0, [secq]
+ punpcklbw m3, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; 4xh
+ movh m4, [secq]
+ pavgb m0, m4
+ punpcklbw m3, m5
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%else ; !avg
+ movx m4, [srcq+src_strideq*2]
+ movx m1, [dstq]
+ pavgb m0, m2
+ movx m3, [dstq+dst_strideq]
+ pavgb m2, m4
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_zero_y_half_loop
+ STORE_AND_RET %1
+
+.x_zero_y_nonhalf:
+ ; x_offset == 0 && y_offset == bilin interpolation
+%if AOM_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if AOM_ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+y_offsetq+16]
+%endif
+ mova m10, [GLOBAL(pw_8)]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32 or mmx
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0, reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+.x_zero_y_other_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+src_strideq]
+ mova m1, [dstq]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ punpcklbw m0, m5
+ punpcklbw m4, m5
+ ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
+ ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
+ ; instructions is the same (5), but it is 1 mul instead of 2, so might be
+ ; slightly faster because of pmullw latency. It would also cut our rodata
+ ; tables in half for this function, and save 1-2 registers on x86-64.
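+ ; (Derivation sketch, added for clarity: (num-x)*in1 + x*in2 equals
+ ; num*in1 + x*(in2-in1); since num*in1 is a multiple of num = 1<<log2(num),
+ ; the rounded shift splits off as in1 + ((x*(in2-in1)+rnd)>>log2(num)).)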
+ pmullw m2, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m2, m3
+ paddw m0, m4
+%endif
+ psraw m2, 4
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m2, [srcq+src_strideq]
+ movx m4, [srcq+src_strideq*2]
+ movx m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+ movx m1, [dstq]
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_y_a
+ pmullw m1, m2, filter_y_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, m1
+ paddw m2, filter_rnd
+ movx m1, [dstq]
+ paddw m2, m4
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; 4xh
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_zero_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET %1
+
+.x_nonzero:
+ cmp x_offsetd, 4
+ jne .x_nonhalf
+ ; x_offset == 0.5
+ test y_offsetd, y_offsetd
+ jnz .x_half_y_nonzero
+
+ ; x_offset == 0.5 && y_offset == 0
+.x_half_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+1]
+ mova m1, [dstq]
+ pavgb m0, m4
+ punpckhbw m3, m1, m5
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+%endif
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m4, [srcq+1]
+%if %2 == 1 ; avg
+%if %1 > 4
+ movhps m0, [srcq+src_strideq]
+ movhps m4, [srcq+src_strideq+1]
+%else ; 4xh
+ movx m1, [srcq+src_strideq]
+ punpckldq m0, m1
+ movx m2, [srcq+src_strideq+1]
+ punpckldq m4, m2
+%endif
+ movx m1, [dstq]
+ movx m3, [dstq+dst_strideq]
+ pavgb m0, m4
+ punpcklbw m3, m5
+%if %1 > 4
+ pavgb m0, [secq]
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; 4xh
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m1, m5
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%else ; !avg
+ movx m2, [srcq+src_strideq]
+ movx m1, [dstq]
+ pavgb m0, m4
+ movx m4, [srcq+src_strideq+1]
+ movx m3, [dstq+dst_strideq]
+ pavgb m2, m4
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_half_y_zero_loop
+ STORE_AND_RET %1
+
+.x_half_y_nonzero:
+ cmp y_offsetd, 4
+ jne .x_half_y_nonhalf
+
+ ; x_offset == 0.5 && y_offset == 0.5
+%if %1 == 16
+ movu m0, [srcq]
+ movu m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+.x_half_y_half_loop:
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+ mova m1, [dstq]
+ pavgb m4, m3
+ punpckhbw m3, m1, m5
+ pavgb m0, m4
+%if %2 == 1 ; avg
+ punpcklbw m1, m5
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+.x_half_y_half_loop:
+ movx m2, [srcq]
+ movx m3, [srcq+1]
+%if %2 == 1 ; avg
+%if %1 > 4
+ movhps m2, [srcq+src_strideq]
+ movhps m3, [srcq+src_strideq+1]
+%else
+ movx m1, [srcq+src_strideq]
+ punpckldq m2, m1
+ movx m1, [srcq+src_strideq+1]
+ punpckldq m3, m1
+%endif
+ pavgb m2, m3
+%if %1 > 4
+ movlhps m0, m2
+ movhlps m4, m2
+%else ; 4xh
+ punpckldq m0, m2
+ pshuflw m4, m2, 0xe
+%endif
+ movx m1, [dstq]
+ pavgb m0, m2
+ movx m3, [dstq+dst_strideq]
+%if %1 > 4
+ pavgb m0, [secq]
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+%endif
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%if %1 > 4
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%else ; !avg
+ movx m4, [srcq+src_strideq]
+ movx m1, [srcq+src_strideq+1]
+ pavgb m2, m3
+ pavgb m4, m1
+ pavgb m0, m2
+ pavgb m2, m4
+ movx m1, [dstq]
+ movx m3, [dstq+dst_strideq]
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_half_y_half_loop
+ STORE_AND_RET %1
+
+.x_half_y_nonhalf:
+ ; x_offset == 0.5 && y_offset == bilin interpolation
+%if AOM_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if AOM_ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+y_offsetq+16]
+%endif
+ mova m10, [GLOBAL(pw_8)]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+; x_offset == 0.5. We can reuse x_offset reg
+%define tempq x_offsetq
+ add y_offsetq, g_bilin_filterm
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+.x_half_y_other_loop:
+ movu m4, [srcq]
+ movu m2, [srcq+1]
+ mova m1, [dstq]
+ pavgb m4, m2
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+ psraw m2, 4
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ pmullw m2, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, filter_rnd
+ punpcklbw m0, m5
+ paddw m2, m3
+ punpcklbw m3, m4, m5
+ pmullw m0, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, filter_rnd
+ psraw m2, 4
+ paddw m0, m3
+%endif
+ punpckhbw m3, m1, m5
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+%if notcpuflag(ssse3)
+ punpcklbw m0, m5
+%endif
+.x_half_y_other_loop:
+ movx m2, [srcq]
+ movx m1, [srcq+1]
+ movx m4, [srcq+src_strideq]
+ movx m3, [srcq+src_strideq+1]
+ pavgb m2, m1
+ pavgb m4, m3
+ movx m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+ movx m1, [dstq]
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+%else
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_y_a
+ pmullw m1, m2, filter_y_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ paddw m0, m1
+ pmullw m1, m4, filter_y_b
+ paddw m2, filter_rnd
+ paddw m2, m1
+ movx m1, [dstq]
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_half_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET %1
+
+.x_nonhalf:
+ test y_offsetd, y_offsetd
+ jnz .x_nonhalf_y_nonzero
+
+ ; x_offset == bilin interpolation && y_offset == 0
+%if AOM_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if AOM_ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+.x_other_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+1]
+ mova m1, [dstq]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ punpcklbw m0, m5
+ punpcklbw m4, m5
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m0, filter_rnd
+ paddw m2, m3
+ paddw m0, m4
+%endif
+ psraw m2, 4
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m1, [srcq+1]
+ movx m2, [srcq+src_strideq]
+ movx m4, [srcq+src_strideq+1]
+ movx m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ movx m1, [dstq]
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_x_a
+ pmaddubsw m2, filter_x_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m0, m1
+ paddw m2, filter_rnd
+ movx m1, [dstq]
+ paddw m2, m4
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_other_y_zero_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET %1
+
+.x_nonhalf_y_nonzero:
+ cmp y_offsetd, 4
+ jne .x_nonhalf_y_nonhalf
+
+ ; x_offset == bilin interpolation && y_offset == 0.5
+%if AOM_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if AOM_ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else ; x86-32
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+; y_offset == 0.5. We can reuse y_offset reg.
+%define tempq y_offsetq
+ add x_offsetq, g_bilin_filterm
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m1, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ paddw m0, m1
+ paddw m2, m3
+%endif
+ psraw m0, 4
+ psraw m2, 4
+ add srcq, src_strideq
+ packuswb m0, m2
+.x_other_y_half_loop:
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+%if cpuflag(ssse3)
+ mova m1, [dstq]
+ punpckhbw m2, m4, m3
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m4, m2
+ pavgb m0, m4
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%else
+ punpckhbw m2, m4, m5
+ punpckhbw m1, m3, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ paddw m4, m3
+ paddw m2, m1
+ mova m1, [dstq]
+ psraw m4, 4
+ psraw m2, 4
+ punpckhbw m3, m1, m5
+ ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we
+ ; are one register short of being able to keep the backup of the bilin-
+ ; filtered second line as words for the next iteration. Packing into
+ ; bytes costs 1 pack and 2 unpacks, but saves a register.
+ packuswb m4, m2
+ punpcklbw m1, m5
+ pavgb m0, m4
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ pavgb m0, [secq]
+%endif
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ pmaddubsw m0, filter_x_a
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m1
+%endif
+ add srcq, src_strideq
+ psraw m0, 4
+.x_other_y_half_loop:
+ movx m2, [srcq]
+ movx m1, [srcq+1]
+ movx m4, [srcq+src_strideq]
+ movx m3, [srcq+src_strideq+1]
+%if cpuflag(ssse3)
+ punpcklbw m2, m1
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ movx m1, [dstq]
+ movx m3, [dstq+dst_strideq]
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+%else
+ punpcklbw m2, m5
+ punpcklbw m1, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ paddw m2, m1
+ movx m1, [dstq]
+ paddw m4, m3
+ movx m3, [dstq+dst_strideq]
+%endif
+ psraw m2, 4
+ psraw m4, 4
+ pavgw m0, m2
+ pavgw m2, m4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline - also consider going to bytes here
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_other_y_half_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET %1
+
+.x_nonhalf_y_nonhalf:
+%if AOM_ARCH_X86_64
+ lea bilin_filter, [GLOBAL(bilin_filter_m)]
+%endif
+ shl x_offsetd, filter_idx_shift
+ shl y_offsetd, filter_idx_shift
+%if AOM_ARCH_X86_64 && %1 > 4
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m11, [bilin_filter+y_offsetq+16]
+%endif
+ mova m12, [GLOBAL(pw_8)]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_y_a m10
+%define filter_y_b m11
+%define filter_rnd m12
+%else ; x86-32
+%if AOM_ARCH_X86=1 && CONFIG_PIC=1
+; In this case there is no unused register, so the src_stride register is
+; used; src_stride then has to be reloaded from the stack when it is needed.
+%define tempq src_strideq
+ mov tempq, g_bilin_filterm
+ add x_offsetq, tempq
+ add y_offsetq, tempq
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+
+ mov tempq, g_pw_8m
+%define filter_rnd [tempq]
+%else
+ add x_offsetq, bilin_filter
+ add y_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [GLOBAL(pw_8)]
+%endif
+%endif
+
+ ; x_offset == bilin interpolation && y_offset == bilin interpolation
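+ ; Two-pass scheme: each source row is filtered horizontally first; the
+ ; previously filtered row is carried in m0 so the vertical filter can
+ ; combine it with the newly filtered row on every iteration.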
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m1, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ paddw m0, m1
+ paddw m2, m3
+%endif
+ psraw m0, 4
+ psraw m2, 4
+
+ INC_SRC_BY_SRC_STRIDE
+
+ packuswb m0, m2
+.x_other_y_other_loop:
+%if cpuflag(ssse3)
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+ mova m1, [dstq]
+ punpckhbw m2, m4, m3
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ punpckhbw m3, m1, m5
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m4, m2
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ punpcklbw m1, m5
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+ psraw m2, 4
+ psraw m0, 4
+%else
+ movu m3, [srcq]
+ movu m4, [srcq+1]
+ punpckhbw m1, m3, m5
+ punpckhbw m2, m4, m5
+ punpcklbw m3, m5
+ punpcklbw m4, m5
+ pmullw m3, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m3, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m1, filter_rnd
+ paddw m3, m4
+ paddw m1, m2
+ psraw m3, 4
+ psraw m1, 4
+ packuswb m4, m3, m1
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ pmullw m2, filter_y_a
+ pmullw m1, filter_y_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, m1
+ mova m1, [dstq]
+ paddw m0, filter_rnd
+ psraw m2, 4
+ paddw m0, m3
+ punpckhbw m3, m1, m5
+ psraw m0, 4
+ punpcklbw m1, m5
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ INC_SRC_BY_SRC_STRIDE
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movx m0, [srcq]
+ movx m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ pmaddubsw m0, filter_x_a
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m1
+%endif
+ psraw m0, 4
+%if cpuflag(ssse3)
+ packuswb m0, m0
+%endif
+
+ INC_SRC_BY_SRC_STRIDE
+
+.x_other_y_other_loop:
+ movx m2, [srcq]
+ movx m1, [srcq+1]
+
+ INC_SRC_BY_SRC_STRIDE
+ movx m4, [srcq]
+ movx m3, [srcq+1]
+
+%if cpuflag(ssse3)
+ punpcklbw m2, m1
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ movx m3, [dstq+dst_strideq]
+ movx m1, [dstq]
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m2, m2
+ packuswb m4, m4
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+ psraw m0, 4
+ psraw m2, 4
+ punpcklbw m1, m5
+%else
+ punpcklbw m2, m5
+ punpcklbw m1, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ paddw m2, m1
+ paddw m4, m3
+ psraw m2, 4
+ psraw m4, 4
+ pmullw m0, filter_y_a
+ pmullw m3, m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ pmullw m1, m4, filter_y_b
+ paddw m2, filter_rnd
+ paddw m0, m3
+ movx m3, [dstq+dst_strideq]
+ paddw m2, m1
+ movx m1, [dstq]
+ psraw m0, 4
+ psraw m2, 4
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+%if %1 == 4
+ movlhps m0, m2
+%endif
+ packuswb m0, m2
+%if %1 > 4
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ movh m2, [secq]
+ pavgb m0, m2
+ punpcklbw m0, m5
+ movhlps m2, m0
+%endif
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ INC_SRC_BY_SRC_STRIDE
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec block_height
+ jg .x_other_y_other_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+%undef movx
+ STORE_AND_RET %1
+%endmacro
+
+; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8 && y=0,8) are identical
+; between the ssse3 and non-ssse3 builds. It may make sense to merge them by
+; having the ssse3 version jump to the appropriate location in the sse2
+; version, rather than duplicating that code in the binary.
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 4
+SUBPEL_VARIANCE 8
+SUBPEL_VARIANCE 16
+
+INIT_XMM ssse3
+SUBPEL_VARIANCE 4
+SUBPEL_VARIANCE 8
+SUBPEL_VARIANCE 16
+
+INIT_XMM sse2
+SUBPEL_VARIANCE 4, 1
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
+
+INIT_XMM ssse3
+SUBPEL_VARIANCE 4, 1
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
diff --git a/third_party/aom/aom_dsp/x86/subtract_avx2.c b/third_party/aom/aom_dsp/x86/subtract_avx2.c
new file mode 100644
index 0000000000..b4c5cc7c7b
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/subtract_avx2.c
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+static INLINE void subtract32_avx2(int16_t *diff_ptr, const uint8_t *src_ptr,
+ const uint8_t *pred_ptr) {
+ __m256i s = _mm256_lddqu_si256((__m256i *)(src_ptr));
+ __m256i p = _mm256_lddqu_si256((__m256i *)(pred_ptr));
+ __m256i set_one_minusone = _mm256_set1_epi32((int)0xff01ff01);
+ __m256i diff0 = _mm256_unpacklo_epi8(s, p);
+ __m256i diff1 = _mm256_unpackhi_epi8(s, p);
+ diff0 = _mm256_maddubs_epi16(diff0, set_one_minusone);
+ diff1 = _mm256_maddubs_epi16(diff1, set_one_minusone);
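+ // Note on the trick above: the unpacks interleave src and pred bytes as
+ // (s0, p0, s1, p1, ...); maddubs with the repeating (+1, -1) byte pattern
+ // in set_one_minusone then produces 16-bit (src - pred) differences. The
+ // permute2x128 below reorders the two 128-bit lanes so the 32 results are
+ // stored in source order.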
+ _mm256_store_si256((__m256i *)(diff_ptr),
+ _mm256_permute2x128_si256(diff0, diff1, 0x20));
+ _mm256_store_si256((__m256i *)(diff_ptr + 16),
+ _mm256_permute2x128_si256(diff0, diff1, 0x31));
+}
+
+static INLINE void subtract_block_16xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ for (int32_t j = 0; j < rows; ++j) {
+ __m128i s = _mm_lddqu_si128((__m128i *)(src_ptr));
+ __m128i p = _mm_lddqu_si128((__m128i *)(pred_ptr));
+ __m256i s_0 = _mm256_cvtepu8_epi16(s);
+ __m256i p_0 = _mm256_cvtepu8_epi16(p);
+ const __m256i d_0 = _mm256_sub_epi16(s_0, p_0);
+ _mm256_store_si256((__m256i *)(diff_ptr), d_0);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+static INLINE void subtract_block_32xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ for (int32_t j = 0; j < rows; ++j) {
+ subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+static INLINE void subtract_block_64xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ for (int32_t j = 0; j < rows; ++j) {
+ subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+ subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+static INLINE void subtract_block_128xn_avx2(
+ int rows, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
+ for (int32_t j = 0; j < rows; ++j) {
+ subtract32_avx2(diff_ptr, src_ptr, pred_ptr);
+ subtract32_avx2(diff_ptr + 32, src_ptr + 32, pred_ptr + 32);
+ subtract32_avx2(diff_ptr + 64, src_ptr + 64, pred_ptr + 64);
+ subtract32_avx2(diff_ptr + 96, src_ptr + 96, pred_ptr + 96);
+ src_ptr += src_stride;
+ pred_ptr += pred_stride;
+ diff_ptr += diff_stride;
+ }
+}
+
+void aom_subtract_block_avx2(int rows, int cols, int16_t *diff_ptr,
+ ptrdiff_t diff_stride, const uint8_t *src_ptr,
+ ptrdiff_t src_stride, const uint8_t *pred_ptr,
+ ptrdiff_t pred_stride) {
+ switch (cols) {
+ case 16:
+ subtract_block_16xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+ pred_ptr, pred_stride);
+ break;
+ case 32:
+ subtract_block_32xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+ pred_ptr, pred_stride);
+ break;
+ case 64:
+ subtract_block_64xn_avx2(rows, diff_ptr, diff_stride, src_ptr, src_stride,
+ pred_ptr, pred_stride);
+ break;
+ case 128:
+ subtract_block_128xn_avx2(rows, diff_ptr, diff_stride, src_ptr,
+ src_stride, pred_ptr, pred_stride);
+ break;
+ default:
+ aom_subtract_block_sse2(rows, cols, diff_ptr, diff_stride, src_ptr,
+ src_stride, pred_ptr, pred_stride);
+ break;
+ }
+}
diff --git a/third_party/aom/aom_dsp/x86/subtract_sse2.asm b/third_party/aom/aom_dsp/x86/subtract_sse2.asm
new file mode 100644
index 0000000000..fd508c0916
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/subtract_sse2.asm
@@ -0,0 +1,147 @@
+;
+; Copyright (c) 2016, Alliance for Open Media. All rights reserved
+;
+; This source code is subject to the terms of the BSD 2 Clause License and
+; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+; was not distributed with this source code in the LICENSE file, you can
+; obtain it at www.aomedia.org/license/software. If the Alliance for Open
+; Media Patent License 1.0 was not distributed with this source code in the
+; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; void aom_subtract_block(int rows, int cols,
+; int16_t *diff, ptrdiff_t diff_stride,
+; const uint8_t *src, ptrdiff_t src_stride,
+; const uint8_t *pred, ptrdiff_t pred_stride)
+
+INIT_XMM sse2
+cglobal subtract_block, 7, 7, 8, \
+ rows, cols, diff, diff_stride, src, src_stride, \
+ pred, pred_stride
+%define pred_str colsq
+ pxor m7, m7 ; dedicated zero register
+ cmp colsd, 4
+ je .case_4
+ cmp colsd, 8
+ je .case_8
+ cmp colsd, 16
+ je .case_16
+ cmp colsd, 32
+ je .case_32
+ cmp colsd, 64
+ je .case_64
+
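+; loop16 subtracts two 16-pixel groups per invocation: %1/%2 are byte offsets
+; into src, %3/%4 byte offsets into pred, and %5/%6 are offsets into the
+; 16-bit diff output.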
+%macro loop16 6
+ mova m0, [srcq+%1]
+ mova m4, [srcq+%2]
+ movu m1, [predq+%3]
+ movu m5, [predq+%4]
+ punpckhbw m2, m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ psubw m2, m3
+ psubw m0, m1
+ punpckhbw m1, m4, m7
+ punpckhbw m3, m5, m7
+ punpcklbw m4, m7
+ punpcklbw m5, m7
+ psubw m1, m3
+ psubw m4, m5
+ mova [diffq+mmsize*0+%5], m0
+ mova [diffq+mmsize*1+%5], m2
+ mova [diffq+mmsize*0+%6], m4
+ mova [diffq+mmsize*1+%6], m1
+%endmacro
+
+ mov pred_str, pred_stridemp
+.loop_128:
+ loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
+ loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
+ loop16 4*mmsize, 5*mmsize, 4*mmsize, 5*mmsize, 8*mmsize, 10*mmsize
+ loop16 6*mmsize, 7*mmsize, 6*mmsize, 7*mmsize, 12*mmsize, 14*mmsize
+ lea diffq, [diffq+diff_strideq*2]
+ add predq, pred_str
+ add srcq, src_strideq
+ sub rowsd, 1
+ jnz .loop_128
+ RET
+
+.case_64:
+ mov pred_str, pred_stridemp
+.loop_64:
+ loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
+ loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
+ lea diffq, [diffq+diff_strideq*2]
+ add predq, pred_str
+ add srcq, src_strideq
+ dec rowsd
+ jg .loop_64
+ RET
+
+.case_32:
+ mov pred_str, pred_stridemp
+.loop_32:
+ loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
+ lea diffq, [diffq+diff_strideq*2]
+ add predq, pred_str
+ add srcq, src_strideq
+ dec rowsd
+ jg .loop_32
+ RET
+
+.case_16:
+ mov pred_str, pred_stridemp
+.loop_16:
+ loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
+ lea diffq, [diffq+diff_strideq*4]
+ lea predq, [predq+pred_str*2]
+ lea srcq, [srcq+src_strideq*2]
+ sub rowsd, 2
+ jg .loop_16
+ RET
+
+%macro loop_h 0
+ movh m0, [srcq]
+ movh m2, [srcq+src_strideq]
+ movh m1, [predq]
+ movh m3, [predq+pred_str]
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ psubw m0, m1
+ psubw m2, m3
+ mova [diffq], m0
+ mova [diffq+diff_strideq*2], m2
+%endmacro
+
+.case_8:
+ mov pred_str, pred_stridemp
+.loop_8:
+ loop_h
+ lea diffq, [diffq+diff_strideq*4]
+ lea srcq, [srcq+src_strideq*2]
+ lea predq, [predq+pred_str*2]
+ sub rowsd, 2
+ jg .loop_8
+ RET
+
+INIT_MMX
+.case_4:
+ mov pred_str, pred_stridemp
+.loop_4:
+ loop_h
+ lea diffq, [diffq+diff_strideq*4]
+ lea srcq, [srcq+src_strideq*2]
+ lea predq, [predq+pred_str*2]
+ sub rowsd, 2
+ jg .loop_4
+ emms
+ RET
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_avx2.c b/third_party/aom/aom_dsp/x86/sum_squares_avx2.c
new file mode 100644
index 0000000000..89b9b824bf
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sum_squares_avx2.c
@@ -0,0 +1,326 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+#include <smmintrin.h>
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/synonyms_avx2.h"
+#include "aom_dsp/x86/sum_squares_sse2.h"
+#include "config/aom_dsp_rtcd.h"
+
+static uint64_t aom_sum_squares_2d_i16_nxn_avx2(const int16_t *src, int stride,
+ int width, int height) {
+ uint64_t result;
+ __m256i v_acc_q = _mm256_setzero_si256();
+ const __m256i v_zext_mask_q = yy_set1_64_from_32i(~0);
+ for (int col = 0; col < height; col += 4) {
+ __m256i v_acc_d = _mm256_setzero_si256();
+ for (int row = 0; row < width; row += 16) {
+ const int16_t *tempsrc = src + row;
+ const __m256i v_val_0_w =
+ _mm256_loadu_si256((const __m256i *)(tempsrc + 0 * stride));
+ const __m256i v_val_1_w =
+ _mm256_loadu_si256((const __m256i *)(tempsrc + 1 * stride));
+ const __m256i v_val_2_w =
+ _mm256_loadu_si256((const __m256i *)(tempsrc + 2 * stride));
+ const __m256i v_val_3_w =
+ _mm256_loadu_si256((const __m256i *)(tempsrc + 3 * stride));
+
+ const __m256i v_sq_0_d = _mm256_madd_epi16(v_val_0_w, v_val_0_w);
+ const __m256i v_sq_1_d = _mm256_madd_epi16(v_val_1_w, v_val_1_w);
+ const __m256i v_sq_2_d = _mm256_madd_epi16(v_val_2_w, v_val_2_w);
+ const __m256i v_sq_3_d = _mm256_madd_epi16(v_val_3_w, v_val_3_w);
+
+ const __m256i v_sum_01_d = _mm256_add_epi32(v_sq_0_d, v_sq_1_d);
+ const __m256i v_sum_23_d = _mm256_add_epi32(v_sq_2_d, v_sq_3_d);
+ const __m256i v_sum_0123_d = _mm256_add_epi32(v_sum_01_d, v_sum_23_d);
+
+ v_acc_d = _mm256_add_epi32(v_acc_d, v_sum_0123_d);
+ }
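+    // Widen the 32-bit partial sums to 64 bits every four rows: the mask
+    // keeps the even 32-bit lanes and the shift brings down the odd ones, so
+    // the running total in v_acc_q cannot overflow during long accumulations.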
+ v_acc_q =
+ _mm256_add_epi64(v_acc_q, _mm256_and_si256(v_acc_d, v_zext_mask_q));
+ v_acc_q = _mm256_add_epi64(v_acc_q, _mm256_srli_epi64(v_acc_d, 32));
+ src += 4 * stride;
+ }
+ __m128i lower_64_2_Value = _mm256_castsi256_si128(v_acc_q);
+ __m128i higher_64_2_Value = _mm256_extracti128_si256(v_acc_q, 1);
+ __m128i result_64_2_int = _mm_add_epi64(lower_64_2_Value, higher_64_2_Value);
+
+ result_64_2_int = _mm_add_epi64(
+ result_64_2_int, _mm_unpackhi_epi64(result_64_2_int, result_64_2_int));
+
+ xx_storel_64(&result, result_64_2_int);
+
+ return result;
+}
+
+uint64_t aom_sum_squares_2d_i16_avx2(const int16_t *src, int stride, int width,
+ int height) {
+ if (LIKELY(width == 4 && height == 4)) {
+ return aom_sum_squares_2d_i16_4x4_sse2(src, stride);
+ } else if (LIKELY(width == 4 && (height & 3) == 0)) {
+ return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height);
+ } else if (LIKELY(width == 8 && (height & 3) == 0)) {
+ return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height);
+ } else if (LIKELY(((width & 15) == 0) && ((height & 3) == 0))) {
+ return aom_sum_squares_2d_i16_nxn_avx2(src, stride, width, height);
+ } else {
+ return aom_sum_squares_2d_i16_c(src, stride, width, height);
+ }
+}
+
+static uint64_t aom_sum_sse_2d_i16_nxn_avx2(const int16_t *src, int stride,
+ int width, int height, int *sum) {
+ uint64_t result;
+ const __m256i zero_reg = _mm256_setzero_si256();
+ const __m256i one_reg = _mm256_set1_epi16(1);
+
+ __m256i v_sse_total = zero_reg;
+ __m256i v_sum_total = zero_reg;
+
+ for (int col = 0; col < height; col += 4) {
+ __m256i v_sse_row = zero_reg;
+ for (int row = 0; row < width; row += 16) {
+ const int16_t *tempsrc = src + row;
+ const __m256i v_val_0_w =
+ _mm256_loadu_si256((const __m256i *)(tempsrc + 0 * stride));
+ const __m256i v_val_1_w =
+ _mm256_loadu_si256((const __m256i *)(tempsrc + 1 * stride));
+ const __m256i v_val_2_w =
+ _mm256_loadu_si256((const __m256i *)(tempsrc + 2 * stride));
+ const __m256i v_val_3_w =
+ _mm256_loadu_si256((const __m256i *)(tempsrc + 3 * stride));
+
+ const __m256i v_sum_01 = _mm256_add_epi16(v_val_0_w, v_val_1_w);
+ const __m256i v_sum_23 = _mm256_add_epi16(v_val_2_w, v_val_3_w);
+ __m256i v_sum_0123 = _mm256_add_epi16(v_sum_01, v_sum_23);
+ v_sum_0123 = _mm256_madd_epi16(v_sum_0123, one_reg);
+ v_sum_total = _mm256_add_epi32(v_sum_total, v_sum_0123);
+
+ const __m256i v_sq_0_d = _mm256_madd_epi16(v_val_0_w, v_val_0_w);
+ const __m256i v_sq_1_d = _mm256_madd_epi16(v_val_1_w, v_val_1_w);
+ const __m256i v_sq_2_d = _mm256_madd_epi16(v_val_2_w, v_val_2_w);
+ const __m256i v_sq_3_d = _mm256_madd_epi16(v_val_3_w, v_val_3_w);
+ const __m256i v_sq_01_d = _mm256_add_epi32(v_sq_0_d, v_sq_1_d);
+ const __m256i v_sq_23_d = _mm256_add_epi32(v_sq_2_d, v_sq_3_d);
+ const __m256i v_sq_0123_d = _mm256_add_epi32(v_sq_01_d, v_sq_23_d);
+ v_sse_row = _mm256_add_epi32(v_sse_row, v_sq_0123_d);
+ }
+ const __m256i v_sse_row_low = _mm256_unpacklo_epi32(v_sse_row, zero_reg);
+ const __m256i v_sse_row_hi = _mm256_unpackhi_epi32(v_sse_row, zero_reg);
+ v_sse_row = _mm256_add_epi64(v_sse_row_low, v_sse_row_hi);
+ v_sse_total = _mm256_add_epi64(v_sse_total, v_sse_row);
+ src += 4 * stride;
+ }
+
+ const __m128i v_sum_total_low = _mm256_castsi256_si128(v_sum_total);
+ const __m128i v_sum_total_hi = _mm256_extracti128_si256(v_sum_total, 1);
+ __m128i sum_128bit = _mm_add_epi32(v_sum_total_hi, v_sum_total_low);
+ sum_128bit = _mm_add_epi32(sum_128bit, _mm_srli_si128(sum_128bit, 8));
+ sum_128bit = _mm_add_epi32(sum_128bit, _mm_srli_si128(sum_128bit, 4));
+ *sum += _mm_cvtsi128_si32(sum_128bit);
+
+ __m128i v_sse_total_lo = _mm256_castsi256_si128(v_sse_total);
+ __m128i v_sse_total_hi = _mm256_extracti128_si256(v_sse_total, 1);
+ __m128i sse_128bit = _mm_add_epi64(v_sse_total_lo, v_sse_total_hi);
+
+ sse_128bit =
+ _mm_add_epi64(sse_128bit, _mm_unpackhi_epi64(sse_128bit, sse_128bit));
+
+ xx_storel_64(&result, sse_128bit);
+
+ return result;
+}
+
+uint64_t aom_sum_sse_2d_i16_avx2(const int16_t *src, int src_stride, int width,
+ int height, int *sum) {
+ if (LIKELY(width == 4 && height == 4)) {
+ return aom_sum_sse_2d_i16_4x4_sse2(src, src_stride, sum);
+ } else if (LIKELY(width == 4 && (height & 3) == 0)) {
+ return aom_sum_sse_2d_i16_4xn_sse2(src, src_stride, height, sum);
+ } else if (LIKELY(width == 8 && (height & 3) == 0)) {
+ return aom_sum_sse_2d_i16_nxn_sse2(src, src_stride, width, height, sum);
+ } else if (LIKELY(((width & 15) == 0) && ((height & 3) == 0))) {
+ return aom_sum_sse_2d_i16_nxn_avx2(src, src_stride, width, height, sum);
+ } else {
+ return aom_sum_sse_2d_i16_c(src, src_stride, width, height, sum);
+ }
+}
+
+// Accumulate sum of 16-bit elements in the vector
+static AOM_INLINE int32_t mm256_accumulate_epi16(__m256i vec_a) {
+ __m128i vtmp1 = _mm256_extracti128_si256(vec_a, 1);
+ __m128i vtmp2 = _mm256_castsi256_si128(vec_a);
+ vtmp1 = _mm_add_epi16(vtmp1, vtmp2);
+ vtmp2 = _mm_srli_si128(vtmp1, 8);
+ vtmp1 = _mm_add_epi16(vtmp1, vtmp2);
+ vtmp2 = _mm_srli_si128(vtmp1, 4);
+ vtmp1 = _mm_add_epi16(vtmp1, vtmp2);
+ vtmp2 = _mm_srli_si128(vtmp1, 2);
+ vtmp1 = _mm_add_epi16(vtmp1, vtmp2);
+ return _mm_extract_epi16(vtmp1, 0);
+}
+
+// Accumulate sum of 32-bit elements in the vector
+static AOM_INLINE int32_t mm256_accumulate_epi32(__m256i vec_a) {
+ __m128i vtmp1 = _mm256_extracti128_si256(vec_a, 1);
+ __m128i vtmp2 = _mm256_castsi256_si128(vec_a);
+ vtmp1 = _mm_add_epi32(vtmp1, vtmp2);
+ vtmp2 = _mm_srli_si128(vtmp1, 8);
+ vtmp1 = _mm_add_epi32(vtmp1, vtmp2);
+ vtmp2 = _mm_srli_si128(vtmp1, 4);
+ vtmp1 = _mm_add_epi32(vtmp1, vtmp2);
+ return _mm_cvtsi128_si32(vtmp1);
+}
+
+uint64_t aom_var_2d_u8_avx2(uint8_t *src, int src_stride, int width,
+ int height) {
+ uint8_t *srcp;
+ uint64_t s = 0, ss = 0;
+ __m256i vzero = _mm256_setzero_si256();
+ __m256i v_acc_sum = vzero;
+ __m256i v_acc_sqs = vzero;
+ int i, j;
+
+ // Process 32 elements in a row
+ for (i = 0; i < width - 31; i += 32) {
+ srcp = src + i;
+ // Process 8 rows at a time
+ for (j = 0; j < height - 7; j += 8) {
+ __m256i vsrc[8];
+ for (int k = 0; k < 8; k++) {
+ vsrc[k] = _mm256_loadu_si256((__m256i *)srcp);
+ srcp += src_stride;
+ }
+ for (int k = 0; k < 8; k++) {
+ __m256i vsrc0 = _mm256_unpacklo_epi8(vsrc[k], vzero);
+ __m256i vsrc1 = _mm256_unpackhi_epi8(vsrc[k], vzero);
+ v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc0);
+ v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc1);
+
+ __m256i vsqs0 = _mm256_madd_epi16(vsrc0, vsrc0);
+ __m256i vsqs1 = _mm256_madd_epi16(vsrc1, vsrc1);
+ v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0);
+ v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs1);
+ }
+
+ // Update total sum and clear the vectors
+ s += mm256_accumulate_epi16(v_acc_sum);
+ ss += mm256_accumulate_epi32(v_acc_sqs);
+ v_acc_sum = vzero;
+ v_acc_sqs = vzero;
+ }
+
+ // Process remaining rows (height not a multiple of 8)
+ for (; j < height; j++) {
+ __m256i vsrc = _mm256_loadu_si256((__m256i *)srcp);
+ __m256i vsrc0 = _mm256_unpacklo_epi8(vsrc, vzero);
+ __m256i vsrc1 = _mm256_unpackhi_epi8(vsrc, vzero);
+ v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc0);
+ v_acc_sum = _mm256_add_epi16(v_acc_sum, vsrc1);
+
+ __m256i vsqs0 = _mm256_madd_epi16(vsrc0, vsrc0);
+ __m256i vsqs1 = _mm256_madd_epi16(vsrc1, vsrc1);
+ v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0);
+ v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs1);
+
+ srcp += src_stride;
+ }
+
+ // Update total sum and clear the vectors
+ s += mm256_accumulate_epi16(v_acc_sum);
+ ss += mm256_accumulate_epi32(v_acc_sqs);
+ v_acc_sum = vzero;
+ v_acc_sqs = vzero;
+ }
+
+ // Process the remaining area using C
+ srcp = src;
+ for (int k = 0; k < height; k++) {
+ for (int m = i; m < width; m++) {
+ uint8_t val = srcp[m];
+ s += val;
+ ss += val * val;
+ }
+ srcp += src_stride;
+ }
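+  // sum((x - mean)^2) == sum(x^2) - (sum(x))^2 / N, so this returns the sum
+  // of squared deviations over the block (the second term uses integer
+  // division).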
+ return (ss - s * s / (width * height));
+}
+
+uint64_t aom_var_2d_u16_avx2(uint8_t *src, int src_stride, int width,
+ int height) {
+ uint16_t *srcp1 = CONVERT_TO_SHORTPTR(src), *srcp;
+ uint64_t s = 0, ss = 0;
+ __m256i vzero = _mm256_setzero_si256();
+ __m256i v_acc_sum = vzero;
+ __m256i v_acc_sqs = vzero;
+ int i, j;
+
+ // Process 16 elements in a row
+ for (i = 0; i < width - 15; i += 16) {
+ srcp = srcp1 + i;
+ // Process 8 rows at a time
+ for (j = 0; j < height - 8; j += 8) {
+ __m256i vsrc[8];
+ for (int k = 0; k < 8; k++) {
+ vsrc[k] = _mm256_loadu_si256((__m256i *)srcp);
+ srcp += src_stride;
+ }
+ for (int k = 0; k < 8; k++) {
+ __m256i vsrc0 = _mm256_unpacklo_epi16(vsrc[k], vzero);
+ __m256i vsrc1 = _mm256_unpackhi_epi16(vsrc[k], vzero);
+ v_acc_sum = _mm256_add_epi32(vsrc0, v_acc_sum);
+ v_acc_sum = _mm256_add_epi32(vsrc1, v_acc_sum);
+
+ __m256i vsqs0 = _mm256_madd_epi16(vsrc[k], vsrc[k]);
+ v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0);
+ }
+
+ // Update total sum and clear the vectors
+ s += mm256_accumulate_epi32(v_acc_sum);
+ ss += mm256_accumulate_epi32(v_acc_sqs);
+ v_acc_sum = vzero;
+ v_acc_sqs = vzero;
+ }
+
+ // Process remaining rows (height not a multiple of 8)
+ for (; j < height; j++) {
+ __m256i vsrc = _mm256_loadu_si256((__m256i *)srcp);
+ __m256i vsrc0 = _mm256_unpacklo_epi16(vsrc, vzero);
+ __m256i vsrc1 = _mm256_unpackhi_epi16(vsrc, vzero);
+ v_acc_sum = _mm256_add_epi32(vsrc0, v_acc_sum);
+ v_acc_sum = _mm256_add_epi32(vsrc1, v_acc_sum);
+
+ __m256i vsqs0 = _mm256_madd_epi16(vsrc, vsrc);
+ v_acc_sqs = _mm256_add_epi32(v_acc_sqs, vsqs0);
+ srcp += src_stride;
+ }
+
+ // Update total sum and clear the vectors
+ s += mm256_accumulate_epi32(v_acc_sum);
+ ss += mm256_accumulate_epi32(v_acc_sqs);
+ v_acc_sum = vzero;
+ v_acc_sqs = vzero;
+ }
+
+ // Process the remaining area using C
+ srcp = srcp1;
+ for (int k = 0; k < height; k++) {
+ for (int m = i; m < width; m++) {
+ uint16_t val = srcp[m];
+ s += val;
+ ss += val * val;
+ }
+ srcp += src_stride;
+ }
+ return (ss - s * s / (width * height));
+}
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.c b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
new file mode 100644
index 0000000000..cf3ed98974
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sum_squares_sse2.c
@@ -0,0 +1,478 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <stdio.h>
+
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_dsp/x86/sum_squares_sse2.h"
+#include "config/aom_dsp_rtcd.h"
+
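+// Loads 8 bytes from b into the upper half of a; the lower half of a is kept.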
+static INLINE __m128i xx_loadh_64(__m128i a, const void *b) {
+ const __m128d ad = _mm_castsi128_pd(a);
+ return _mm_castpd_si128(_mm_loadh_pd(ad, (double *)b));
+}
+
+static INLINE uint64_t xx_cvtsi128_si64(__m128i a) {
+#if AOM_ARCH_X86_64
+ return (uint64_t)_mm_cvtsi128_si64(a);
+#else
+ {
+ uint64_t tmp;
+ _mm_storel_epi64((__m128i *)&tmp, a);
+ return tmp;
+ }
+#endif
+}
+
+static INLINE __m128i sum_squares_i16_4x4_sse2(const int16_t *src, int stride) {
+ const __m128i v_val_0_w = xx_loadl_64(src + 0 * stride);
+ const __m128i v_val_2_w = xx_loadl_64(src + 2 * stride);
+ const __m128i v_val_01_w = xx_loadh_64(v_val_0_w, src + 1 * stride);
+ const __m128i v_val_23_w = xx_loadh_64(v_val_2_w, src + 3 * stride);
+ const __m128i v_sq_01_d = _mm_madd_epi16(v_val_01_w, v_val_01_w);
+ const __m128i v_sq_23_d = _mm_madd_epi16(v_val_23_w, v_val_23_w);
+
+ return _mm_add_epi32(v_sq_01_d, v_sq_23_d);
+}
+
+uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride) {
+ const __m128i v_sum_0123_d = sum_squares_i16_4x4_sse2(src, stride);
+ __m128i v_sum_d =
+ _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32));
+ v_sum_d = _mm_add_epi32(v_sum_d, _mm_srli_si128(v_sum_d, 8));
+ return (uint64_t)_mm_cvtsi128_si32(v_sum_d);
+}
+
+uint64_t aom_sum_sse_2d_i16_4x4_sse2(const int16_t *src, int stride, int *sum) {
+ const __m128i one_reg = _mm_set1_epi16(1);
+ const __m128i v_val_0_w = xx_loadl_64(src + 0 * stride);
+ const __m128i v_val_2_w = xx_loadl_64(src + 2 * stride);
+ __m128i v_val_01_w = xx_loadh_64(v_val_0_w, src + 1 * stride);
+ __m128i v_val_23_w = xx_loadh_64(v_val_2_w, src + 3 * stride);
+
+ __m128i v_sum_0123_d = _mm_add_epi16(v_val_01_w, v_val_23_w);
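+  // madd with a vector of ones folds adjacent 16-bit lanes into 32-bit
+  // partial sums; the shifts below then reduce them to the scalar block sum.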
+ v_sum_0123_d = _mm_madd_epi16(v_sum_0123_d, one_reg);
+ v_sum_0123_d = _mm_add_epi32(v_sum_0123_d, _mm_srli_si128(v_sum_0123_d, 8));
+ v_sum_0123_d = _mm_add_epi32(v_sum_0123_d, _mm_srli_si128(v_sum_0123_d, 4));
+ *sum = _mm_cvtsi128_si32(v_sum_0123_d);
+
+ const __m128i v_sq_01_d = _mm_madd_epi16(v_val_01_w, v_val_01_w);
+ const __m128i v_sq_23_d = _mm_madd_epi16(v_val_23_w, v_val_23_w);
+ __m128i v_sq_0123_d = _mm_add_epi32(v_sq_01_d, v_sq_23_d);
+ v_sq_0123_d = _mm_add_epi32(v_sq_0123_d, _mm_srli_si128(v_sq_0123_d, 8));
+ v_sq_0123_d = _mm_add_epi32(v_sq_0123_d, _mm_srli_si128(v_sq_0123_d, 4));
+ return (uint64_t)_mm_cvtsi128_si32(v_sq_0123_d);
+}
+
+uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
+ int height) {
+ int r = 0;
+ __m128i v_acc_q = _mm_setzero_si128();
+ do {
+ const __m128i v_acc_d = sum_squares_i16_4x4_sse2(src, stride);
+ v_acc_q = _mm_add_epi32(v_acc_q, v_acc_d);
+ src += stride << 2;
+ r += 4;
+ } while (r < height);
+ const __m128i v_zext_mask_q = xx_set1_64_from_32i(~0);
+ __m128i v_acc_64 = _mm_add_epi64(_mm_srli_epi64(v_acc_q, 32),
+ _mm_and_si128(v_acc_q, v_zext_mask_q));
+ v_acc_64 = _mm_add_epi64(v_acc_64, _mm_srli_si128(v_acc_64, 8));
+ return xx_cvtsi128_si64(v_acc_64);
+}
+
+uint64_t aom_sum_sse_2d_i16_4xn_sse2(const int16_t *src, int stride, int height,
+ int *sum) {
+ int r = 0;
+ uint64_t sse = 0;
+ do {
+ int curr_sum = 0;
+ sse += aom_sum_sse_2d_i16_4x4_sse2(src, stride, &curr_sum);
+ *sum += curr_sum;
+ src += stride << 2;
+ r += 4;
+ } while (r < height);
+ return sse;
+}
+
+#ifdef __GNUC__
+// This prevents GCC/Clang from inlining this function into
+// aom_sum_squares_2d_i16_sse2, which in turn saves some stack
+// maintenance instructions in the common case of 4x4.
+__attribute__((noinline))
+#endif
+uint64_t
+aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int width,
+ int height) {
+ int r = 0;
+
+ const __m128i v_zext_mask_q = xx_set1_64_from_32i(~0);
+ __m128i v_acc_q = _mm_setzero_si128();
+
+ do {
+ __m128i v_acc_d = _mm_setzero_si128();
+ int c = 0;
+ do {
+ const int16_t *b = src + c;
+
+ const __m128i v_val_0_w = xx_load_128(b + 0 * stride);
+ const __m128i v_val_1_w = xx_load_128(b + 1 * stride);
+ const __m128i v_val_2_w = xx_load_128(b + 2 * stride);
+ const __m128i v_val_3_w = xx_load_128(b + 3 * stride);
+
+ const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
+ const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
+ const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
+ const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
+
+ const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
+ const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
+
+ const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
+
+ v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d);
+ c += 8;
+ } while (c < width);
+
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q));
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32));
+
+ src += 4 * stride;
+ r += 4;
+ } while (r < height);
+
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
+ return xx_cvtsi128_si64(v_acc_q);
+}
+
+#ifdef __GNUC__
+// This prevents GCC/Clang from inlining this function into
+// aom_sum_sse_2d_i16_nxn_sse2, which in turn saves some stack
+// maintenance instructions in the common case of 4x4.
+__attribute__((noinline))
+#endif
+uint64_t
+aom_sum_sse_2d_i16_nxn_sse2(const int16_t *src, int stride, int width,
+ int height, int *sum) {
+ int r = 0;
+ uint64_t result;
+ const __m128i zero_reg = _mm_setzero_si128();
+ const __m128i one_reg = _mm_set1_epi16(1);
+
+ __m128i v_sse_total = zero_reg;
+ __m128i v_sum_total = zero_reg;
+
+ do {
+ int c = 0;
+ __m128i v_sse_row = zero_reg;
+ do {
+ const int16_t *b = src + c;
+
+ __m128i v_val_0_w = xx_load_128(b + 0 * stride);
+ __m128i v_val_1_w = xx_load_128(b + 1 * stride);
+ __m128i v_val_2_w = xx_load_128(b + 2 * stride);
+ __m128i v_val_3_w = xx_load_128(b + 3 * stride);
+
+ const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
+ const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
+ const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
+ const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
+ const __m128i v_sq_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
+ const __m128i v_sq_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
+ const __m128i v_sq_0123_d = _mm_add_epi32(v_sq_01_d, v_sq_23_d);
+ v_sse_row = _mm_add_epi32(v_sse_row, v_sq_0123_d);
+
+ const __m128i v_sum_01 = _mm_add_epi16(v_val_0_w, v_val_1_w);
+ const __m128i v_sum_23 = _mm_add_epi16(v_val_2_w, v_val_3_w);
+ __m128i v_sum_0123_d = _mm_add_epi16(v_sum_01, v_sum_23);
+ v_sum_0123_d = _mm_madd_epi16(v_sum_0123_d, one_reg);
+ v_sum_total = _mm_add_epi32(v_sum_total, v_sum_0123_d);
+
+ c += 8;
+ } while (c < width);
+
+ const __m128i v_sse_row_low = _mm_unpacklo_epi32(v_sse_row, zero_reg);
+ const __m128i v_sse_row_hi = _mm_unpackhi_epi32(v_sse_row, zero_reg);
+ v_sse_row = _mm_add_epi64(v_sse_row_low, v_sse_row_hi);
+ v_sse_total = _mm_add_epi64(v_sse_total, v_sse_row);
+ src += 4 * stride;
+ r += 4;
+ } while (r < height);
+
+ v_sum_total = _mm_add_epi32(v_sum_total, _mm_srli_si128(v_sum_total, 8));
+ v_sum_total = _mm_add_epi32(v_sum_total, _mm_srli_si128(v_sum_total, 4));
+ *sum += _mm_cvtsi128_si32(v_sum_total);
+
+ v_sse_total = _mm_add_epi64(v_sse_total, _mm_srli_si128(v_sse_total, 8));
+ xx_storel_64(&result, v_sse_total);
+ return result;
+}
+
+uint64_t aom_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int width,
+ int height) {
+ // 4 elements per row only requires half an XMM register, so this
+ // must be a special case, but also note that over 75% of all calls
+ // are with size == 4, so it is also the common case.
+ if (LIKELY(width == 4 && height == 4)) {
+ return aom_sum_squares_2d_i16_4x4_sse2(src, stride);
+ } else if (LIKELY(width == 4 && (height & 3) == 0)) {
+ return aom_sum_squares_2d_i16_4xn_sse2(src, stride, height);
+ } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) {
+ // Generic case
+ return aom_sum_squares_2d_i16_nxn_sse2(src, stride, width, height);
+ } else {
+ return aom_sum_squares_2d_i16_c(src, stride, width, height);
+ }
+}
+
+uint64_t aom_sum_sse_2d_i16_sse2(const int16_t *src, int src_stride, int width,
+ int height, int *sum) {
+ if (LIKELY(width == 4 && height == 4)) {
+ return aom_sum_sse_2d_i16_4x4_sse2(src, src_stride, sum);
+ } else if (LIKELY(width == 4 && (height & 3) == 0)) {
+ return aom_sum_sse_2d_i16_4xn_sse2(src, src_stride, height, sum);
+ } else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) {
+ // Generic case
+ return aom_sum_sse_2d_i16_nxn_sse2(src, src_stride, width, height, sum);
+ } else {
+ return aom_sum_sse_2d_i16_c(src, src_stride, width, height, sum);
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// 1D version
+//////////////////////////////////////////////////////////////////////////////
+
+static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) {
+ const __m128i v_zext_mask_q = xx_set1_64_from_32i(~0);
+ __m128i v_acc0_q = _mm_setzero_si128();
+ __m128i v_acc1_q = _mm_setzero_si128();
+
+ const int16_t *const end = src + n;
+
+ assert(n % 64 == 0);
+
+ while (src < end) {
+ const __m128i v_val_0_w = xx_load_128(src);
+ const __m128i v_val_1_w = xx_load_128(src + 8);
+ const __m128i v_val_2_w = xx_load_128(src + 16);
+ const __m128i v_val_3_w = xx_load_128(src + 24);
+ const __m128i v_val_4_w = xx_load_128(src + 32);
+ const __m128i v_val_5_w = xx_load_128(src + 40);
+ const __m128i v_val_6_w = xx_load_128(src + 48);
+ const __m128i v_val_7_w = xx_load_128(src + 56);
+
+ const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
+ const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
+ const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
+ const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
+ const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
+ const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
+ const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
+ const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);
+
+ const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
+ const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
+ const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
+ const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);
+
+ const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
+ const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);
+
+ const __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d, v_sum_4567_d);
+
+ v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_and_si128(v_sum_d, v_zext_mask_q));
+ v_acc1_q = _mm_add_epi64(v_acc1_q, _mm_srli_epi64(v_sum_d, 32));
+
+ src += 64;
+ }
+
+ v_acc0_q = _mm_add_epi64(v_acc0_q, v_acc1_q);
+ v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));
+ return xx_cvtsi128_si64(v_acc0_q);
+}
+
+uint64_t aom_sum_squares_i16_sse2(const int16_t *src, uint32_t n) {
+ if (n % 64 == 0) {
+ return aom_sum_squares_i16_64n_sse2(src, n);
+ } else if (n > 64) {
+ const uint32_t k = n & ~63u;
+ return aom_sum_squares_i16_64n_sse2(src, k) +
+ aom_sum_squares_i16_c(src + k, n - k);
+ } else {
+ return aom_sum_squares_i16_c(src, n);
+ }
+}
+
+// Accumulate sum of 16-bit elements in the vector
+static AOM_INLINE int32_t mm_accumulate_epi16(__m128i vec_a) {
+ __m128i vtmp = _mm_srli_si128(vec_a, 8);
+ vec_a = _mm_add_epi16(vec_a, vtmp);
+ vtmp = _mm_srli_si128(vec_a, 4);
+ vec_a = _mm_add_epi16(vec_a, vtmp);
+ vtmp = _mm_srli_si128(vec_a, 2);
+ vec_a = _mm_add_epi16(vec_a, vtmp);
+ return _mm_extract_epi16(vec_a, 0);
+}
+
+// Accumulate sum of 32-bit elements in the vector
+static AOM_INLINE int32_t mm_accumulate_epi32(__m128i vec_a) {
+ __m128i vtmp = _mm_srli_si128(vec_a, 8);
+ vec_a = _mm_add_epi32(vec_a, vtmp);
+ vtmp = _mm_srli_si128(vec_a, 4);
+ vec_a = _mm_add_epi32(vec_a, vtmp);
+ return _mm_cvtsi128_si32(vec_a);
+}
+
+uint64_t aom_var_2d_u8_sse2(uint8_t *src, int src_stride, int width,
+ int height) {
+ uint8_t *srcp;
+ uint64_t s = 0, ss = 0;
+ __m128i vzero = _mm_setzero_si128();
+ __m128i v_acc_sum = vzero;
+ __m128i v_acc_sqs = vzero;
+ int i, j;
+
+ // Process 16 elements in a row
+ for (i = 0; i < width - 15; i += 16) {
+ srcp = src + i;
+ // Process 8 rows at a time
+ for (j = 0; j < height - 7; j += 8) {
+ __m128i vsrc[8];
+ for (int k = 0; k < 8; k++) {
+ vsrc[k] = _mm_loadu_si128((__m128i *)srcp);
+ srcp += src_stride;
+ }
+ for (int k = 0; k < 8; k++) {
+ __m128i vsrc0 = _mm_unpacklo_epi8(vsrc[k], vzero);
+ __m128i vsrc1 = _mm_unpackhi_epi8(vsrc[k], vzero);
+ v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc0);
+ v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc1);
+
+ __m128i vsqs0 = _mm_madd_epi16(vsrc0, vsrc0);
+ __m128i vsqs1 = _mm_madd_epi16(vsrc1, vsrc1);
+ v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0);
+ v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs1);
+ }
+
+ // Update total sum and clear the vectors
+ s += mm_accumulate_epi16(v_acc_sum);
+ ss += mm_accumulate_epi32(v_acc_sqs);
+ v_acc_sum = vzero;
+ v_acc_sqs = vzero;
+ }
+
+ // Process remaining rows (height not a multiple of 8)
+ for (; j < height; j++) {
+ __m128i vsrc = _mm_loadu_si128((__m128i *)srcp);
+ __m128i vsrc0 = _mm_unpacklo_epi8(vsrc, vzero);
+ __m128i vsrc1 = _mm_unpackhi_epi8(vsrc, vzero);
+ v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc0);
+ v_acc_sum = _mm_add_epi16(v_acc_sum, vsrc1);
+
+ __m128i vsqs0 = _mm_madd_epi16(vsrc0, vsrc0);
+ __m128i vsqs1 = _mm_madd_epi16(vsrc1, vsrc1);
+ v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0);
+ v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs1);
+
+ srcp += src_stride;
+ }
+
+ // Update total sum and clear the vectors
+ s += mm_accumulate_epi16(v_acc_sum);
+ ss += mm_accumulate_epi32(v_acc_sqs);
+ v_acc_sum = vzero;
+ v_acc_sqs = vzero;
+ }
+
+ // Process the remaining area using C
+ srcp = src;
+ for (int k = 0; k < height; k++) {
+ for (int m = i; m < width; m++) {
+ uint8_t val = srcp[m];
+ s += val;
+ ss += val * val;
+ }
+ srcp += src_stride;
+ }
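+ // ss - s^2 / N is the sum of squared deviations from the mean, i.e. N times
+ // the variance of the width x height block.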
+ return (ss - s * s / (width * height));
+}
+
+uint64_t aom_var_2d_u16_sse2(uint8_t *src, int src_stride, int width,
+ int height) {
+ uint16_t *srcp1 = CONVERT_TO_SHORTPTR(src), *srcp;
+ uint64_t s = 0, ss = 0;
+ __m128i vzero = _mm_setzero_si128();
+ __m128i v_acc_sum = vzero;
+ __m128i v_acc_sqs = vzero;
+ int i, j;
+
+ // Process 8 elements in a row
+ for (i = 0; i < width - 8; i += 8) {
+ srcp = srcp1 + i;
+ // Process 8 rows at a time
+ for (j = 0; j < height - 8; j += 8) {
+ __m128i vsrc[8];
+ for (int k = 0; k < 8; k++) {
+ vsrc[k] = _mm_loadu_si128((__m128i *)srcp);
+ srcp += src_stride;
+ }
+ for (int k = 0; k < 8; k++) {
+ __m128i vsrc0 = _mm_unpacklo_epi16(vsrc[k], vzero);
+ __m128i vsrc1 = _mm_unpackhi_epi16(vsrc[k], vzero);
+ v_acc_sum = _mm_add_epi32(vsrc0, v_acc_sum);
+ v_acc_sum = _mm_add_epi32(vsrc1, v_acc_sum);
+
+ __m128i vsqs0 = _mm_madd_epi16(vsrc[k], vsrc[k]);
+ v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0);
+ }
+
+ // Update total sum and clear the vectors
+ s += mm_accumulate_epi32(v_acc_sum);
+ ss += mm_accumulate_epi32(v_acc_sqs);
+ v_acc_sum = vzero;
+ v_acc_sqs = vzero;
+ }
+
+ // Process remaining rows (height not a multiple of 8)
+ for (; j < height; j++) {
+ __m128i vsrc = _mm_loadu_si128((__m128i *)srcp);
+ __m128i vsrc0 = _mm_unpacklo_epi16(vsrc, vzero);
+ __m128i vsrc1 = _mm_unpackhi_epi16(vsrc, vzero);
+ v_acc_sum = _mm_add_epi32(vsrc0, v_acc_sum);
+ v_acc_sum = _mm_add_epi32(vsrc1, v_acc_sum);
+
+ __m128i vsqs0 = _mm_madd_epi16(vsrc, vsrc);
+ v_acc_sqs = _mm_add_epi32(v_acc_sqs, vsqs0);
+ srcp += src_stride;
+ }
+
+ // Update total sum and clear the vectors
+ s += mm_accumulate_epi32(v_acc_sum);
+ ss += mm_accumulate_epi32(v_acc_sqs);
+ v_acc_sum = vzero;
+ v_acc_sqs = vzero;
+ }
+
+ // Process the remaining area using C
+ srcp = srcp1;
+ for (int k = 0; k < height; k++) {
+ for (int m = i; m < width; m++) {
+ uint16_t val = srcp[m];
+ s += val;
+ ss += val * val;
+ }
+ srcp += src_stride;
+ }
+ return (ss - s * s / (width * height));
+}
diff --git a/third_party/aom/aom_dsp/x86/sum_squares_sse2.h b/third_party/aom/aom_dsp/x86/sum_squares_sse2.h
new file mode 100644
index 0000000000..5ed3f2c7bf
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/sum_squares_sse2.h
@@ -0,0 +1,28 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_DSP_X86_SUM_SQUARES_SSE2_H_
+#define AOM_DSP_X86_SUM_SQUARES_SSE2_H_
+
+uint64_t aom_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride,
+ int width, int height);
+
+uint64_t aom_sum_squares_2d_i16_4xn_sse2(const int16_t *src, int stride,
+ int height);
+uint64_t aom_sum_squares_2d_i16_4x4_sse2(const int16_t *src, int stride);
+
+uint64_t aom_sum_sse_2d_i16_4x4_sse2(const int16_t *src, int stride, int *sum);
+uint64_t aom_sum_sse_2d_i16_4xn_sse2(const int16_t *src, int stride, int height,
+ int *sum);
+uint64_t aom_sum_sse_2d_i16_nxn_sse2(const int16_t *src, int stride, int width,
+ int height, int *sum);
+
+#endif // AOM_DSP_X86_SUM_SQUARES_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/synonyms.h b/third_party/aom/aom_dsp/x86/synonyms.h
new file mode 100644
index 0000000000..6744ec51d0
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/synonyms.h
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_SYNONYMS_H_
+#define AOM_AOM_DSP_X86_SYNONYMS_H_
+
+#include <immintrin.h>
+#include <string.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+/**
+ * Various reusable shorthands for x86 SIMD intrinsics.
+ *
+ * Intrinsics prefixed with xx_ operate on or return 128-bit XMM registers.
+ * Intrinsics prefixed with yy_ operate on or return 256-bit YMM registers.
+ */
+
+// Loads and stores to do away with the tedium of casting the address
+// to the right type.
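+// For the 32-bit cases, memcpy also sidesteps alignment and strict-aliasing
+// issues.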
+static INLINE __m128i xx_loadl_32(const void *a) {
+ int val;
+ memcpy(&val, a, sizeof(val));
+ return _mm_cvtsi32_si128(val);
+}
+
+static INLINE __m128i xx_loadl_64(const void *a) {
+ return _mm_loadl_epi64((const __m128i *)a);
+}
+
+static INLINE __m128i xx_load_128(const void *a) {
+ return _mm_load_si128((const __m128i *)a);
+}
+
+static INLINE __m128i xx_loadu_128(const void *a) {
+ return _mm_loadu_si128((const __m128i *)a);
+}
+
+static INLINE void xx_storel_32(void *const a, const __m128i v) {
+ const int val = _mm_cvtsi128_si32(v);
+ memcpy(a, &val, sizeof(val));
+}
+
+static INLINE void xx_storel_64(void *const a, const __m128i v) {
+ _mm_storel_epi64((__m128i *)a, v);
+}
+
+static INLINE void xx_store_128(void *const a, const __m128i v) {
+ _mm_store_si128((__m128i *)a, v);
+}
+
+static INLINE void xx_storeu_128(void *const a, const __m128i v) {
+ _mm_storeu_si128((__m128i *)a, v);
+}
+
+// The _mm_set_epi64x() intrinsic is undefined for some Visual Studio
+// compilers. The following function is equivalent to _mm_set_epi64x()
+// acting on 32-bit integers.
+static INLINE __m128i xx_set_64_from_32i(int32_t e1, int32_t e0) {
+#if defined(_MSC_VER) && _MSC_VER < 1900
+ return _mm_set_epi32(0, e1, 0, e0);
+#else
+ return _mm_set_epi64x((uint32_t)e1, (uint32_t)e0);
+#endif
+}
+
+// The _mm_set1_epi64x() intrinsic is undefined for some Visual Studio
+// compilers. The following function is equivalent to _mm_set1_epi64x()
+// acting on a 32-bit integer.
+static INLINE __m128i xx_set1_64_from_32i(int32_t a) {
+#if defined(_MSC_VER) && _MSC_VER < 1900
+ return _mm_set_epi32(0, a, 0, a);
+#else
+ return _mm_set1_epi64x((uint32_t)a);
+#endif
+}
+
+// Fill an SSE register using an interleaved pair of values, i.e. set the
+// 8 channels to {a, b, a, b, a, b, a, b}, using the same channel ordering
+// as when a register is stored to / loaded from memory.
+//
+// This is useful for rearranging filter kernels for use with the _mm_madd_epi16
+// instruction.
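+// For example, with 16-bit pixels p, _mm_madd_epi16(p, xx_set2_epi16(f0, f1))
+// yields f0 * p[2i] + f1 * p[2i + 1] in each 32-bit lane.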
+static INLINE __m128i xx_set2_epi16(int16_t a, int16_t b) {
+ return _mm_setr_epi16(a, b, a, b, a, b, a, b);
+}
+
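+// _mm_avg_epu16(x, 0) computes (x + 1) >> 1, so the two helpers below implement
+// unsigned rounding right-shifts without adding a bias that could overflow
+// 16 bits.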
+static INLINE __m128i xx_round_epu16(__m128i v_val_w) {
+ return _mm_avg_epu16(v_val_w, _mm_setzero_si128());
+}
+
+static INLINE __m128i xx_roundn_epu16(__m128i v_val_w, int bits) {
+ const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1);
+ return _mm_avg_epu16(v_s_w, _mm_setzero_si128());
+}
+
+static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) {
+ const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+ return _mm_srli_epi32(v_tmp_d, bits);
+}
+
+static INLINE __m128i xx_roundn_epi16_unsigned(__m128i v_val_w, int bits) {
+ const __m128i v_bias_w = _mm_set1_epi16((1 << bits) >> 1);
+ const __m128i v_tmp_w = _mm_add_epi16(v_val_w, v_bias_w);
+ return _mm_srai_epi16(v_tmp_w, bits);
+}
+
+// This is equivalent to ROUND_POWER_OF_TWO(v_val_d, bits)
+static INLINE __m128i xx_roundn_epi32_unsigned(__m128i v_val_d, int bits) {
+ const __m128i v_bias_d = _mm_set1_epi32((1 << bits) >> 1);
+ const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+ return _mm_srai_epi32(v_tmp_d, bits);
+}
+
+static INLINE __m128i xx_roundn_epi16(__m128i v_val_w, int bits) {
+ const __m128i v_bias_w = _mm_set1_epi16((1 << bits) >> 1);
+ const __m128i v_sign_w = _mm_srai_epi16(v_val_w, 15);
+ const __m128i v_tmp_w =
+ _mm_add_epi16(_mm_add_epi16(v_val_w, v_bias_w), v_sign_w);
+ return _mm_srai_epi16(v_tmp_w, bits);
+}
+
+#endif // AOM_AOM_DSP_X86_SYNONYMS_H_
diff --git a/third_party/aom/aom_dsp/x86/synonyms_avx2.h b/third_party/aom/aom_dsp/x86/synonyms_avx2.h
new file mode 100644
index 0000000000..b729e5f410
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/synonyms_avx2.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
+#define AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
+
+#include <immintrin.h>
+
+#include "config/aom_config.h"
+
+#include "aom/aom_integer.h"
+
+/**
+ * Various reusable shorthands for x86 SIMD intrinsics.
+ *
+ * Intrinsics prefixed with xx_ operate on or return 128-bit XMM registers.
+ * Intrinsics prefixed with yy_ operate on or return 256-bit YMM registers.
+ */
+
+// Loads and stores to do away with the tedium of casting the address
+// to the right type.
+static INLINE __m256i yy_load_256(const void *a) {
+ return _mm256_load_si256((const __m256i *)a);
+}
+
+static INLINE __m256i yy_loadu_256(const void *a) {
+ return _mm256_loadu_si256((const __m256i *)a);
+}
+
+static INLINE void yy_store_256(void *const a, const __m256i v) {
+ _mm256_store_si256((__m256i *)a, v);
+}
+
+static INLINE void yy_storeu_256(void *const a, const __m256i v) {
+ _mm256_storeu_si256((__m256i *)a, v);
+}
+
+// The _mm256_set1_epi64x() intrinsic is undefined for some Visual Studio
+// compilers. The following function is equivalent to _mm256_set1_epi64x()
+// acting on a 32-bit integer.
+static INLINE __m256i yy_set1_64_from_32i(int32_t a) {
+#if defined(_MSC_VER) && defined(_M_IX86) && _MSC_VER < 1900
+ return _mm256_set_epi32(0, a, 0, a, 0, a, 0, a);
+#else
+ return _mm256_set1_epi64x((uint32_t)a);
+#endif
+}
+
+// Some compilers don't have _mm256_set_m128i defined in immintrin.h. We
+// therefore define an equivalent function using a different intrinsic.
+// ([ hi ], [ lo ]) -> [ hi ][ lo ]
+static INLINE __m256i yy_set_m128i(__m128i hi, __m128i lo) {
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
+}
+
+static INLINE __m256i yy_loadu2_128(const void *hi, const void *lo) {
+ __m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
+ __m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
+ return yy_set_m128i(mhi, mlo);
+}
+
+static INLINE void yy_storeu2_128(void *hi, void *lo, const __m256i a) {
+ _mm_storeu_si128((__m128i *)hi, _mm256_extracti128_si256(a, 1));
+ _mm_storeu_si128((__m128i *)lo, _mm256_castsi256_si128(a));
+}
+
+static INLINE __m256i yy_roundn_epu16(__m256i v_val_w, int bits) {
+ const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1);
+ return _mm256_avg_epu16(v_s_w, _mm256_setzero_si256());
+}
+#endif // AOM_AOM_DSP_X86_SYNONYMS_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/transpose_sse2.h b/third_party/aom/aom_dsp/x86/transpose_sse2.h
new file mode 100644
index 0000000000..9dab750f44
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/transpose_sse2.h
@@ -0,0 +1,424 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
+#define AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
+
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_config.h"
+
+static INLINE __m128i transpose_8bit_4x4(const __m128i *const in) {
+ // Unpack 8 bit elements. Goes from:
+ // in[0]: 00 01 02 03
+ // in[1]: 10 11 12 13
+ // in[2]: 20 21 22 23
+ // in[3]: 30 31 32 33
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+
+ // Unpack 16 bit elements resulting in:
+ // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ return _mm_unpacklo_epi16(a0, a1);
+}
+
+static INLINE void transpose_8bit_8x8(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 8 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+ // in[4]: 40 41 42 43 44 45 46 47
+ // in[5]: 50 51 52 53 54 55 56 57
+ // in[6]: 60 61 62 63 64 65 66 67
+ // in[7]: 70 71 72 73 74 75 76 77
+ // to:
+ // a0: 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ // a1: 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ // a2: 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ // a3: 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]);
+
+ // Unpack 16 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
+ // b1: 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
+ // b2: 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
+ // b3: 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
+ const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
+ const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
+ const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
+ const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
+
+ // Unpack 32 bit elements resulting in:
+ // c0: 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ // c1: 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // c2: 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
+ // c3: 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
+ const __m128i c0 = _mm_unpacklo_epi32(b0, b2);
+ const __m128i c1 = _mm_unpackhi_epi32(b0, b2);
+ const __m128i c2 = _mm_unpacklo_epi32(b1, b3);
+ const __m128i c3 = _mm_unpackhi_epi32(b1, b3);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 02 12 22 32 42 52 62 72
+ // out[3]: 03 13 23 33 43 53 63 73
+ // out[4]: 04 14 24 34 44 54 64 74
+ // out[5]: 05 15 25 35 45 55 65 75
+ // out[6]: 06 16 26 36 46 56 66 76
+ // out[7]: 07 17 27 37 47 57 67 77
+ out[0] = _mm_unpacklo_epi64(c0, c0);
+ out[1] = _mm_unpackhi_epi64(c0, c0);
+ out[2] = _mm_unpacklo_epi64(c1, c1);
+ out[3] = _mm_unpackhi_epi64(c1, c1);
+ out[4] = _mm_unpacklo_epi64(c2, c2);
+ out[5] = _mm_unpackhi_epi64(c2, c2);
+ out[6] = _mm_unpacklo_epi64(c3, c3);
+ out[7] = _mm_unpackhi_epi64(c3, c3);
+}
+
+static INLINE void transpose_16bit_4x4(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 XX XX XX XX
+ // in[1]: 10 11 12 13 XX XX XX XX
+ // in[2]: 20 21 22 23 XX XX XX XX
+ // in[3]: 30 31 32 33 XX XX XX XX
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+
+ // Unpack 32 bit elements resulting in:
+ // out[0]: 00 10 20 30 01 11 21 31
+ // out[1]: 01 11 21 31 __ __ __ __
+ // out[2]: 02 12 22 32 03 13 23 33
+ // out[3]: 03 13 23 33 __ __ __ __
+ //
+ // Note: The high 64 bits of the output registers are shown for informational
+ // purposes only. Callers should only use the low 64 bits of the output
+ // registers. "__" indicates zeros.
+ out[0] = _mm_unpacklo_epi32(a0, a1);
+ out[1] = _mm_srli_si128(out[0], 8);
+ out[2] = _mm_unpackhi_epi32(a0, a1);
+ out[3] = _mm_srli_si128(out[2], 8);
+}
+
+static INLINE void transpose_16bit_4x8(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 XX XX XX XX
+ // in[1]: 10 11 12 13 XX XX XX XX
+ // in[2]: 20 21 22 23 XX XX XX XX
+ // in[3]: 30 31 32 33 XX XX XX XX
+ // in[4]: 40 41 42 43 XX XX XX XX
+ // in[5]: 50 51 52 53 XX XX XX XX
+ // in[6]: 60 61 62 63 XX XX XX XX
+ // in[7]: 70 71 72 73 XX XX XX XX
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ // a2: 40 50 41 51 42 52 43 53
+ // a3: 60 70 61 71 62 72 63 73
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
+
+ // Unpack 32 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31
+ // b1: 40 50 60 70 41 51 61 71
+ // b2: 02 12 22 32 03 13 23 33
+ // b3: 42 52 62 72 43 53 63 73
+ const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+ const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+ const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
+ const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 02 12 22 32 42 52 62 72
+ // out[3]: 03 13 23 33 43 53 63 73
+ out[0] = _mm_unpacklo_epi64(b0, b1);
+ out[1] = _mm_unpackhi_epi64(b0, b1);
+ out[2] = _mm_unpacklo_epi64(b2, b3);
+ out[3] = _mm_unpackhi_epi64(b2, b3);
+}
+
+static INLINE void transpose_16bit_8x4(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ // a4: 04 14 05 15 06 16 07 17
+ // a5: 24 34 25 35 26 36 27 37
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
+
+ // Unpack 32 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31
+ // b2: 04 14 24 34 05 15 25 35
+ // b4: 02 12 22 32 03 13 23 33
+ // b6: 06 16 26 36 07 17 27 37
+ const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+ const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
+ const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
+ const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 XX XX XX XX
+ // out[1]: 01 11 21 31 XX XX XX XX
+ // out[2]: 02 12 22 32 XX XX XX XX
+ // out[3]: 03 13 23 33 XX XX XX XX
+ // out[4]: 04 14 24 34 XX XX XX XX
+ // out[5]: 05 15 25 35 XX XX XX XX
+ // out[6]: 06 16 26 36 XX XX XX XX
+ // out[7]: 07 17 27 37 XX XX XX XX
+ const __m128i zeros = _mm_setzero_si128();
+ out[0] = _mm_unpacklo_epi64(b0, zeros);
+ out[1] = _mm_unpackhi_epi64(b0, zeros);
+ out[2] = _mm_unpacklo_epi64(b4, zeros);
+ out[3] = _mm_unpackhi_epi64(b4, zeros);
+ out[4] = _mm_unpacklo_epi64(b2, zeros);
+ out[5] = _mm_unpackhi_epi64(b2, zeros);
+ out[6] = _mm_unpacklo_epi64(b6, zeros);
+ out[7] = _mm_unpackhi_epi64(b6, zeros);
+}
+
+static INLINE void transpose_16bit_8x8(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+ // in[4]: 40 41 42 43 44 45 46 47
+ // in[5]: 50 51 52 53 54 55 56 57
+ // in[6]: 60 61 62 63 64 65 66 67
+ // in[7]: 70 71 72 73 74 75 76 77
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ // a2: 40 50 41 51 42 52 43 53
+ // a3: 60 70 61 71 62 72 63 73
+ // a4: 04 14 05 15 06 16 07 17
+ // a5: 24 34 25 35 26 36 27 37
+ // a6: 44 54 45 55 46 56 47 57
+ // a7: 64 74 65 75 66 76 67 77
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
+ const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
+ const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
+ const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);
+
+ // Unpack 32 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31
+ // b1: 40 50 60 70 41 51 61 71
+ // b2: 04 14 24 34 05 15 25 35
+ // b3: 44 54 64 74 45 55 65 75
+ // b4: 02 12 22 32 03 13 23 33
+ // b5: 42 52 62 72 43 53 63 73
+ // b6: 06 16 26 36 07 17 27 37
+ // b7: 46 56 66 76 47 57 67 77
+ const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+ const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
+ const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
+ const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
+ const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
+ const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
+ const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
+ const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 40 50 60 70
+ // out[1]: 01 11 21 31 41 51 61 71
+ // out[2]: 02 12 22 32 42 52 62 72
+ // out[3]: 03 13 23 33 43 53 63 73
+ // out[4]: 04 14 24 34 44 54 64 74
+ // out[5]: 05 15 25 35 45 55 65 75
+ // out[6]: 06 16 26 36 46 56 66 76
+ // out[7]: 07 17 27 37 47 57 67 77
+ out[0] = _mm_unpacklo_epi64(b0, b1);
+ out[1] = _mm_unpackhi_epi64(b0, b1);
+ out[2] = _mm_unpacklo_epi64(b4, b5);
+ out[3] = _mm_unpackhi_epi64(b4, b5);
+ out[4] = _mm_unpacklo_epi64(b2, b3);
+ out[5] = _mm_unpackhi_epi64(b2, b3);
+ out[6] = _mm_unpacklo_epi64(b6, b7);
+ out[7] = _mm_unpackhi_epi64(b6, b7);
+}
+
+// Transpose in-place
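+// left[i] holds row i, columns 0-7 and right[i] holds row i, columns 8-15 of
+// the 16x16 block; the transposed block is written back in the same layout.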
+static INLINE void transpose_16bit_16x16(__m128i *const left,
+ __m128i *const right) {
+ __m128i tbuf[8];
+ transpose_16bit_8x8(left, left);
+ transpose_16bit_8x8(right, tbuf);
+ transpose_16bit_8x8(left + 8, right);
+ transpose_16bit_8x8(right + 8, right + 8);
+
+ left[8] = tbuf[0];
+ left[9] = tbuf[1];
+ left[10] = tbuf[2];
+ left[11] = tbuf[3];
+ left[12] = tbuf[4];
+ left[13] = tbuf[5];
+ left[14] = tbuf[6];
+ left[15] = tbuf[7];
+}
+
+static INLINE void transpose_32bit_4x4(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 32 bit elements. Goes from:
+ // in[0]: 00 01 02 03
+ // in[1]: 10 11 12 13
+ // in[2]: 20 21 22 23
+ // in[3]: 30 31 32 33
+ // to:
+ // a0: 00 10 01 11
+ // a1: 20 30 21 31
+ // a2: 02 12 03 13
+ // a3: 22 32 23 33
+
+ const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
+ const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
+ const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30
+ // out[1]: 01 11 21 31
+ // out[2]: 02 12 22 32
+ // out[3]: 03 13 23 33
+ out[0] = _mm_unpacklo_epi64(a0, a1);
+ out[1] = _mm_unpackhi_epi64(a0, a1);
+ out[2] = _mm_unpacklo_epi64(a2, a3);
+ out[3] = _mm_unpackhi_epi64(a2, a3);
+}
+
+static INLINE void transpose_32bit_4x4x2(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 32 bit elements. Goes from:
+ // in[0]: 00 01 02 03
+ // in[1]: 10 11 12 13
+ // in[2]: 20 21 22 23
+ // in[3]: 30 31 32 33
+ // in[4]: 04 05 06 07
+ // in[5]: 14 15 16 17
+ // in[6]: 24 25 26 27
+ // in[7]: 34 35 36 37
+ // to:
+ // a0: 00 10 01 11
+ // a1: 20 30 21 31
+ // a2: 02 12 03 13
+ // a3: 22 32 23 33
+ // a4: 04 14 05 15
+ // a5: 24 34 25 35
+ // a6: 06 16 07 17
+ // a7: 26 36 27 37
+ const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
+ const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
+ const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
+ const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]);
+ const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]);
+ const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]);
+ const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30
+ // out[1]: 01 11 21 31
+ // out[2]: 02 12 22 32
+ // out[3]: 03 13 23 33
+ // out[4]: 04 14 24 34
+ // out[5]: 05 15 25 35
+ // out[6]: 06 16 26 36
+ // out[7]: 07 17 27 37
+ out[0] = _mm_unpacklo_epi64(a0, a1);
+ out[1] = _mm_unpackhi_epi64(a0, a1);
+ out[2] = _mm_unpacklo_epi64(a2, a3);
+ out[3] = _mm_unpackhi_epi64(a2, a3);
+ out[4] = _mm_unpacklo_epi64(a4, a5);
+ out[5] = _mm_unpackhi_epi64(a4, a5);
+ out[6] = _mm_unpacklo_epi64(a6, a7);
+ out[7] = _mm_unpackhi_epi64(a6, a7);
+}
+
+static INLINE void transpose_32bit_8x4(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 32 bit elements. Goes from:
+ // in[0]: 00 01 02 03
+ // in[1]: 04 05 06 07
+ // in[2]: 10 11 12 13
+ // in[3]: 14 15 16 17
+ // in[4]: 20 21 22 23
+ // in[5]: 24 25 26 27
+ // in[6]: 30 31 32 33
+ // in[7]: 34 35 36 37
+ // to:
+ // a0: 00 10 01 11
+ // a1: 20 30 21 31
+ // a2: 02 12 03 13
+ // a3: 22 32 23 33
+ // a4: 04 14 05 15
+ // a5: 24 34 25 35
+ // a6: 06 16 07 17
+ // a7: 26 36 27 37
+ const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]);
+ const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]);
+ const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]);
+ const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]);
+ const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]);
+ const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]);
+ const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]);
+ const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30
+ // out[1]: 01 11 21 31
+ // out[2]: 02 12 22 32
+ // out[3]: 03 13 23 33
+ // out[4]: 04 14 24 34
+ // out[5]: 05 15 25 35
+ // out[6]: 06 16 26 36
+ // out[7]: 07 17 27 37
+ out[0] = _mm_unpacklo_epi64(a0, a1);
+ out[1] = _mm_unpackhi_epi64(a0, a1);
+ out[2] = _mm_unpacklo_epi64(a2, a3);
+ out[3] = _mm_unpackhi_epi64(a2, a3);
+ out[4] = _mm_unpacklo_epi64(a4, a5);
+ out[5] = _mm_unpackhi_epi64(a4, a5);
+ out[6] = _mm_unpacklo_epi64(a6, a7);
+ out[7] = _mm_unpackhi_epi64(a6, a7);
+}
+
+#endif // AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_avx2.h b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
new file mode 100644
index 0000000000..4105250bc0
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/txfm_common_avx2.h
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_
+#define AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_
+
+#include <emmintrin.h>
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
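+// Broadcast the 16-bit pair (a, b) so the epi16 lanes of the result alternate
+// a, b, matching the in-memory pair ordering expected by _mm256_madd_epi16.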
+static INLINE __m256i pair_set_w16_epi16(int16_t a, int16_t b) {
+ return _mm256_set1_epi32(
+ (int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)));
+}
+
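+// Butterfly on 16 lanes: with w0 = pair_set_w16_epi16(a, b) and
+// w1 = pair_set_w16_epi16(c, d), each lane pair (x, y) taken from (*in0, *in1)
+// becomes *in0 = (a*x + b*y + _r) >> cos_bit and *in1 = (c*x + d*y + _r) >>
+// cos_bit, packed back to 16 bits with saturation.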
+static INLINE void btf_16_w16_avx2(const __m256i w0, const __m256i w1,
+ __m256i *in0, __m256i *in1, const __m256i _r,
+ const int32_t cos_bit) {
+ __m256i t0 = _mm256_unpacklo_epi16(*in0, *in1);
+ __m256i t1 = _mm256_unpackhi_epi16(*in0, *in1);
+ __m256i u0 = _mm256_madd_epi16(t0, w0);
+ __m256i u1 = _mm256_madd_epi16(t1, w0);
+ __m256i v0 = _mm256_madd_epi16(t0, w1);
+ __m256i v1 = _mm256_madd_epi16(t1, w1);
+
+ __m256i a0 = _mm256_add_epi32(u0, _r);
+ __m256i a1 = _mm256_add_epi32(u1, _r);
+ __m256i b0 = _mm256_add_epi32(v0, _r);
+ __m256i b1 = _mm256_add_epi32(v1, _r);
+
+ __m256i c0 = _mm256_srai_epi32(a0, cos_bit);
+ __m256i c1 = _mm256_srai_epi32(a1, cos_bit);
+ __m256i d0 = _mm256_srai_epi32(b0, cos_bit);
+ __m256i d1 = _mm256_srai_epi32(b1, cos_bit);
+
+ *in0 = _mm256_packs_epi32(c0, c1);
+ *in1 = _mm256_packs_epi32(d0, d1);
+}
+
+static INLINE void btf_16_adds_subs_avx2(__m256i *in0, __m256i *in1) {
+ const __m256i _in0 = *in0;
+ const __m256i _in1 = *in1;
+ *in0 = _mm256_adds_epi16(_in0, _in1);
+ *in1 = _mm256_subs_epi16(_in0, _in1);
+}
+
+static INLINE void btf_32_add_sub_avx2(__m256i *in0, __m256i *in1) {
+ const __m256i _in0 = *in0;
+ const __m256i _in1 = *in1;
+ *in0 = _mm256_add_epi32(_in0, _in1);
+ *in1 = _mm256_sub_epi32(_in0, _in1);
+}
+
+static INLINE void btf_16_adds_subs_out_avx2(__m256i *out0, __m256i *out1,
+ __m256i in0, __m256i in1) {
+ const __m256i _in0 = in0;
+ const __m256i _in1 = in1;
+ *out0 = _mm256_adds_epi16(_in0, _in1);
+ *out1 = _mm256_subs_epi16(_in0, _in1);
+}
+
+static INLINE void btf_32_add_sub_out_avx2(__m256i *out0, __m256i *out1,
+ __m256i in0, __m256i in1) {
+ const __m256i _in0 = in0;
+ const __m256i _in1 = in1;
+ *out0 = _mm256_add_epi32(_in0, _in1);
+ *out1 = _mm256_sub_epi32(_in0, _in1);
+}
+
+static INLINE __m256i load_16bit_to_16bit_avx2(const int16_t *a) {
+ return _mm256_load_si256((const __m256i *)a);
+}
+
+static INLINE void load_buffer_16bit_to_16bit_avx2(const int16_t *in,
+ int stride, __m256i *out,
+ int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = load_16bit_to_16bit_avx2(in + i * stride);
+ }
+}
+
+static INLINE void load_buffer_16bit_to_16bit_flip_avx2(const int16_t *in,
+ int stride,
+ __m256i *out,
+ int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[out_size - i - 1] = load_16bit_to_16bit_avx2(in + i * stride);
+ }
+}
+
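+// Narrow 16 32-bit coefficients to 16 bits with saturation. _mm256_packs_epi32
+// interleaves within 128-bit lanes, so the 0xD8 permute restores sequential
+// element order.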
+static INLINE __m256i load_32bit_to_16bit_w16_avx2(const int32_t *a) {
+ const __m256i a_low = _mm256_lddqu_si256((const __m256i *)a);
+ const __m256i b = _mm256_packs_epi32(a_low, *(const __m256i *)(a + 8));
+ return _mm256_permute4x64_epi64(b, 0xD8);
+}
+
+static INLINE void load_buffer_32bit_to_16bit_w16_avx2(const int32_t *in,
+ int stride, __m256i *out,
+ int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = load_32bit_to_16bit_w16_avx2(in + i * stride);
+ }
+}
+
+static INLINE void transpose2_8x8_avx2(const __m256i *const in,
+ __m256i *const out) {
+ __m256i t[16], u[16];
+ // (1st, 2nd) ==> (lo, hi)
+ // (0, 1) ==> (0, 1)
+ // (2, 3) ==> (2, 3)
+ // (4, 5) ==> (4, 5)
+ // (6, 7) ==> (6, 7)
+ for (int i = 0; i < 4; i++) {
+ t[2 * i] = _mm256_unpacklo_epi16(in[2 * i], in[2 * i + 1]);
+ t[2 * i + 1] = _mm256_unpackhi_epi16(in[2 * i], in[2 * i + 1]);
+ }
+
+ // (1st, 2nd) ==> (lo, hi)
+ // (0, 2) ==> (0, 2)
+ // (1, 3) ==> (1, 3)
+ // (4, 6) ==> (4, 6)
+ // (5, 7) ==> (5, 7)
+ for (int i = 0; i < 2; i++) {
+ u[i] = _mm256_unpacklo_epi32(t[i], t[i + 2]);
+ u[i + 2] = _mm256_unpackhi_epi32(t[i], t[i + 2]);
+
+ u[i + 4] = _mm256_unpacklo_epi32(t[i + 4], t[i + 6]);
+ u[i + 6] = _mm256_unpackhi_epi32(t[i + 4], t[i + 6]);
+ }
+
+ // (1st, 2nd) ==> (lo, hi)
+ // (0, 4) ==> (0, 1)
+ // (1, 5) ==> (4, 5)
+ // (2, 6) ==> (2, 3)
+ // (3, 7) ==> (6, 7)
+ for (int i = 0; i < 2; i++) {
+ out[2 * i] = _mm256_unpacklo_epi64(u[2 * i], u[2 * i + 4]);
+ out[2 * i + 1] = _mm256_unpackhi_epi64(u[2 * i], u[2 * i + 4]);
+
+ out[2 * i + 4] = _mm256_unpacklo_epi64(u[2 * i + 1], u[2 * i + 5]);
+ out[2 * i + 5] = _mm256_unpackhi_epi64(u[2 * i + 1], u[2 * i + 5]);
+ }
+}
+
+static INLINE void transpose_16bit_16x16_avx2(const __m256i *const in,
+ __m256i *const out) {
+ __m256i t[16];
+
+#define LOADL(idx) \
+ t[idx] = _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx])); \
+ t[idx] = _mm256_inserti128_si256( \
+ t[idx], _mm_load_si128((__m128i const *)&in[idx + 8]), 1);
+
+#define LOADR(idx) \
+ t[8 + idx] = \
+ _mm256_castsi128_si256(_mm_load_si128((__m128i const *)&in[idx] + 1)); \
+ t[8 + idx] = _mm256_inserti128_si256( \
+ t[8 + idx], _mm_load_si128((__m128i const *)&in[idx + 8] + 1), 1);
+
+ // load left 8x16
+ LOADL(0)
+ LOADL(1)
+ LOADL(2)
+ LOADL(3)
+ LOADL(4)
+ LOADL(5)
+ LOADL(6)
+ LOADL(7)
+
+ // load right 8x16
+ LOADR(0)
+ LOADR(1)
+ LOADR(2)
+ LOADR(3)
+ LOADR(4)
+ LOADR(5)
+ LOADR(6)
+ LOADR(7)
+
+ // get the top 16x8 result
+ transpose2_8x8_avx2(t, out);
+ // get the bottom 16x8 result
+ transpose2_8x8_avx2(&t[8], &out[8]);
+}
+
+static INLINE void transpose_16bit_16x8_avx2(const __m256i *const in,
+ __m256i *const out) {
+ const __m256i a0 = _mm256_unpacklo_epi16(in[0], in[1]);
+ const __m256i a1 = _mm256_unpacklo_epi16(in[2], in[3]);
+ const __m256i a2 = _mm256_unpacklo_epi16(in[4], in[5]);
+ const __m256i a3 = _mm256_unpacklo_epi16(in[6], in[7]);
+ const __m256i a4 = _mm256_unpackhi_epi16(in[0], in[1]);
+ const __m256i a5 = _mm256_unpackhi_epi16(in[2], in[3]);
+ const __m256i a6 = _mm256_unpackhi_epi16(in[4], in[5]);
+ const __m256i a7 = _mm256_unpackhi_epi16(in[6], in[7]);
+
+ const __m256i b0 = _mm256_unpacklo_epi32(a0, a1);
+ const __m256i b1 = _mm256_unpacklo_epi32(a2, a3);
+ const __m256i b2 = _mm256_unpacklo_epi32(a4, a5);
+ const __m256i b3 = _mm256_unpacklo_epi32(a6, a7);
+ const __m256i b4 = _mm256_unpackhi_epi32(a0, a1);
+ const __m256i b5 = _mm256_unpackhi_epi32(a2, a3);
+ const __m256i b6 = _mm256_unpackhi_epi32(a4, a5);
+ const __m256i b7 = _mm256_unpackhi_epi32(a6, a7);
+
+ out[0] = _mm256_unpacklo_epi64(b0, b1);
+ out[1] = _mm256_unpackhi_epi64(b0, b1);
+ out[2] = _mm256_unpacklo_epi64(b4, b5);
+ out[3] = _mm256_unpackhi_epi64(b4, b5);
+ out[4] = _mm256_unpacklo_epi64(b2, b3);
+ out[5] = _mm256_unpackhi_epi64(b2, b3);
+ out[6] = _mm256_unpacklo_epi64(b6, b7);
+ out[7] = _mm256_unpackhi_epi64(b6, b7);
+}
+
+static INLINE void flip_buf_avx2(__m256i *in, __m256i *out, int size) {
+ for (int i = 0; i < size; ++i) {
+ out[size - i - 1] = in[i];
+ }
+}
+
+static INLINE void round_shift_16bit_w16_avx2(__m256i *in, int size, int bit) {
+ if (bit < 0) {
+ bit = -bit;
+ __m256i round = _mm256_set1_epi16(1 << (bit - 1));
+ for (int i = 0; i < size; ++i) {
+ in[i] = _mm256_adds_epi16(in[i], round);
+ in[i] = _mm256_srai_epi16(in[i], bit);
+ }
+ } else if (bit > 0) {
+ for (int i = 0; i < size; ++i) {
+ in[i] = _mm256_slli_epi16(in[i], bit);
+ }
+ }
+}
+
+static INLINE __m256i round_shift_32_avx2(__m256i vec, int bit) {
+ __m256i tmp, round;
+ round = _mm256_set1_epi32(1 << (bit - 1));
+ tmp = _mm256_add_epi32(vec, round);
+ return _mm256_srai_epi32(tmp, bit);
+}
+
+static INLINE void round_shift_array_32_avx2(__m256i *input, __m256i *output,
+ const int size, const int bit) {
+ if (bit > 0) {
+ int i;
+ for (i = 0; i < size; i++) {
+ output[i] = round_shift_32_avx2(input[i], bit);
+ }
+ } else {
+ int i;
+ for (i = 0; i < size; i++) {
+ output[i] = _mm256_slli_epi32(input[i], -bit);
+ }
+ }
+}
+
+static INLINE void round_shift_rect_array_32_avx2(__m256i *input,
+ __m256i *output,
+ const int size, const int bit,
+ const int val) {
+ const __m256i sqrt2 = _mm256_set1_epi32(val);
+ if (bit > 0) {
+ int i;
+ for (i = 0; i < size; i++) {
+ const __m256i r0 = round_shift_32_avx2(input[i], bit);
+ const __m256i r1 = _mm256_mullo_epi32(sqrt2, r0);
+ output[i] = round_shift_32_avx2(r1, NewSqrt2Bits);
+ }
+ } else {
+ int i;
+ for (i = 0; i < size; i++) {
+ const __m256i r0 = _mm256_slli_epi32(input[i], -bit);
+ const __m256i r1 = _mm256_mullo_epi32(sqrt2, r0);
+ output[i] = round_shift_32_avx2(r1, NewSqrt2Bits);
+ }
+ }
+}
+
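+// scale_round_avx2 expects its input to interleave 16-bit values with the
+// constant 1; _mm256_madd_epi16 against (scale, 1 << (NewSqrt2Bits - 1)) pairs
+// then yields value * scale + rounding, which is shifted down by NewSqrt2Bits.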
+static INLINE __m256i scale_round_avx2(const __m256i a, const int scale) {
+ const __m256i scale_rounding =
+ pair_set_w16_epi16(scale, 1 << (NewSqrt2Bits - 1));
+ const __m256i b = _mm256_madd_epi16(a, scale_rounding);
+ return _mm256_srai_epi32(b, NewSqrt2Bits);
+}
+
+static INLINE void store_rect_16bit_to_32bit_w8_avx2(const __m256i a,
+ int32_t *const b) {
+ const __m256i one = _mm256_set1_epi16(1);
+ const __m256i a_lo = _mm256_unpacklo_epi16(a, one);
+ const __m256i a_hi = _mm256_unpackhi_epi16(a, one);
+ const __m256i b_lo = scale_round_avx2(a_lo, NewSqrt2);
+ const __m256i b_hi = scale_round_avx2(a_hi, NewSqrt2);
+ const __m256i temp = _mm256_permute2f128_si256(b_lo, b_hi, 0x31);
+ _mm_store_si128((__m128i *)b, _mm256_castsi256_si128(b_lo));
+ _mm_store_si128((__m128i *)(b + 4), _mm256_castsi256_si128(b_hi));
+ _mm256_store_si256((__m256i *)(b + 64), temp);
+}
+
+static INLINE void store_rect_buffer_16bit_to_32bit_w8_avx2(
+ const __m256i *const in, int32_t *const out, const int stride,
+ const int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ store_rect_16bit_to_32bit_w8_avx2(in[i], out + i * stride);
+ }
+}
+
+static INLINE void pack_reg(const __m128i *in1, const __m128i *in2,
+ __m256i *out) {
+ out[0] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[0]), in2[0], 0x1);
+ out[1] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[1]), in2[1], 0x1);
+ out[2] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[2]), in2[2], 0x1);
+ out[3] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[3]), in2[3], 0x1);
+ out[4] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[4]), in2[4], 0x1);
+ out[5] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[5]), in2[5], 0x1);
+ out[6] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[6]), in2[6], 0x1);
+ out[7] = _mm256_insertf128_si256(_mm256_castsi128_si256(in1[7]), in2[7], 0x1);
+}
+
+static INLINE void extract_reg(const __m256i *in, __m128i *out1) {
+ out1[0] = _mm256_castsi256_si128(in[0]);
+ out1[1] = _mm256_castsi256_si128(in[1]);
+ out1[2] = _mm256_castsi256_si128(in[2]);
+ out1[3] = _mm256_castsi256_si128(in[3]);
+ out1[4] = _mm256_castsi256_si128(in[4]);
+ out1[5] = _mm256_castsi256_si128(in[5]);
+ out1[6] = _mm256_castsi256_si128(in[6]);
+ out1[7] = _mm256_castsi256_si128(in[7]);
+
+ out1[8] = _mm256_extracti128_si256(in[0], 0x01);
+ out1[9] = _mm256_extracti128_si256(in[1], 0x01);
+ out1[10] = _mm256_extracti128_si256(in[2], 0x01);
+ out1[11] = _mm256_extracti128_si256(in[3], 0x01);
+ out1[12] = _mm256_extracti128_si256(in[4], 0x01);
+ out1[13] = _mm256_extracti128_si256(in[5], 0x01);
+ out1[14] = _mm256_extracti128_si256(in[6], 0x01);
+ out1[15] = _mm256_extracti128_si256(in[7], 0x01);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // AOM_AOM_DSP_X86_TXFM_COMMON_AVX2_H_
diff --git a/third_party/aom/aom_dsp/x86/txfm_common_sse2.h b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
new file mode 100644
index 0000000000..9c99eb93bd
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/txfm_common_sse2.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#ifndef AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_
+#define AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_
+
+#include <emmintrin.h>
+#include "aom/aom_integer.h"
+#include "aom_dsp/x86/synonyms.h"
+
+#define pair_set_epi16(a, b) \
+ _mm_set1_epi32((int32_t)(((uint16_t)(a)) | (((uint32_t)(b)) << 16)))
+
+// Reverse the eight 16-bit words in a __m128i
+static INLINE __m128i mm_reverse_epi16(const __m128i x) {
+ const __m128i a = _mm_shufflelo_epi16(x, 0x1b);
+ const __m128i b = _mm_shufflehi_epi16(a, 0x1b);
+ return _mm_shuffle_epi32(b, 0x4e);
+}
+
+#define octa_set_epi16(a, b, c, d, e, f, g, h) \
+ _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \
+ (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h))
+
+#endif // AOM_AOM_DSP_X86_TXFM_COMMON_SSE2_H_
diff --git a/third_party/aom/aom_dsp/x86/variance_avx2.c b/third_party/aom/aom_dsp/x86/variance_avx2.c
new file mode 100644
index 0000000000..046d6f10f8
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/variance_avx2.c
@@ -0,0 +1,961 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h>
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/masked_variance_intrin_ssse3.h"
+#include "aom_dsp/x86/synonyms.h"
+
+static INLINE __m128i mm256_add_hi_lo_epi16(const __m256i val) {
+ return _mm_add_epi16(_mm256_castsi256_si128(val),
+ _mm256_extractf128_si256(val, 1));
+}
+
+static INLINE __m128i mm256_add_hi_lo_epi32(const __m256i val) {
+ return _mm_add_epi32(_mm256_castsi256_si128(val),
+ _mm256_extractf128_si256(val, 1));
+}
+
+static INLINE void variance_kernel_avx2(const __m256i src, const __m256i ref,
+ __m256i *const sse,
+ __m256i *const sum) {
+ const __m256i adj_sub = _mm256_set1_epi16((short)0xff01); // (1,-1)
+
+ // unpack into pairs of source and reference values
+ const __m256i src_ref0 = _mm256_unpacklo_epi8(src, ref);
+ const __m256i src_ref1 = _mm256_unpackhi_epi8(src, ref);
+
+ // subtract adjacent elements using src*1 + ref*-1
+ const __m256i diff0 = _mm256_maddubs_epi16(src_ref0, adj_sub);
+ const __m256i diff1 = _mm256_maddubs_epi16(src_ref1, adj_sub);
+ const __m256i madd0 = _mm256_madd_epi16(diff0, diff0);
+ const __m256i madd1 = _mm256_madd_epi16(diff1, diff1);
+
+ // add to the running totals
+ *sum = _mm256_add_epi16(*sum, _mm256_add_epi16(diff0, diff1));
+ *sse = _mm256_add_epi32(*sse, _mm256_add_epi32(madd0, madd1));
+}
+
+static INLINE int variance_final_from_32bit_sum_avx2(__m256i vsse, __m128i vsum,
+ unsigned int *const sse) {
+ // extract the low lane and add it to the high lane
+ const __m128i sse_reg_128 = mm256_add_hi_lo_epi32(vsse);
+
+ // unpack sse and sum registers and add
+ const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum);
+ const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum);
+ const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi);
+
+ // perform the final summation and extract the results
+ const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8));
+ *((int *)sse) = _mm_cvtsi128_si32(res);
+ return _mm_extract_epi32(res, 1);
+}
+
+// handle blocks with at most 512 pixels
+static INLINE int variance_final_512_avx2(__m256i vsse, __m256i vsum,
+ unsigned int *const sse) {
+ // extract the low lane and add it to the high lane
+ const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum);
+ const __m128i vsum_64 = _mm_add_epi16(vsum_128, _mm_srli_si128(vsum_128, 8));
+ const __m128i sum_int32 = _mm_cvtepi16_epi32(vsum_64);
+ return variance_final_from_32bit_sum_avx2(vsse, sum_int32, sse);
+}
+
+// handle 1024 pixels (32x32, 16x64, 64x16)
+static INLINE int variance_final_1024_avx2(__m256i vsse, __m256i vsum,
+ unsigned int *const sse) {
+ // extract the low lane and add it to the high lane
+ const __m128i vsum_128 = mm256_add_hi_lo_epi16(vsum);
+ const __m128i vsum_64 =
+ _mm_add_epi32(_mm_cvtepi16_epi32(vsum_128),
+ _mm_cvtepi16_epi32(_mm_srli_si128(vsum_128, 8)));
+ return variance_final_from_32bit_sum_avx2(vsse, vsum_64, sse);
+}
+
+static INLINE __m256i sum_to_32bit_avx2(const __m256i sum) {
+ const __m256i sum_lo = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum));
+ const __m256i sum_hi =
+ _mm256_cvtepi16_epi32(_mm256_extractf128_si256(sum, 1));
+ return _mm256_add_epi32(sum_lo, sum_hi);
+}
+
+// handle 2048 pixels (32x64, 64x32)
+static INLINE int variance_final_2048_avx2(__m256i vsse, __m256i vsum,
+ unsigned int *const sse) {
+ vsum = sum_to_32bit_avx2(vsum);
+ const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum);
+ return variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse);
+}
+
+static INLINE void variance16_kernel_avx2(
+ const uint8_t *const src, const int src_stride, const uint8_t *const ref,
+ const int ref_stride, __m256i *const sse, __m256i *const sum) {
+ const __m128i s0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
+ const __m128i s1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
+ const __m128i r0 = _mm_loadu_si128((__m128i const *)(ref + 0 * ref_stride));
+ const __m128i r1 = _mm_loadu_si128((__m128i const *)(ref + 1 * ref_stride));
+ const __m256i s = _mm256_inserti128_si256(_mm256_castsi128_si256(s0), s1, 1);
+ const __m256i r = _mm256_inserti128_si256(_mm256_castsi128_si256(r0), r1, 1);
+ variance_kernel_avx2(s, r, sse, sum);
+}
+
+static INLINE void variance32_kernel_avx2(const uint8_t *const src,
+ const uint8_t *const ref,
+ __m256i *const sse,
+ __m256i *const sum) {
+ const __m256i s = _mm256_loadu_si256((__m256i const *)(src));
+ const __m256i r = _mm256_loadu_si256((__m256i const *)(ref));
+ variance_kernel_avx2(s, r, sse, sum);
+}
+
+static INLINE void variance16_avx2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m256i *const vsse,
+ __m256i *const vsum) {
+ *vsum = _mm256_setzero_si256();
+
+ for (int i = 0; i < h; i += 2) {
+ variance16_kernel_avx2(src, src_stride, ref, ref_stride, vsse, vsum);
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ }
+}
+
+static INLINE void variance32_avx2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m256i *const vsse,
+ __m256i *const vsum) {
+ *vsum = _mm256_setzero_si256();
+
+ for (int i = 0; i < h; i++) {
+ variance32_kernel_avx2(src, ref, vsse, vsum);
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+static INLINE void variance64_avx2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m256i *const vsse,
+ __m256i *const vsum) {
+ *vsum = _mm256_setzero_si256();
+
+ for (int i = 0; i < h; i++) {
+ variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum);
+ variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum);
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+static INLINE void variance128_avx2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m256i *const vsse,
+ __m256i *const vsum) {
+ *vsum = _mm256_setzero_si256();
+
+ for (int i = 0; i < h; i++) {
+ variance32_kernel_avx2(src + 0, ref + 0, vsse, vsum);
+ variance32_kernel_avx2(src + 32, ref + 32, vsse, vsum);
+ variance32_kernel_avx2(src + 64, ref + 64, vsse, vsum);
+ variance32_kernel_avx2(src + 96, ref + 96, vsse, vsum);
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
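+// Defines aom_variance<bw>x<bh>_avx2: accumulates SSE and SUM over the block in
+// one pass and returns SSE - SUM^2 / (bw * bh), where bits = log2(bw * bh) and
+// max_pixel selects the final reduction routine.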
+#define AOM_VAR_NO_LOOP_AVX2(bw, bh, bits, max_pixel) \
+ unsigned int aom_variance##bw##x##bh##_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ __m256i vsse = _mm256_setzero_si256(); \
+ __m256i vsum; \
+ variance##bw##_avx2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum); \
+ const int sum = variance_final_##max_pixel##_avx2(vsse, vsum, sse); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \
+ }
+
+AOM_VAR_NO_LOOP_AVX2(16, 8, 7, 512)
+AOM_VAR_NO_LOOP_AVX2(16, 16, 8, 512)
+AOM_VAR_NO_LOOP_AVX2(16, 32, 9, 512)
+
+AOM_VAR_NO_LOOP_AVX2(32, 16, 9, 512)
+AOM_VAR_NO_LOOP_AVX2(32, 32, 10, 1024)
+AOM_VAR_NO_LOOP_AVX2(32, 64, 11, 2048)
+
+AOM_VAR_NO_LOOP_AVX2(64, 32, 11, 2048)
+
+#if !CONFIG_REALTIME_ONLY
+AOM_VAR_NO_LOOP_AVX2(64, 16, 10, 1024)
+AOM_VAR_NO_LOOP_AVX2(32, 8, 8, 512)
+AOM_VAR_NO_LOOP_AVX2(16, 64, 10, 1024)
+AOM_VAR_NO_LOOP_AVX2(16, 4, 6, 512)
+#endif
+
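+// For taller blocks the 16-bit row sums are widened to 32 bits every uh rows so
+// the accumulator cannot overflow before the final reduction.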
+#define AOM_VAR_LOOP_AVX2(bw, bh, bits, uh) \
+ unsigned int aom_variance##bw##x##bh##_avx2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ __m256i vsse = _mm256_setzero_si256(); \
+ __m256i vsum = _mm256_setzero_si256(); \
+ for (int i = 0; i < (bh / uh); i++) { \
+ __m256i vsum16; \
+ variance##bw##_avx2(src, src_stride, ref, ref_stride, uh, &vsse, \
+ &vsum16); \
+ vsum = _mm256_add_epi32(vsum, sum_to_32bit_avx2(vsum16)); \
+ src += uh * src_stride; \
+ ref += uh * ref_stride; \
+ } \
+ const __m128i vsum_128 = mm256_add_hi_lo_epi32(vsum); \
+ const int sum = variance_final_from_32bit_sum_avx2(vsse, vsum_128, sse); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> bits); \
+ }
+
+AOM_VAR_LOOP_AVX2(64, 64, 12, 32) // 64x32 * ( 64/32)
+AOM_VAR_LOOP_AVX2(64, 128, 13, 32) // 64x32 * (128/32)
+AOM_VAR_LOOP_AVX2(128, 64, 13, 16) // 128x16 * ( 64/16)
+AOM_VAR_LOOP_AVX2(128, 128, 14, 16) // 128x16 * (128/16)
+
+unsigned int aom_mse16x16_avx2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ aom_variance16x16_avx2(src, src_stride, ref, ref_stride, sse);
+ return *sse;
+}
+
+static INLINE __m256i mm256_loadu2(const uint8_t *p0, const uint8_t *p1) {
+ const __m256i d =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1));
+ return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1);
+}
+
+static INLINE __m256i mm256_loadu2_16(const uint16_t *p0, const uint16_t *p1) {
+ const __m256i d =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1));
+ return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1);
+}
+
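+// Blend 32 pixels: out = (s0 * a + s1 * (64 - a) + 32) >> 6 per pixel.
+// _mm256_mulhrs_epi16 with (1 << (15 - AOM_BLEND_A64_ROUND_BITS)) performs the
+// rounding shift, since mulhrs computes (x * y + (1 << 14)) >> 15.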
+static INLINE void comp_mask_pred_line_avx2(const __m256i s0, const __m256i s1,
+ const __m256i a,
+ uint8_t *comp_pred) {
+ const __m256i alpha_max = _mm256_set1_epi8(AOM_BLEND_A64_MAX_ALPHA);
+ const int16_t round_bits = 15 - AOM_BLEND_A64_ROUND_BITS;
+ const __m256i round_offset = _mm256_set1_epi16(1 << (round_bits));
+
+ const __m256i ma = _mm256_sub_epi8(alpha_max, a);
+
+ const __m256i ssAL = _mm256_unpacklo_epi8(s0, s1);
+ const __m256i aaAL = _mm256_unpacklo_epi8(a, ma);
+ const __m256i ssAH = _mm256_unpackhi_epi8(s0, s1);
+ const __m256i aaAH = _mm256_unpackhi_epi8(a, ma);
+
+ const __m256i blendAL = _mm256_maddubs_epi16(ssAL, aaAL);
+ const __m256i blendAH = _mm256_maddubs_epi16(ssAH, aaAH);
+ const __m256i roundAL = _mm256_mulhrs_epi16(blendAL, round_offset);
+ const __m256i roundAH = _mm256_mulhrs_epi16(blendAH, round_offset);
+
+ const __m256i roundA = _mm256_packus_epi16(roundAL, roundAH);
+ _mm256_storeu_si256((__m256i *)(comp_pred), roundA);
+}
+
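+// comp_pred[i] = (pred[i] + ref[i] + 1) >> 1; width-specialised AVX2 paths cover
+// widths of 8, 16, 32 and multiples of 64, anything else falls back to C.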
+void aom_comp_avg_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride) {
+ int row = 0;
+ if (width == 8) {
+ do {
+ const __m256i pred_0123 = _mm256_loadu_si256((const __m256i *)(pred));
+ const __m128i ref_0 = _mm_loadl_epi64((const __m128i *)(ref));
+ const __m128i ref_1 =
+ _mm_loadl_epi64((const __m128i *)(ref + ref_stride));
+ const __m128i ref_2 =
+ _mm_loadl_epi64((const __m128i *)(ref + 2 * ref_stride));
+ const __m128i ref_3 =
+ _mm_loadl_epi64((const __m128i *)(ref + 3 * ref_stride));
+ const __m128i ref_01 = _mm_unpacklo_epi64(ref_0, ref_1);
+ const __m128i ref_23 = _mm_unpacklo_epi64(ref_2, ref_3);
+
+ const __m256i ref_0123 =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(ref_01), ref_23, 1);
+ const __m256i average = _mm256_avg_epu8(pred_0123, ref_0123);
+ _mm256_storeu_si256((__m256i *)(comp_pred), average);
+
+ row += 4;
+ pred += 32;
+ comp_pred += 32;
+ ref += 4 * ref_stride;
+ } while (row < height);
+ } else if (width == 16) {
+ do {
+ const __m256i pred_0 = _mm256_loadu_si256((const __m256i *)(pred));
+ const __m256i pred_1 = _mm256_loadu_si256((const __m256i *)(pred + 32));
+ const __m256i tmp0 =
+ _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(ref)));
+ const __m256i ref_0 = _mm256_inserti128_si256(
+ tmp0, _mm_loadu_si128((const __m128i *)(ref + ref_stride)), 1);
+ const __m256i tmp1 = _mm256_castsi128_si256(
+ _mm_loadu_si128((const __m128i *)(ref + 2 * ref_stride)));
+ const __m256i ref_1 = _mm256_inserti128_si256(
+ tmp1, _mm_loadu_si128((const __m128i *)(ref + 3 * ref_stride)), 1);
+ const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0);
+ const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1);
+ _mm256_storeu_si256((__m256i *)(comp_pred), average_0);
+ _mm256_storeu_si256((__m256i *)(comp_pred + 32), average_1);
+
+ row += 4;
+ pred += 64;
+ comp_pred += 64;
+ ref += 4 * ref_stride;
+ } while (row < height);
+ } else if (width == 32) {
+ do {
+ const __m256i pred_0 = _mm256_loadu_si256((const __m256i *)(pred));
+ const __m256i pred_1 = _mm256_loadu_si256((const __m256i *)(pred + 32));
+ const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)(ref));
+ const __m256i ref_1 =
+ _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
+ const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0);
+ const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1);
+ _mm256_storeu_si256((__m256i *)(comp_pred), average_0);
+ _mm256_storeu_si256((__m256i *)(comp_pred + 32), average_1);
+
+ row += 2;
+ pred += 64;
+ comp_pred += 64;
+ ref += 2 * ref_stride;
+ } while (row < height);
+ } else if (width % 64 == 0) {
+ do {
+ for (int x = 0; x < width; x += 64) {
+ const __m256i pred_0 = _mm256_loadu_si256((const __m256i *)(pred + x));
+ const __m256i pred_1 =
+ _mm256_loadu_si256((const __m256i *)(pred + x + 32));
+ const __m256i ref_0 = _mm256_loadu_si256((const __m256i *)(ref + x));
+ const __m256i ref_1 =
+ _mm256_loadu_si256((const __m256i *)(ref + x + 32));
+ const __m256i average_0 = _mm256_avg_epu8(pred_0, ref_0);
+ const __m256i average_1 = _mm256_avg_epu8(pred_1, ref_1);
+ _mm256_storeu_si256((__m256i *)(comp_pred + x), average_0);
+ _mm256_storeu_si256((__m256i *)(comp_pred + x + 32), average_1);
+ }
+ row++;
+ pred += width;
+ comp_pred += width;
+ ref += ref_stride;
+ } while (row < height);
+ } else {
+ aom_comp_avg_pred_c(comp_pred, pred, width, height, ref, ref_stride);
+ }
+}
+
+void aom_comp_mask_pred_avx2(uint8_t *comp_pred, const uint8_t *pred, int width,
+ int height, const uint8_t *ref, int ref_stride,
+ const uint8_t *mask, int mask_stride,
+ int invert_mask) {
+ int i = 0;
+ const uint8_t *src0 = invert_mask ? pred : ref;
+ const uint8_t *src1 = invert_mask ? ref : pred;
+ const int stride0 = invert_mask ? width : ref_stride;
+ const int stride1 = invert_mask ? ref_stride : width;
+ if (width == 8) {
+ comp_mask_pred_8_ssse3(comp_pred, height, src0, stride0, src1, stride1,
+ mask, mask_stride);
+ } else if (width == 16) {
+ do {
+ const __m256i sA0 = mm256_loadu2(src0 + stride0, src0);
+ const __m256i sA1 = mm256_loadu2(src1 + stride1, src1);
+ const __m256i aA = mm256_loadu2(mask + mask_stride, mask);
+ src0 += (stride0 << 1);
+ src1 += (stride1 << 1);
+ mask += (mask_stride << 1);
+ const __m256i sB0 = mm256_loadu2(src0 + stride0, src0);
+ const __m256i sB1 = mm256_loadu2(src1 + stride1, src1);
+ const __m256i aB = mm256_loadu2(mask + mask_stride, mask);
+ src0 += (stride0 << 1);
+ src1 += (stride1 << 1);
+ mask += (mask_stride << 1);
+ // comp_pred's stride == width == 16
+ comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred);
+ comp_mask_pred_line_avx2(sB0, sB1, aB, comp_pred + 32);
+ comp_pred += (16 << 2);
+ i += 4;
+ } while (i < height);
+ } else {
+ do {
+ for (int x = 0; x < width; x += 32) {
+ const __m256i sA0 = _mm256_lddqu_si256((const __m256i *)(src0 + x));
+ const __m256i sA1 = _mm256_lddqu_si256((const __m256i *)(src1 + x));
+ const __m256i aA = _mm256_lddqu_si256((const __m256i *)(mask + x));
+
+ comp_mask_pred_line_avx2(sA0, sA1, aA, comp_pred);
+ comp_pred += 32;
+ }
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ i++;
+ } while (i < height);
+ }
+}
+
+static INLINE __m256i highbd_comp_mask_pred_line_avx2(const __m256i s0,
+ const __m256i s1,
+ const __m256i a) {
+ const __m256i alpha_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m256i round_const =
+ _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m256i a_inv = _mm256_sub_epi16(alpha_max, a);
+
+ const __m256i s_lo = _mm256_unpacklo_epi16(s0, s1);
+ const __m256i a_lo = _mm256_unpacklo_epi16(a, a_inv);
+ const __m256i pred_lo = _mm256_madd_epi16(s_lo, a_lo);
+ const __m256i pred_l = _mm256_srai_epi32(
+ _mm256_add_epi32(pred_lo, round_const), AOM_BLEND_A64_ROUND_BITS);
+
+ const __m256i s_hi = _mm256_unpackhi_epi16(s0, s1);
+ const __m256i a_hi = _mm256_unpackhi_epi16(a, a_inv);
+ const __m256i pred_hi = _mm256_madd_epi16(s_hi, a_hi);
+ const __m256i pred_h = _mm256_srai_epi32(
+ _mm256_add_epi32(pred_hi, round_const), AOM_BLEND_A64_ROUND_BITS);
+
+ const __m256i comp = _mm256_packs_epi32(pred_l, pred_h);
+
+ return comp;
+}
+
+void aom_highbd_comp_mask_pred_avx2(uint8_t *comp_pred8, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride, const uint8_t *mask,
+ int mask_stride, int invert_mask) {
+ int i = 0;
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+ const uint16_t *src0 = invert_mask ? pred : ref;
+ const uint16_t *src1 = invert_mask ? ref : pred;
+ const int stride0 = invert_mask ? width : ref_stride;
+ const int stride1 = invert_mask ? ref_stride : width;
+ const __m256i zero = _mm256_setzero_si256();
+
+ if (width == 8) {
+ do {
+ const __m256i s0 = mm256_loadu2_16(src0 + stride0, src0);
+ const __m256i s1 = mm256_loadu2_16(src1 + stride1, src1);
+
+ const __m128i m_l = _mm_loadl_epi64((const __m128i *)mask);
+ const __m128i m_h = _mm_loadl_epi64((const __m128i *)(mask + 8));
+
+ __m256i m = _mm256_castsi128_si256(m_l);
+ m = _mm256_insertf128_si256(m, m_h, 1);
+ const __m256i m_16 = _mm256_unpacklo_epi8(m, zero);
+
+ const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16);
+
+ _mm_storeu_si128((__m128i *)(comp_pred), _mm256_castsi256_si128(comp));
+
+ _mm_storeu_si128((__m128i *)(comp_pred + width),
+ _mm256_extractf128_si256(comp, 1));
+
+ src0 += (stride0 << 1);
+ src1 += (stride1 << 1);
+ mask += (mask_stride << 1);
+ comp_pred += (width << 1);
+ i += 2;
+ } while (i < height);
+ } else if (width == 16) {
+ do {
+ const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0));
+ const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1));
+ const __m256i m_16 =
+ _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask));
+
+ const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16);
+
+ _mm256_storeu_si256((__m256i *)comp_pred, comp);
+
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ comp_pred += width;
+ i += 1;
+ } while (i < height);
+ } else {
+ do {
+ for (int x = 0; x < width; x += 32) {
+ const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0 + x));
+ const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src0 + x + 16));
+ const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1 + x));
+ const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src1 + x + 16));
+
+ const __m256i m01_16 =
+ _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(mask + x)));
+ const __m256i m23_16 = _mm256_cvtepu8_epi16(
+ _mm_loadu_si128((const __m128i *)(mask + x + 16)));
+
+ const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m01_16);
+ const __m256i comp1 = highbd_comp_mask_pred_line_avx2(s2, s3, m23_16);
+
+ _mm256_storeu_si256((__m256i *)comp_pred, comp);
+ _mm256_storeu_si256((__m256i *)(comp_pred + 16), comp1);
+
+ comp_pred += 32;
+ }
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ i += 1;
+ } while (i < height);
+ }
+}
+
+uint64_t aom_mse_4xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
+ int sstride, int h) {
+ uint64_t sum = 0;
+ __m128i dst0_4x8, dst1_4x8, dst2_4x8, dst3_4x8, dst_16x8;
+ __m128i src0_4x16, src1_4x16, src2_4x16, src3_4x16;
+ __m256i src0_8x16, src1_8x16, dst_16x16, src_16x16;
+ __m256i res0_4x64, res1_4x64;
+ __m256i sub_result;
+ const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+ __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+ for (int i = 0; i < h; i += 4) {
+ dst0_4x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 0) * dstride]));
+ dst1_4x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 1) * dstride]));
+ dst2_4x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 2) * dstride]));
+ dst3_4x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 3) * dstride]));
+ dst_16x8 = _mm_unpacklo_epi64(_mm_unpacklo_epi32(dst0_4x8, dst1_4x8),
+ _mm_unpacklo_epi32(dst2_4x8, dst3_4x8));
+ dst_16x16 = _mm256_cvtepu8_epi16(dst_16x8);
+
+ src0_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride]));
+ src1_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride]));
+ src2_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 2) * sstride]));
+ src3_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 3) * sstride]));
+ src0_8x16 =
+ _mm256_castsi128_si256(_mm_unpacklo_epi64(src0_4x16, src1_4x16));
+ src1_8x16 =
+ _mm256_castsi128_si256(_mm_unpacklo_epi64(src2_4x16, src3_4x16));
+ src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20);
+
+ // r15 r14 r13------------r1 r0 - 16 bit
+ sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16));
+
+ // s7 s6 s5 s4 s3 s2 s1 s0 - 32bit
+ src_16x16 = _mm256_madd_epi16(sub_result, sub_result);
+
+ // accumulation of result
+ square_result = _mm256_add_epi32(square_result, src_16x16);
+ }
+
+ // s5 s4 s1 s0 - 64bit
+ res0_4x64 = _mm256_unpacklo_epi32(square_result, zeros);
+ // s7 s6 s3 s2 - 64bit
+ res1_4x64 = _mm256_unpackhi_epi32(square_result, zeros);
+ // r3 r2 r1 r0 - 64bit
+ res0_4x64 = _mm256_add_epi64(res0_4x64, res1_4x64);
+ // r1+r3 r2+r0 - 64bit
+ const __m128i sum_1x64 =
+ _mm_add_epi64(_mm256_castsi256_si128(res0_4x64),
+ _mm256_extracti128_si256(res0_4x64, 1));
+ xx_storel_64(&sum, _mm_add_epi64(sum_1x64, _mm_srli_si128(sum_1x64, 8)));
+ return sum;
+}
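+
+// What the MSE kernels in this file compute, shown as a scalar sketch for the
+// 4-wide case (illustrative only): the sum of squared differences between the
+// 16-bit src block and the 8-bit dst block.
+static inline uint64_t mse_4xh_scalar_sketch(const uint8_t *dst, int dstride,
+                                             const uint16_t *src, int sstride,
+                                             int h) {
+  uint64_t sum = 0;
+  for (int i = 0; i < h; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      const int d = (int)src[i * sstride + j] - (int)dst[i * dstride + j];
+      sum += (uint64_t)(d * d);
+    }
+  }
+  return sum;
+}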
+
+// Compute mse of four consecutive 4x4 blocks.
+// In src buffer, each 4x4 block in a 32x32 filter block is stored sequentially.
+// Hence src_blk_stride equals the block area (w * h). The dst buffer is a
+// frame buffer, so dstride is a frame-level stride.
+uint64_t aom_mse_4xh_quad_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
+ int src_blk_stride, int h) {
+ uint64_t sum = 0;
+ __m128i dst0_16x8, dst1_16x8, dst2_16x8, dst3_16x8;
+ __m256i dst0_16x16, dst1_16x16, dst2_16x16, dst3_16x16;
+ __m256i res0_4x64, res1_4x64;
+ __m256i sub_result_0, sub_result_1, sub_result_2, sub_result_3;
+ const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+ __m256i square_result = zeros;
+ uint16_t *src_temp = src;
+
+ for (int i = 0; i < h; i += 4) {
+ dst0_16x8 = _mm_loadu_si128((__m128i *)(&dst[(i + 0) * dstride]));
+ dst1_16x8 = _mm_loadu_si128((__m128i *)(&dst[(i + 1) * dstride]));
+ dst2_16x8 = _mm_loadu_si128((__m128i *)(&dst[(i + 2) * dstride]));
+ dst3_16x8 = _mm_loadu_si128((__m128i *)(&dst[(i + 3) * dstride]));
+
+ // row0 of 1st,2nd, 3rd and 4th 4x4 blocks- d00 d10 d20 d30
+ dst0_16x16 = _mm256_cvtepu8_epi16(dst0_16x8);
+ // row1 of 1st,2nd, 3rd and 4th 4x4 blocks - d01 d11 d21 d31
+ dst1_16x16 = _mm256_cvtepu8_epi16(dst1_16x8);
+ // row2 of 1st,2nd, 3rd and 4th 4x4 blocks - d02 d12 d22 d32
+ dst2_16x16 = _mm256_cvtepu8_epi16(dst2_16x8);
+ // row3 of 1st,2nd, 3rd and 4th 4x4 blocks - d03 d13 d23 d33
+ dst3_16x16 = _mm256_cvtepu8_epi16(dst3_16x8);
+
+ // All rows of 1st 4x4 block - r00 r01 r02 r03
+ __m256i src0_16x16 = _mm256_loadu_si256((__m256i const *)(&src_temp[0]));
+ // All rows of 2nd 4x4 block - r10 r11 r12 r13
+ __m256i src1_16x16 =
+ _mm256_loadu_si256((__m256i const *)(&src_temp[src_blk_stride]));
+ // All rows of 3rd 4x4 block - r20 r21 r22 r23
+ __m256i src2_16x16 =
+ _mm256_loadu_si256((__m256i const *)(&src_temp[2 * src_blk_stride]));
+ // All rows of 4th 4x4 block - r30 r31 r32 r33
+ __m256i src3_16x16 =
+ _mm256_loadu_si256((__m256i const *)(&src_temp[3 * src_blk_stride]));
+
+ // r00 r10 r02 r12
+ __m256i tmp0_16x16 = _mm256_unpacklo_epi64(src0_16x16, src1_16x16);
+ // r01 r11 r03 r13
+ __m256i tmp1_16x16 = _mm256_unpackhi_epi64(src0_16x16, src1_16x16);
+ // r20 r30 r22 r32
+ __m256i tmp2_16x16 = _mm256_unpacklo_epi64(src2_16x16, src3_16x16);
+ // r21 r31 r23 r33
+ __m256i tmp3_16x16 = _mm256_unpackhi_epi64(src2_16x16, src3_16x16);
+
+ // r00 r10 r20 r30
+ src0_16x16 = _mm256_permute2f128_si256(tmp0_16x16, tmp2_16x16, 0x20);
+ // r01 r11 r21 r31
+ src1_16x16 = _mm256_permute2f128_si256(tmp1_16x16, tmp3_16x16, 0x20);
+ // r02 r12 r22 r32
+ src2_16x16 = _mm256_permute2f128_si256(tmp0_16x16, tmp2_16x16, 0x31);
+ // r03 r13 r23 r33
+ src3_16x16 = _mm256_permute2f128_si256(tmp1_16x16, tmp3_16x16, 0x31);
+
+ // r15 r14 r13------------r1 r0 - 16 bit
+ sub_result_0 = _mm256_abs_epi16(_mm256_sub_epi16(src0_16x16, dst0_16x16));
+ sub_result_1 = _mm256_abs_epi16(_mm256_sub_epi16(src1_16x16, dst1_16x16));
+ sub_result_2 = _mm256_abs_epi16(_mm256_sub_epi16(src2_16x16, dst2_16x16));
+ sub_result_3 = _mm256_abs_epi16(_mm256_sub_epi16(src3_16x16, dst3_16x16));
+
+ // s7 s6 s5 s4 s3 s2 s1 s0 - 32bit
+ src0_16x16 = _mm256_madd_epi16(sub_result_0, sub_result_0);
+ src1_16x16 = _mm256_madd_epi16(sub_result_1, sub_result_1);
+ src2_16x16 = _mm256_madd_epi16(sub_result_2, sub_result_2);
+ src3_16x16 = _mm256_madd_epi16(sub_result_3, sub_result_3);
+
+ // accumulation of result
+ src0_16x16 = _mm256_add_epi32(src0_16x16, src1_16x16);
+ src2_16x16 = _mm256_add_epi32(src2_16x16, src3_16x16);
+ const __m256i square_result_0 = _mm256_add_epi32(src0_16x16, src2_16x16);
+ square_result = _mm256_add_epi32(square_result, square_result_0);
+ src_temp += 16;
+ }
+
+ // s5 s4 s1 s0 - 64bit
+ res0_4x64 = _mm256_unpacklo_epi32(square_result, zeros);
+ // s7 s6 s3 s2 - 64bit
+ res1_4x64 = _mm256_unpackhi_epi32(square_result, zeros);
+ // r3 r2 r1 r0 - 64bit
+ res0_4x64 = _mm256_add_epi64(res0_4x64, res1_4x64);
+ // r1+r3 r2+r0 - 64bit
+ const __m128i sum_1x64 =
+ _mm_add_epi64(_mm256_castsi256_si128(res0_4x64),
+ _mm256_extracti128_si256(res0_4x64, 1));
+ xx_storel_64(&sum, _mm_add_epi64(sum_1x64, _mm_srli_si128(sum_1x64, 8)));
+ return sum;
+}
+
+uint64_t aom_mse_8xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
+ int sstride, int h) {
+ uint64_t sum = 0;
+ __m128i dst0_8x8, dst1_8x8, dst3_16x8;
+ __m256i src0_8x16, src1_8x16, src_16x16, dst_16x16;
+ __m256i res0_4x64, res1_4x64;
+ __m256i sub_result;
+ const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+ __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+
+ for (int i = 0; i < h; i += 2) {
+ dst0_8x8 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 0) * dstride]));
+ dst1_8x8 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 1) * dstride]));
+ dst3_16x8 = _mm_unpacklo_epi64(dst0_8x8, dst1_8x8);
+ dst_16x16 = _mm256_cvtepu8_epi16(dst3_16x8);
+
+ src0_8x16 =
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&src[i * sstride]));
+ src1_8x16 = _mm256_castsi128_si256(
+ _mm_loadu_si128((__m128i *)&src[(i + 1) * sstride]));
+ src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20);
+
+ // r15 r14 r13 - - - r1 r0 - 16 bit
+ sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16));
+
+ // s7 s6 s5 s4 s3 s2 s1 s0 - 32bit
+ src_16x16 = _mm256_madd_epi16(sub_result, sub_result);
+
+ // accumulation of result
+ square_result = _mm256_add_epi32(square_result, src_16x16);
+ }
+
+ // s5 s4 s1 s0 - 64bit
+ res0_4x64 = _mm256_unpacklo_epi32(square_result, zeros);
+ // s7 s6 s3 s2 - 64bit
+ res1_4x64 = _mm256_unpackhi_epi32(square_result, zeros);
+ // r3 r2 r1 r0 - 64bit
+ res0_4x64 = _mm256_add_epi64(res0_4x64, res1_4x64);
+ // r1+r3 r2+r0 - 64bit
+ const __m128i sum_1x64 =
+ _mm_add_epi64(_mm256_castsi256_si128(res0_4x64),
+ _mm256_extracti128_si256(res0_4x64, 1));
+ xx_storel_64(&sum, _mm_add_epi64(sum_1x64, _mm_srli_si128(sum_1x64, 8)));
+ return sum;
+}
+
+// Compute mse of two consecutive 8x8 blocks.
+// In src buffer, each 8x8 block in a 64x64 filter block is stored sequentially.
+// Hence src_blk_stride equals the block area (w * h). The dst buffer is a
+// frame buffer, so dstride is a frame-level stride.
+uint64_t aom_mse_8xh_dual_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
+ int src_blk_stride, int h) {
+ uint64_t sum = 0;
+ __m128i dst0_16x8, dst1_16x8;
+ __m256i dst0_16x16, dst1_16x16;
+ __m256i res0_4x64, res1_4x64;
+ __m256i sub_result_0, sub_result_1;
+ const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
+ __m256i square_result = zeros;
+ uint16_t *src_temp = src;
+
+ for (int i = 0; i < h; i += 2) {
+ dst0_16x8 = _mm_loadu_si128((__m128i *)(&dst[(i + 0) * dstride]));
+ dst1_16x8 = _mm_loadu_si128((__m128i *)(&dst[(i + 1) * dstride]));
+
+ // row0 of 1st and 2nd 8x8 block - d00 d10
+ dst0_16x16 = _mm256_cvtepu8_epi16(dst0_16x8);
+ // row1 of 1st and 2nd 8x8 block - d01 d11
+ dst1_16x16 = _mm256_cvtepu8_epi16(dst1_16x8);
+
+ // 2 rows of 1st 8x8 block - r00 r01
+ __m256i src0_16x16 = _mm256_loadu_si256((__m256i const *)(&src_temp[0]));
+ // 2 rows of 2nd 8x8 block - r10 r11
+ __m256i src1_16x16 =
+ _mm256_loadu_si256((__m256i const *)(&src_temp[src_blk_stride]));
+ // r00 r10 - 128bit
+ __m256i tmp0_16x16 =
+ _mm256_permute2f128_si256(src0_16x16, src1_16x16, 0x20);
+ // r01 r11 - 128bit
+ __m256i tmp1_16x16 =
+ _mm256_permute2f128_si256(src0_16x16, src1_16x16, 0x31);
+
+ // r15 r14 r13------------r1 r0 - 16 bit
+ sub_result_0 = _mm256_abs_epi16(_mm256_sub_epi16(tmp0_16x16, dst0_16x16));
+ sub_result_1 = _mm256_abs_epi16(_mm256_sub_epi16(tmp1_16x16, dst1_16x16));
+
+ // s7 s6 s5 s4 s3 s2 s1 s0 - 32bit each
+ src0_16x16 = _mm256_madd_epi16(sub_result_0, sub_result_0);
+ src1_16x16 = _mm256_madd_epi16(sub_result_1, sub_result_1);
+
+ // accumulation of result
+ src0_16x16 = _mm256_add_epi32(src0_16x16, src1_16x16);
+ square_result = _mm256_add_epi32(square_result, src0_16x16);
+ src_temp += 16;
+ }
+
+ // s5 s4 s1 s0 - 64bit
+ res0_4x64 = _mm256_unpacklo_epi32(square_result, zeros);
+ // s7 s6 s3 s2 - 64bit
+ res1_4x64 = _mm256_unpackhi_epi32(square_result, zeros);
+ // r3 r2 r1 r0 - 64bit
+ res0_4x64 = _mm256_add_epi64(res0_4x64, res1_4x64);
+ // r1+r3 r2+r0 - 64bit
+ const __m128i sum_1x64 =
+ _mm_add_epi64(_mm256_castsi256_si128(res0_4x64),
+ _mm256_extracti128_si256(res0_4x64, 1));
+ xx_storel_64(&sum, _mm_add_epi64(sum_1x64, _mm_srli_si128(sum_1x64, 8)));
+ return sum;
+}
+
+uint64_t aom_mse_wxh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
+ int sstride, int w, int h) {
+ assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
+ "w=8/4 and h=8/4 must be satisfied");
+ switch (w) {
+ case 4: return aom_mse_4xh_16bit_avx2(dst, dstride, src, sstride, h);
+ case 8: return aom_mse_8xh_16bit_avx2(dst, dstride, src, sstride, h);
+ default: assert(0 && "unsupported width"); return -1;
+ }
+}
+
+// Computes mse of two 8x8 or four 4x4 consecutive blocks. Luma plane uses 8x8
+// block and Chroma uses 4x4 block. In src buffer, each block in a filter block
+// is stored sequentially. Hence src_blk_stride equals the block area (w * h).
+// The dst buffer is a frame buffer, so dstride is a frame-level stride.
+uint64_t aom_mse_16xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src,
+ int w, int h) {
+ assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
+ "w=8/4 and h=8/4 must be satisfied");
+ switch (w) {
+ case 4: return aom_mse_4xh_quad_16bit_avx2(dst, dstride, src, w * h, h);
+ case 8: return aom_mse_8xh_dual_16bit_avx2(dst, dstride, src, w * h, h);
+ default: assert(0 && "unsupported width"); return -1;
+ }
+}
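+
+// Example of the packed src layout assumed above (illustrative): with
+// w = h = 4, the four 4x4 blocks that share one 16-wide strip of dst are
+// stored back to back, so block k starts at src + k * 16
+// (src_blk_stride == w * h), while dst advances by the frame stride dstride
+// from row to row.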
+
+static INLINE void calc_sum_sse_wd32_avx2(const uint8_t *src,
+ const uint8_t *ref,
+ __m256i set_one_minusone,
+ __m256i sse_8x16[2],
+ __m256i sum_8x16[2]) {
+ const __m256i s00_256 = _mm256_loadu_si256((__m256i const *)(src));
+ const __m256i r00_256 = _mm256_loadu_si256((__m256i const *)(ref));
+
+ const __m256i u_low_256 = _mm256_unpacklo_epi8(s00_256, r00_256);
+ const __m256i u_high_256 = _mm256_unpackhi_epi8(s00_256, r00_256);
+
+ const __m256i diff0 = _mm256_maddubs_epi16(u_low_256, set_one_minusone);
+ const __m256i diff1 = _mm256_maddubs_epi16(u_high_256, set_one_minusone);
+
+ sse_8x16[0] = _mm256_add_epi32(sse_8x16[0], _mm256_madd_epi16(diff0, diff0));
+ sse_8x16[1] = _mm256_add_epi32(sse_8x16[1], _mm256_madd_epi16(diff1, diff1));
+ sum_8x16[0] = _mm256_add_epi16(sum_8x16[0], diff0);
+ sum_8x16[1] = _mm256_add_epi16(sum_8x16[1], diff1);
+}
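+
+// Note on the 0xff01 constant used above: every 16-bit lane holds the signed
+// byte pair (+1, -1), so after interleaving src and ref bytes with
+// unpacklo/unpackhi, _mm256_maddubs_epi16 evaluates src * 1 + ref * (-1),
+// i.e. the 16-bit difference src - ref, in a single instruction.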
+
+static INLINE __m256i calc_sum_sse_order(__m256i *sse_hx16, __m256i *sum_hx16,
+ unsigned int *tot_sse, int *tot_sum) {
+ // s00 s01 s10 s11 s20 s21 s30 s31
+ const __m256i sse_results = _mm256_hadd_epi32(sse_hx16[0], sse_hx16[1]);
+ // d00 d01 d02 d03 | d10 d11 d12 d13 | d20 d21 d22 d23 | d30 d31 d32 d33
+ const __m256i sum_result_r0 = _mm256_hadd_epi16(sum_hx16[0], sum_hx16[1]);
+ // d00 d01 d10 d11 | d00 d02 d10 d11 | d20 d21 d30 d31 | d20 d21 d30 d31
+ const __m256i sum_result_1 = _mm256_hadd_epi16(sum_result_r0, sum_result_r0);
+ // d00 d01 d10 d11 d20 d21 d30 d31 | X
+ const __m256i sum_result_3 = _mm256_permute4x64_epi64(sum_result_1, 0x08);
+ // d00 d01 d10 d11 d20 d21 d30 d31
+ const __m256i sum_results =
+ _mm256_cvtepi16_epi32(_mm256_castsi256_si128(sum_result_3));
+
+ // Add sum & sse registers appropriately to get total sum & sse separately.
+ // s0 s1 d0 d1 s2 s3 d2 d3
+ const __m256i sum_sse_add = _mm256_hadd_epi32(sse_results, sum_results);
+ // s0 s1 s2 s3 d0 d1 d2 d3
+ const __m256i sum_sse_order_add = _mm256_permute4x64_epi64(sum_sse_add, 0xd8);
+ // s0+s1 s2+s3 s0+s1 s2+s3 d0+d1 d2+d3 d0+d1 d2+d3
+ const __m256i sum_sse_order_add_1 =
+ _mm256_hadd_epi32(sum_sse_order_add, sum_sse_order_add);
+ // s0 x x x | d0 x x x
+ const __m256i sum_sse_order_add_final =
+ _mm256_hadd_epi32(sum_sse_order_add_1, sum_sse_order_add_1);
+ // s0
+ const uint32_t first_value =
+ (uint32_t)_mm256_extract_epi32(sum_sse_order_add_final, 0);
+ *tot_sse += first_value;
+ // d0
+ const int second_value = _mm256_extract_epi32(sum_sse_order_add_final, 4);
+ *tot_sum += second_value;
+ return sum_sse_order_add;
+}
+
+static INLINE void get_var_sse_sum_8x8_quad_avx2(
+ const uint8_t *src, int src_stride, const uint8_t *ref,
+ const int ref_stride, const int h, uint32_t *sse8x8, int *sum8x8,
+ unsigned int *tot_sse, int *tot_sum, uint32_t *var8x8) {
+ assert(h <= 128); // May overflow for larger height.
+ __m256i sse_8x16[2], sum_8x16[2];
+ sum_8x16[0] = _mm256_setzero_si256();
+ sse_8x16[0] = _mm256_setzero_si256();
+ sum_8x16[1] = sum_8x16[0];
+ sse_8x16[1] = sse_8x16[0];
+ const __m256i set_one_minusone = _mm256_set1_epi16((short)0xff01);
+
+ for (int i = 0; i < h; i++) {
+    // Process one 32-pixel row covering the four adjacent 8x8 blocks.
+ calc_sum_sse_wd32_avx2(src, ref, set_one_minusone, sse_8x16, sum_8x16);
+ src += src_stride;
+ ref += ref_stride;
+ }
+
+ const __m256i sum_sse_order_add =
+ calc_sum_sse_order(sse_8x16, sum_8x16, tot_sse, tot_sum);
+
+ // s0 s1 s2 s3
+ _mm_storeu_si128((__m128i *)sse8x8,
+ _mm256_castsi256_si128(sum_sse_order_add));
+ // d0 d1 d2 d3
+ const __m128i sum_temp8x8 = _mm256_extractf128_si256(sum_sse_order_add, 1);
+ _mm_storeu_si128((__m128i *)sum8x8, sum_temp8x8);
+
+ // (d0xd0 >> 6)=f0 (d1xd1 >> 6)=f1 (d2xd2 >> 6)=f2 (d3xd3 >> 6)=f3
+ const __m128i mull_results =
+ _mm_srli_epi32(_mm_mullo_epi32(sum_temp8x8, sum_temp8x8), 6);
+ // s0-f0=v0 s1-f1=v1 s2-f2=v2 s3-f3=v3
+ const __m128i variance_8x8 =
+ _mm_sub_epi32(_mm256_castsi256_si128(sum_sse_order_add), mull_results);
+ // v0 v1 v2 v3
+ _mm_storeu_si128((__m128i *)var8x8, variance_8x8);
+}
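+
+// The per-block variance computed above follows the usual identity
+//   var = sse - (sum * sum) / N,  with N = 8 * 8 = 64 (hence the >> 6),
+// evaluated for four adjacent 8x8 blocks at once.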
+
+static INLINE void get_var_sse_sum_16x16_dual_avx2(
+ const uint8_t *src, int src_stride, const uint8_t *ref,
+ const int ref_stride, const int h, uint32_t *sse16x16,
+ unsigned int *tot_sse, int *tot_sum, uint32_t *var16x16) {
+ assert(h <= 128); // May overflow for larger height.
+ __m256i sse_16x16[2], sum_16x16[2];
+ sum_16x16[0] = _mm256_setzero_si256();
+ sse_16x16[0] = _mm256_setzero_si256();
+ sum_16x16[1] = sum_16x16[0];
+ sse_16x16[1] = sse_16x16[0];
+ const __m256i set_one_minusone = _mm256_set1_epi16((short)0xff01);
+
+ for (int i = 0; i < h; i++) {
+    // Process one 32-pixel row covering the two adjacent 16x16 blocks.
+ calc_sum_sse_wd32_avx2(src, ref, set_one_minusone, sse_16x16, sum_16x16);
+ src += src_stride;
+ ref += ref_stride;
+ }
+
+ const __m256i sum_sse_order_add =
+ calc_sum_sse_order(sse_16x16, sum_16x16, tot_sse, tot_sum);
+
+ const __m256i sum_sse_order_add_1 =
+ _mm256_hadd_epi32(sum_sse_order_add, sum_sse_order_add);
+
+ // s0+s1 s2+s3 x x
+ _mm_storel_epi64((__m128i *)sse16x16,
+ _mm256_castsi256_si128(sum_sse_order_add_1));
+
+ // d0+d1 d2+d3 x x
+ const __m128i sum_temp16x16 =
+ _mm256_extractf128_si256(sum_sse_order_add_1, 1);
+
+  // ((d0+d1)x(d0+d1) >> 8)=f0 ((d2+d3)x(d2+d3) >> 8)=f1
+ const __m128i mull_results =
+ _mm_srli_epi32(_mm_mullo_epi32(sum_temp16x16, sum_temp16x16), 8);
+
+  // (s0+s1)-f0=v0 (s2+s3)-f1=v1
+ const __m128i variance_16x16 =
+ _mm_sub_epi32(_mm256_castsi256_si128(sum_sse_order_add_1), mull_results);
+
+  // v0 v1
+ _mm_storel_epi64((__m128i *)var16x16, variance_16x16);
+}
+
+void aom_get_var_sse_sum_8x8_quad_avx2(const uint8_t *src_ptr,
+ int source_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ uint32_t *sse8x8, int *sum8x8,
+ unsigned int *tot_sse, int *tot_sum,
+ uint32_t *var8x8) {
+ get_var_sse_sum_8x8_quad_avx2(src_ptr, source_stride, ref_ptr, ref_stride, 8,
+ sse8x8, sum8x8, tot_sse, tot_sum, var8x8);
+}
+
+void aom_get_var_sse_sum_16x16_dual_avx2(const uint8_t *src_ptr,
+ int source_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ uint32_t *sse16x16,
+ unsigned int *tot_sse, int *tot_sum,
+ uint32_t *var16x16) {
+ get_var_sse_sum_16x16_dual_avx2(src_ptr, source_stride, ref_ptr, ref_stride,
+ 16, sse16x16, tot_sse, tot_sum, var16x16);
+}
diff --git a/third_party/aom/aom_dsp/x86/variance_impl_avx2.c b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c
new file mode 100644
index 0000000000..9e9e70ea01
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/variance_impl_avx2.c
@@ -0,0 +1,924 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <immintrin.h> // AVX2
+
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_ports/mem.h"
+
+/* clang-format off */
+DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
+ 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
+ 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
+ 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
+ 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
+ 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+ 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+ 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+ 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+ 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
+ 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
+ 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
+ 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
+ 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+ 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+};
+/* clang-format on */
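+
+// Each 32-byte row above repeats the tap pair (16 - 2*o, 2*o) for a subpel
+// offset o of 0..7 (in eighths of a pixel); an offset selects its row via
+// (offset << 5). Per pixel, the filtering done by FILTER_SRC below is
+// therefore (illustrative scalar form):
+//   out = (a * (16 - 2 * o) + b * (2 * o) + 8) >> 4;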
+
+#define FILTER_SRC(filter) \
+ /* filter the source */ \
+ exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
+ exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
+ \
+ /* add 8 to source */ \
+ exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
+ exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
+ \
+ /* divide source by 16 */ \
+ exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
+ exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+#define MERGE_WITH_SRC(src_reg, reg) \
+ exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
+ exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
+
+#define LOAD_SRC_DST \
+ /* load source and destination */ \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ dst_reg = _mm256_loadu_si256((__m256i const *)(dst));
+
+#define AVG_NEXT_SRC(src_reg, size_stride) \
+ src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
+ /* average between current and next stride source */ \
+ src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+
+#define MERGE_NEXT_SRC(src_reg, size_stride) \
+ src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
+ MERGE_WITH_SRC(src_reg, src_next_reg)
+
+#define CALC_SUM_SSE_INSIDE_LOOP \
+ /* expand each byte to 2 bytes */ \
+ exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
+ exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
+ /* source - dest */ \
+ exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
+ exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
+  /* calculate sum */                                                     \
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
+ exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
+ sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
+ exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
+ /* calculate sse */ \
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
+ sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+// Final reduction of the vector accumulators into the scalar sum and sse.
+#define CALC_SUM_AND_SSE \
+ res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
+ sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
+ sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
+ sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
+ sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+ sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
+ \
+ sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
+ sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
+ \
+ sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+ sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+ *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
+ _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
+ sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
+ sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+ sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
+ _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
+
+// Functions related to sub pixel variance width 16
+#define LOAD_SRC_DST_INSERT(src_stride, dst_stride) \
+ /* load source and destination of 2 rows and insert*/ \
+ src_reg = _mm256_inserti128_si256( \
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))), \
+ _mm_loadu_si128((__m128i *)(src + src_stride)), 1); \
+ dst_reg = _mm256_inserti128_si256( \
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))), \
+ _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1);
+
+#define AVG_NEXT_SRC_INSERT(src_reg, size_stride) \
+ src_next_reg = _mm256_inserti128_si256( \
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \
+ _mm_loadu_si128((__m128i *)(src + (size_stride << 1))), 1); \
+ /* average between current and next stride source */ \
+ src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+
+#define MERGE_NEXT_SRC_INSERT(src_reg, size_stride) \
+ src_next_reg = _mm256_inserti128_si256( \
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \
+ _mm_loadu_si128((__m128i *)(src + (src_stride + size_stride))), 1); \
+ MERGE_WITH_SRC(src_reg, src_next_reg)
+
+#define LOAD_SRC_NEXT_BYTE_INSERT \
+ /* load source and another source from next row */ \
+ src_reg = _mm256_inserti128_si256( \
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))), \
+ _mm_loadu_si128((__m128i *)(src + src_stride)), 1); \
+ /* load source and next row source from 1 byte onwards */ \
+ src_next_reg = _mm256_inserti128_si256( \
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + 1))), \
+ _mm_loadu_si128((__m128i *)(src + src_stride + 1)), 1);
+
+#define LOAD_DST_INSERT \
+ dst_reg = _mm256_inserti128_si256( \
+ _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))), \
+ _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1);
+
+#define LOAD_SRC_MERGE_128BIT(filter) \
+ __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); \
+ __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); \
+ __m128i src_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1); \
+ __m128i src_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1); \
+ __m128i filter_128bit = _mm256_castsi256_si128(filter); \
+ __m128i pw8_128bit = _mm256_castsi256_si128(pw8);
+
+#define FILTER_SRC_128BIT(filter) \
+ /* filter the source */ \
+ src_lo = _mm_maddubs_epi16(src_lo, filter); \
+ src_hi = _mm_maddubs_epi16(src_hi, filter); \
+ \
+ /* add 8 to source */ \
+ src_lo = _mm_add_epi16(src_lo, pw8_128bit); \
+ src_hi = _mm_add_epi16(src_hi, pw8_128bit); \
+ \
+ /* divide source by 16 */ \
+ src_lo = _mm_srai_epi16(src_lo, 4); \
+ src_hi = _mm_srai_epi16(src_hi, 4);
+
+// TODO(chiyotsai@google.com): These variance functions are macro-fied so we
+// don't have to manually optimize the individual for-loops. We could save some
+// binary size by optimizing the loops more carefully without duplicating the
+// code with a macro.
+#define MAKE_SUB_PIXEL_VAR_32XH(height, log2height) \
+ static AOM_INLINE int aom_sub_pixel_variance32x##height##_imp_avx2( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse) { \
+ __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; \
+ __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; \
+ __m256i zero_reg; \
+ int i, sum; \
+ sum_reg = _mm256_setzero_si256(); \
+ sse_reg = _mm256_setzero_si256(); \
+ zero_reg = _mm256_setzero_si256(); \
+ \
+ /* x_offset = 0 and y_offset = 0 */ \
+ if (x_offset == 0) { \
+ if (y_offset == 0) { \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+          /* expand each byte to 2 bytes */                                \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = 0 and y_offset = 4 */ \
+ } else if (y_offset == 4) { \
+ __m256i src_next_reg; \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ AVG_NEXT_SRC(src_reg, src_stride) \
+          /* expand each byte to 2 bytes */                                \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = 0 and y_offset = bilin interpolation */ \
+ } else { \
+ __m256i filter, pw8, src_next_reg; \
+ \
+ y_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ MERGE_NEXT_SRC(src_reg, src_stride) \
+ FILTER_SRC(filter) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ } \
+ /* x_offset = 4 and y_offset = 0 */ \
+ } else if (x_offset == 4) { \
+ if (y_offset == 0) { \
+ __m256i src_next_reg; \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ AVG_NEXT_SRC(src_reg, 1) \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = 4 and y_offset = 4 */ \
+ } else if (y_offset == 4) { \
+ __m256i src_next_reg, src_avg; \
+        /* load source and a second source starting one byte to the right */ \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ AVG_NEXT_SRC(src_reg, 1) \
+ for (i = 0; i < height; i++) { \
+ src_avg = src_reg; \
+ src += src_stride; \
+ LOAD_SRC_DST \
+ AVG_NEXT_SRC(src_reg, 1) \
+          /* average the previous row's source average with the current */  \
+ src_avg = _mm256_avg_epu8(src_avg, src_reg); \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_avg, zero_reg) \
+ /* save current source average */ \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride; \
+ } \
+ /* x_offset = 4 and y_offset = bilin interpolation */ \
+ } else { \
+ __m256i filter, pw8, src_next_reg, src_avg; \
+ y_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+        /* load source and a second source starting one byte to the right */ \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ AVG_NEXT_SRC(src_reg, 1) \
+ for (i = 0; i < height; i++) { \
+ /* save current source average */ \
+ src_avg = src_reg; \
+ src += src_stride; \
+ LOAD_SRC_DST \
+ AVG_NEXT_SRC(src_reg, 1) \
+ MERGE_WITH_SRC(src_avg, src_reg) \
+ FILTER_SRC(filter) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride; \
+ } \
+ } \
+ /* x_offset = bilin interpolation and y_offset = 0 */ \
+ } else { \
+ if (y_offset == 0) { \
+ __m256i filter, pw8, src_next_reg; \
+ x_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ FILTER_SRC(filter) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = bilin interpolation and y_offset = 4 */ \
+ } else if (y_offset == 4) { \
+ __m256i filter, pw8, src_next_reg, src_pack; \
+ x_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ FILTER_SRC(filter) \
+        /* pack the 16-bit values back to 8 bits in each 128-bit lane */    \
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ for (i = 0; i < height; i++) { \
+ src += src_stride; \
+ LOAD_SRC_DST \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ FILTER_SRC(filter) \
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+          /* average the previous packed row with the current one */        \
+ src_pack = _mm256_avg_epu8(src_pack, src_reg); \
+ MERGE_WITH_SRC(src_pack, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src_pack = src_reg; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = bilin interpolation and y_offset = bilin interpolation \
+ */ \
+ } else { \
+ __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; \
+ x_offset <<= 5; \
+ xfilter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ y_offset <<= 5; \
+ yfilter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+        /* load source and a second source starting one byte to the right */ \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ \
+ FILTER_SRC(xfilter) \
+        /* pack the 16-bit values back to 8 bits in each 128-bit lane */    \
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ for (i = 0; i < height; i++) { \
+ src += src_stride; \
+ LOAD_SRC_DST \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ FILTER_SRC(xfilter) \
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ /* merge previous pack to current pack source */ \
+ MERGE_WITH_SRC(src_pack, src_reg) \
+ /* filter the source */ \
+ FILTER_SRC(yfilter) \
+ src_pack = src_reg; \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride; \
+ } \
+ } \
+ } \
+ CALC_SUM_AND_SSE \
+ _mm256_zeroupper(); \
+ return sum; \
+ } \
+ unsigned int aom_sub_pixel_variance32x##height##_avx2( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse) { \
+ const int sum = aom_sub_pixel_variance32x##height##_imp_avx2( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, sse); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> (5 + log2height)); \
+ }
+
+MAKE_SUB_PIXEL_VAR_32XH(64, 6)
+MAKE_SUB_PIXEL_VAR_32XH(32, 5)
+MAKE_SUB_PIXEL_VAR_32XH(16, 4)
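+
+// The wrappers generated above apply var = sse - sum^2 / (32 * height); the
+// division is the right shift by (5 + log2height) because 32 * height is a
+// power of two.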
+
+#define AOM_SUB_PIXEL_VAR_AVX2(w, h, wf, hf, wlog2, hlog2) \
+ unsigned int aom_sub_pixel_variance##w##x##h##_avx2( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \
+ unsigned int sse = 0; \
+ int se = 0; \
+ for (int i = 0; i < (w / wf); ++i) { \
+ const uint8_t *src_ptr = src; \
+ const uint8_t *dst_ptr = dst; \
+ for (int j = 0; j < (h / hf); ++j) { \
+ unsigned int sse2; \
+ const int se2 = aom_sub_pixel_variance##wf##x##hf##_imp_avx2( \
+ src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
+ &sse2); \
+ dst_ptr += hf * dst_stride; \
+ src_ptr += hf * src_stride; \
+ se += se2; \
+ sse += sse2; \
+ } \
+ src += wf; \
+ dst += wf; \
+ } \
+ *sse_ptr = sse; \
+ return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \
+ }
+
+// Note: hf = AOMMIN(h, 64); capping the helper height avoids overflow.
+AOM_SUB_PIXEL_VAR_AVX2(128, 128, 32, 64, 7, 7)
+AOM_SUB_PIXEL_VAR_AVX2(128, 64, 32, 64, 7, 6)
+AOM_SUB_PIXEL_VAR_AVX2(64, 128, 32, 64, 6, 7)
+AOM_SUB_PIXEL_VAR_AVX2(64, 64, 32, 64, 6, 6)
+AOM_SUB_PIXEL_VAR_AVX2(64, 32, 32, 32, 6, 5)
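+
+// For these large sizes the block is tiled into 32-wide, at-most-64-tall
+// pieces, the per-tile sums and SSEs from the 32xH helper are accumulated,
+// and the final variance is sse - se^2 / (w * h) via the (wlog2 + hlog2)
+// shift.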
+
+#define MAKE_SUB_PIXEL_VAR_16XH(height, log2height) \
+ unsigned int aom_sub_pixel_variance16x##height##_avx2( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse) { \
+ __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; \
+ __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; \
+ __m256i zero_reg; \
+ int i, sum; \
+ sum_reg = _mm256_setzero_si256(); \
+ sse_reg = _mm256_setzero_si256(); \
+ zero_reg = _mm256_setzero_si256(); \
+ \
+ /* x_offset = 0 and y_offset = 0 */ \
+ if (x_offset == 0) { \
+ if (y_offset == 0) { \
+ for (i = 0; i < height; i += 2) { \
+ LOAD_SRC_DST_INSERT(src_stride, dst_stride) \
+          /* expand each byte to 2 bytes */                                \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += (src_stride << 1); \
+ dst += (dst_stride << 1); \
+ } \
+ /* x_offset = 0 and y_offset = 4 */ \
+ } else if (y_offset == 4) { \
+ __m256i src_next_reg; \
+ for (i = 0; i < height; i += 2) { \
+ LOAD_SRC_DST_INSERT(src_stride, dst_stride) \
+ AVG_NEXT_SRC_INSERT(src_reg, src_stride) \
+          /* expand each byte to 2 bytes */                                \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += (src_stride << 1); \
+ dst += (dst_stride << 1); \
+ } \
+ /* x_offset = 0 and y_offset = bilin interpolation */ \
+ } else { \
+ __m256i filter, pw8, src_next_reg; \
+ y_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ for (i = 0; i < height; i += 2) { \
+ LOAD_SRC_DST_INSERT(src_stride, dst_stride) \
+ MERGE_NEXT_SRC_INSERT(src_reg, src_stride) \
+ FILTER_SRC(filter) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += (src_stride << 1); \
+ dst += (dst_stride << 1); \
+ } \
+ } \
+ /* x_offset = 4 and y_offset = 0 */ \
+ } else if (x_offset == 4) { \
+ if (y_offset == 0) { \
+ __m256i src_next_reg; \
+ for (i = 0; i < height; i += 2) { \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ LOAD_DST_INSERT \
+ /* average between current and next stride source */ \
+ src_reg = _mm256_avg_epu8(src_reg, src_next_reg); \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += (src_stride << 1); \
+ dst += (dst_stride << 1); \
+ } \
+ /* x_offset = 4 and y_offset = 4 */ \
+ } else if (y_offset == 4) { \
+ __m256i src_next_reg, src_avg, src_temp; \
+ /* load and insert source and next row source */ \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ src_avg = _mm256_avg_epu8(src_reg, src_next_reg); \
+ src += src_stride << 1; \
+ for (i = 0; i < height - 2; i += 2) { \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg); \
+ src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21); \
+ src_temp = _mm256_avg_epu8(src_avg, src_temp); \
+ LOAD_DST_INSERT \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_temp, zero_reg) \
+ /* save current source average */ \
+ src_avg = src_next_reg; \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride << 1; \
+ src += src_stride << 1; \
+ } \
+ /* last 2 rows processing happens here */ \
+ __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); \
+ __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); \
+ src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1); \
+ src_next_reg = _mm256_permute2x128_si256( \
+ src_avg, _mm256_castsi128_si256(src_reg_0), 0x21); \
+ LOAD_DST_INSERT \
+ src_avg = _mm256_avg_epu8(src_avg, src_next_reg); \
+ MERGE_WITH_SRC(src_avg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ } else { \
+ /* x_offset = 4 and y_offset = bilin interpolation */ \
+ __m256i filter, pw8, src_next_reg, src_avg, src_temp; \
+ y_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ /* load and insert source and next row source */ \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ src_avg = _mm256_avg_epu8(src_reg, src_next_reg); \
+ src += src_stride << 1; \
+ for (i = 0; i < height - 2; i += 2) { \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg); \
+ src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21); \
+ LOAD_DST_INSERT \
+ MERGE_WITH_SRC(src_avg, src_temp) \
+ /* save current source average */ \
+ src_avg = src_next_reg; \
+ FILTER_SRC(filter) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride << 1; \
+ src += src_stride << 1; \
+ } \
+ /* last 2 rows processing happens here */ \
+ __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src)); \
+ __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); \
+ src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1); \
+ src_next_reg = _mm256_permute2x128_si256( \
+ src_avg, _mm256_castsi128_si256(src_reg_0), 0x21); \
+ LOAD_DST_INSERT \
+ MERGE_WITH_SRC(src_avg, src_next_reg) \
+ FILTER_SRC(filter) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ } \
+ /* x_offset = bilin interpolation and y_offset = 0 */ \
+ } else { \
+ if (y_offset == 0) { \
+ __m256i filter, pw8, src_next_reg; \
+ x_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ for (i = 0; i < height; i += 2) { \
+ LOAD_SRC_DST_INSERT(src_stride, dst_stride) \
+ MERGE_NEXT_SRC_INSERT(src_reg, 1) \
+ FILTER_SRC(filter) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += (src_stride << 1); \
+ dst += (dst_stride << 1); \
+ } \
+ /* x_offset = bilin interpolation and y_offset = 4 */ \
+ } else if (y_offset == 4) { \
+ __m256i filter, pw8, src_next_reg, src_pack; \
+ x_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ /* load and insert source and next row source */ \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ MERGE_WITH_SRC(src_reg, src_next_reg) \
+ FILTER_SRC(filter) \
+        /* pack the 16-bit values back to 8 bits in each 128-bit lane */    \
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ src += src_stride << 1; \
+ for (i = 0; i < height - 2; i += 2) { \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ LOAD_DST_INSERT \
+ MERGE_WITH_SRC(src_reg, src_next_reg) \
+ FILTER_SRC(filter) \
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21); \
+          /* average the previous packed row with the current one */        \
+ src_pack = _mm256_avg_epu8(src_pack, src_next_reg); \
+ MERGE_WITH_SRC(src_pack, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src_pack = src_reg; \
+ src += src_stride << 1; \
+ dst += dst_stride << 1; \
+ } \
+ /* last 2 rows processing happens here */ \
+ LOAD_SRC_MERGE_128BIT(filter) \
+ LOAD_DST_INSERT \
+ FILTER_SRC_128BIT(filter_128bit) \
+ src_reg_0 = _mm_packus_epi16(src_lo, src_hi); \
+ src_next_reg = _mm256_permute2x128_si256( \
+ src_pack, _mm256_castsi128_si256(src_reg_0), 0x21); \
+        /* average the previous packed row with the current one */          \
+ src_pack = _mm256_avg_epu8(src_pack, src_next_reg); \
+ MERGE_WITH_SRC(src_pack, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ } else { \
+ /* x_offset = bilin interpolation and y_offset = bilin interpolation \
+ */ \
+ __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; \
+ x_offset <<= 5; \
+ xfilter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ y_offset <<= 5; \
+ yfilter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ /* load and insert source and next row source */ \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ MERGE_WITH_SRC(src_reg, src_next_reg) \
+ FILTER_SRC(xfilter) \
+        /* pack the 16-bit values back to 8 bits in each 128-bit lane */    \
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ src += src_stride << 1; \
+ for (i = 0; i < height - 2; i += 2) { \
+ LOAD_SRC_NEXT_BYTE_INSERT \
+ LOAD_DST_INSERT \
+ MERGE_WITH_SRC(src_reg, src_next_reg) \
+ FILTER_SRC(xfilter) \
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21); \
+          /* merge the previous packed row with the current one */          \
+ MERGE_WITH_SRC(src_pack, src_next_reg) \
+ /* filter the source */ \
+ FILTER_SRC(yfilter) \
+ src_pack = src_reg; \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride << 1; \
+ dst += dst_stride << 1; \
+ } \
+ /* last 2 rows processing happens here */ \
+ LOAD_SRC_MERGE_128BIT(xfilter) \
+ LOAD_DST_INSERT \
+ FILTER_SRC_128BIT(filter_128bit) \
+ src_reg_0 = _mm_packus_epi16(src_lo, src_hi); \
+ src_next_reg = _mm256_permute2x128_si256( \
+ src_pack, _mm256_castsi128_si256(src_reg_0), 0x21); \
+ MERGE_WITH_SRC(src_pack, src_next_reg) \
+ FILTER_SRC(yfilter) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ } \
+ } \
+ CALC_SUM_AND_SSE \
+ _mm256_zeroupper(); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> (4 + log2height)); \
+ }
+
+MAKE_SUB_PIXEL_VAR_16XH(32, 5)
+MAKE_SUB_PIXEL_VAR_16XH(16, 4)
+MAKE_SUB_PIXEL_VAR_16XH(8, 3)
+#if !CONFIG_REALTIME_ONLY
+MAKE_SUB_PIXEL_VAR_16XH(64, 6)
+MAKE_SUB_PIXEL_VAR_16XH(4, 2)
+#endif
+
+#define MAKE_SUB_PIXEL_AVG_VAR_32XH(height, log2height) \
+ int aom_sub_pixel_avg_variance32x##height##_imp_avx2( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride, \
+ unsigned int *sse) { \
+ __m256i sec_reg; \
+ __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi; \
+ __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi; \
+ __m256i zero_reg; \
+ int i, sum; \
+ sum_reg = _mm256_setzero_si256(); \
+ sse_reg = _mm256_setzero_si256(); \
+ zero_reg = _mm256_setzero_si256(); \
+ \
+ /* x_offset = 0 and y_offset = 0 */ \
+ if (x_offset == 0) { \
+ if (y_offset == 0) { \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg); \
+ sec += sec_stride; \
+          /* expand each byte to 2 bytes */                                \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ } else if (y_offset == 4) { \
+ __m256i src_next_reg; \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ AVG_NEXT_SRC(src_reg, src_stride) \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg); \
+ sec += sec_stride; \
+          /* expand each byte to 2 bytes */                                \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = 0 and y_offset = bilin interpolation */ \
+ } else { \
+ __m256i filter, pw8, src_next_reg; \
+ \
+ y_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ MERGE_NEXT_SRC(src_reg, src_stride) \
+ FILTER_SRC(filter) \
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg); \
+ sec += sec_stride; \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ } \
+ /* x_offset = 4 and y_offset = 0 */ \
+ } else if (x_offset == 4) { \
+ if (y_offset == 0) { \
+ __m256i src_next_reg; \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ AVG_NEXT_SRC(src_reg, 1) \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg); \
+ sec += sec_stride; \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = 4 and y_offset = 4 */ \
+ } else if (y_offset == 4) { \
+ __m256i src_next_reg, src_avg; \
+        /* load source and a second source starting one byte to the right */ \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ AVG_NEXT_SRC(src_reg, 1) \
+ for (i = 0; i < height; i++) { \
+ /* save current source average */ \
+ src_avg = src_reg; \
+ src += src_stride; \
+ LOAD_SRC_DST \
+ AVG_NEXT_SRC(src_reg, 1) \
+          /* average the previous row's source average with the current */  \
+ src_avg = _mm256_avg_epu8(src_avg, src_reg); \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_avg = _mm256_avg_epu8(src_avg, sec_reg); \
+ sec += sec_stride; \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_avg, zero_reg) \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride; \
+ } \
+ /* x_offset = 4 and y_offset = bilin interpolation */ \
+ } else { \
+ __m256i filter, pw8, src_next_reg, src_avg; \
+ y_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+        /* load source and a second source starting one byte to the right */ \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ AVG_NEXT_SRC(src_reg, 1) \
+ for (i = 0; i < height; i++) { \
+ /* save current source average */ \
+ src_avg = src_reg; \
+ src += src_stride; \
+ LOAD_SRC_DST \
+ AVG_NEXT_SRC(src_reg, 1) \
+ MERGE_WITH_SRC(src_avg, src_reg) \
+ FILTER_SRC(filter) \
+ src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_avg = _mm256_avg_epu8(src_avg, sec_reg); \
+ /* expand each byte to 2 bytes */ \
+ MERGE_WITH_SRC(src_avg, zero_reg) \
+ sec += sec_stride; \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride; \
+ } \
+ } \
+ /* x_offset = bilin interpolation and y_offset = 0 */ \
+ } else { \
+ if (y_offset == 0) { \
+ __m256i filter, pw8, src_next_reg; \
+ x_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ for (i = 0; i < height; i++) { \
+ LOAD_SRC_DST \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ FILTER_SRC(filter) \
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_reg = _mm256_avg_epu8(src_reg, sec_reg); \
+ MERGE_WITH_SRC(src_reg, zero_reg) \
+ sec += sec_stride; \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+ /* x_offset = bilin interpolation and y_offset = 4 */ \
+ } else if (y_offset == 4) { \
+ __m256i filter, pw8, src_next_reg, src_pack; \
+ x_offset <<= 5; \
+ filter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ FILTER_SRC(filter) \
+        /* pack the 16-bit values back to 8 bits in each 128-bit lane */    \
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ for (i = 0; i < height; i++) { \
+ src += src_stride; \
+ LOAD_SRC_DST \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ FILTER_SRC(filter) \
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+          /* average the previous packed row with the current one */        \
+ src_pack = _mm256_avg_epu8(src_pack, src_reg); \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_pack = _mm256_avg_epu8(src_pack, sec_reg); \
+ sec += sec_stride; \
+ MERGE_WITH_SRC(src_pack, zero_reg) \
+ src_pack = src_reg; \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride; \
+ } \
+ /* x_offset = bilin interpolation and y_offset = bilin interpolation \
+ */ \
+ } else { \
+ __m256i xfilter, yfilter, pw8, src_next_reg, src_pack; \
+ x_offset <<= 5; \
+ xfilter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + x_offset)); \
+ y_offset <<= 5; \
+ yfilter = _mm256_load_si256( \
+ (__m256i const *)(bilinear_filters_avx2 + y_offset)); \
+ pw8 = _mm256_set1_epi16(8); \
+        /* load source and a second source starting one byte to the right */ \
+ src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ \
+ FILTER_SRC(xfilter) \
+        /* pack the 16-bit values back to 8 bits in each 128-bit lane */    \
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ for (i = 0; i < height; i++) { \
+ src += src_stride; \
+ LOAD_SRC_DST \
+ MERGE_NEXT_SRC(src_reg, 1) \
+ FILTER_SRC(xfilter) \
+ src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ /* merge previous pack to current pack source */ \
+ MERGE_WITH_SRC(src_pack, src_reg) \
+ /* filter the source */ \
+ FILTER_SRC(yfilter) \
+ src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi); \
+ sec_reg = _mm256_loadu_si256((__m256i const *)(sec)); \
+ src_pack = _mm256_avg_epu8(src_pack, sec_reg); \
+ MERGE_WITH_SRC(src_pack, zero_reg) \
+ src_pack = src_reg; \
+ sec += sec_stride; \
+ CALC_SUM_SSE_INSIDE_LOOP \
+ dst += dst_stride; \
+ } \
+ } \
+ } \
+ CALC_SUM_AND_SSE \
+ _mm256_zeroupper(); \
+ return sum; \
+ } \
+ unsigned int aom_sub_pixel_avg_variance32x##height##_avx2( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse, \
+ const uint8_t *sec_ptr) { \
+ const int sum = aom_sub_pixel_avg_variance32x##height##_imp_avx2( \
+ src, src_stride, x_offset, y_offset, dst, dst_stride, sec_ptr, 32, \
+ sse); \
+ return *sse - (unsigned int)(((int64_t)sum * sum) >> (5 + log2height)); \
+ }
+
+MAKE_SUB_PIXEL_AVG_VAR_32XH(64, 6)
+MAKE_SUB_PIXEL_AVG_VAR_32XH(32, 5)
+MAKE_SUB_PIXEL_AVG_VAR_32XH(16, 4)
+
+#define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, hf, wlog2, hlog2) \
+ unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \
+ const uint8_t *sec) { \
+ unsigned int sse = 0; \
+ int se = 0; \
+ for (int i = 0; i < (w / wf); ++i) { \
+ const uint8_t *src_ptr = src; \
+ const uint8_t *dst_ptr = dst; \
+ const uint8_t *sec_ptr = sec; \
+ for (int j = 0; j < (h / hf); ++j) { \
+ unsigned int sse2; \
+ const int se2 = aom_sub_pixel_avg_variance##wf##x##hf##_imp_avx2( \
+ src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
+ sec_ptr, w, &sse2); \
+ dst_ptr += hf * dst_stride; \
+ src_ptr += hf * src_stride; \
+ sec_ptr += hf * w; \
+ se += se2; \
+ sse += sse2; \
+ } \
+ src += wf; \
+ dst += wf; \
+ sec += wf; \
+ } \
+ *sse_ptr = sse; \
+ return sse - (unsigned int)(((int64_t)se * se) >> (wlog2 + hlog2)); \
+ }
+
+// Note: hf = AOMMIN(h, 64); capping the helper height avoids overflow.
+AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 128, 32, 64, 7, 7)
+AOM_SUB_PIXEL_AVG_VAR_AVX2(128, 64, 32, 64, 7, 6)
+AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 128, 32, 64, 6, 7)
+AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 64, 32, 64, 6, 6)
+AOM_SUB_PIXEL_AVG_VAR_AVX2(64, 32, 32, 32, 6, 5)
diff --git a/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c b/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c
new file mode 100644
index 0000000000..699002195b
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/variance_impl_ssse3.c
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/x86/synonyms.h"
+
+void aom_var_filter_block2d_bil_first_pass_ssse3(
+ const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line,
+ unsigned int pixel_step, unsigned int output_height,
+ unsigned int output_width, const uint8_t *filter) {
+ // Note: filter[0], filter[1] could be {128, 0}, where 128 will overflow
+ // in computation using _mm_maddubs_epi16.
+ // Change {128, 0} to {64, 0} and reduce FILTER_BITS by 1 to avoid overflow.
+ const int16_t round = (1 << (FILTER_BITS - 1)) >> 1;
+ const __m128i r = _mm_set1_epi16(round);
+ const int8_t f0 = (int8_t)(filter[0] >> 1);
+ const int8_t f1 = (int8_t)(filter[1] >> 1);
+ const __m128i filters = _mm_setr_epi8(f0, f1, f0, f1, f0, f1, f0, f1, f0, f1,
+ f0, f1, f0, f1, f0, f1);
+ unsigned int i, j;
+ (void)pixel_step;
+
+ if (output_width >= 8) {
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; j += 8) {
+ // load source
+ __m128i source_low = xx_loadl_64(a);
+ __m128i source_hi = xx_loadl_64(a + 1);
+
+ // unpack to:
+ // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
+ // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
+ __m128i source = _mm_unpacklo_epi8(source_low, source_hi);
+
+ // b[i] = a[i] * filter[0] + a[i + 1] * filter[1]
+ __m128i res = _mm_maddubs_epi16(source, filters);
+
+ // round
+ res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
+
+ xx_storeu_128(b, res);
+
+ a += 8;
+ b += 8;
+ }
+
+ a += src_pixels_per_line - output_width;
+ }
+ } else {
+ const __m128i shuffle_mask =
+ _mm_setr_epi8(0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8);
+ for (i = 0; i < output_height; ++i) {
+ // load source, only first 5 values are meaningful:
+ // { a[0], a[1], a[2], a[3], a[4], xxxx }
+ __m128i source = xx_loadl_64(a);
+
+ // shuffle, up to the first 8 are useful
+ // { a[0], a[1], a[1], a[2], a[2], a[3], a[3], a[4],
+ // a[4], a[5], a[5], a[6], a[6], a[7], a[7], a[8] }
+ __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
+
+ __m128i res = _mm_maddubs_epi16(source_shuffle, filters);
+ res = _mm_srai_epi16(_mm_add_epi16(res, r), FILTER_BITS - 1);
+
+ xx_storel_64(b, res);
+
+ a += src_pixels_per_line;
+ b += output_width;
+ }
+ }
+}
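To make the overflow note above concrete: _mm_maddubs_epi16 treats its second operand as signed bytes, so a tap of 128 cannot be represented, which is why both taps are halved and the shift drops by one. A minimal scalar sketch of the per-pixel arithmetic (the helper name is hypothetical, not part of this file):

// Scalar sketch of the first pass above: taps are pre-halved, so the
// rounding constant and the shift both use FILTER_BITS - 1.
static uint16_t bil_first_pass_scalar(uint8_t a0, uint8_t a1,
                                      const uint8_t *filter) {
  const int f0 = filter[0] >> 1;  // e.g. {128, 0} becomes {64, 0}
  const int f1 = filter[1] >> 1;
  const int round = (1 << (FILTER_BITS - 1)) >> 1;
  return (uint16_t)((a0 * f0 + a1 * f1 + round) >> (FILTER_BITS - 1));
}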
+
+void aom_var_filter_block2d_bil_second_pass_ssse3(
+ const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line,
+ unsigned int pixel_step, unsigned int output_height,
+ unsigned int output_width, const uint8_t *filter) {
+ const int16_t round = (1 << FILTER_BITS) >> 1;
+ const __m128i r = _mm_set1_epi32(round);
+ const __m128i filters =
+ _mm_setr_epi16(filter[0], filter[1], filter[0], filter[1], filter[0],
+ filter[1], filter[0], filter[1]);
+ const __m128i shuffle_mask =
+ _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
+ const __m128i mask =
+ _mm_setr_epi8(0, 4, 8, 12, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1);
+ unsigned int i, j;
+
+ for (i = 0; i < output_height; ++i) {
+ for (j = 0; j < output_width; j += 4) {
+ // load source as:
+ // { a[0], a[1], a[2], a[3], a[w], a[w+1], a[w+2], a[w+3] }
+ __m128i source1 = xx_loadl_64(a);
+ __m128i source2 = xx_loadl_64(a + pixel_step);
+ __m128i source = _mm_unpacklo_epi64(source1, source2);
+
+ // shuffle source to:
+ // { a[0], a[w], a[1], a[w+1], a[2], a[w+2], a[3], a[w+3] }
+ __m128i source_shuffle = _mm_shuffle_epi8(source, shuffle_mask);
+
+ // b[i] = a[i] * filter[0] + a[w + i] * filter[1]
+ __m128i res = _mm_madd_epi16(source_shuffle, filters);
+
+ // round
+ res = _mm_srai_epi32(_mm_add_epi32(res, r), FILTER_BITS);
+
+      // shuffle to keep the low 8 bits of each 32-bit result
+ res = _mm_shuffle_epi8(res, mask);
+
+ xx_storel_32(b, res);
+
+ a += 4;
+ b += 4;
+ }
+
+ a += src_pixels_per_line - output_width;
+ }
+}
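The second pass follows the same pattern on the 16-bit intermediate rows; a scalar sketch under the same caveat (hypothetical helper, unhalved taps and the full FILTER_BITS shift):

// Scalar sketch of the second pass above: two rows pixel_step apart are
// combined with the unhalved taps and rounded/shifted by FILTER_BITS.
static uint8_t bil_second_pass_scalar(uint16_t a0, uint16_t a1,
                                      const uint8_t *filter) {
  const int round = (1 << FILTER_BITS) >> 1;
  return (uint8_t)((a0 * filter[0] + a1 * filter[1] + round) >> FILTER_BITS);
}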
diff --git a/third_party/aom/aom_dsp/x86/variance_sse2.c b/third_party/aom/aom_dsp/x86/variance_sse2.c
new file mode 100644
index 0000000000..faec9cf73d
--- /dev/null
+++ b/third_party/aom/aom_dsp/x86/variance_sse2.c
@@ -0,0 +1,802 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+
+#include "config/aom_config.h"
+#include "config/aom_dsp_rtcd.h"
+
+#include "aom_dsp/blend.h"
+#include "aom_dsp/x86/mem_sse2.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "aom_ports/mem.h"
+
+unsigned int aom_get_mb_ss_sse2(const int16_t *src) {
+ __m128i vsum = _mm_setzero_si128();
+ int i;
+
+ for (i = 0; i < 32; ++i) {
+ const __m128i v = xx_loadu_128(src);
+ vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
+ src += 8;
+ }
+
+ vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
+ return (unsigned int)_mm_cvtsi128_si32(vsum);
+}
+
+static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) {
+ const __m128i p0 = _mm_cvtsi32_si128(loadu_int32(p + 0 * stride));
+ const __m128i p1 = _mm_cvtsi32_si128(loadu_int32(p + 1 * stride));
+ return _mm_unpacklo_epi8(_mm_unpacklo_epi32(p0, p1), _mm_setzero_si128());
+}
+
+static INLINE __m128i load8_8to16_sse2(const uint8_t *const p) {
+ const __m128i p0 = _mm_loadl_epi64((const __m128i *)p);
+ return _mm_unpacklo_epi8(p0, _mm_setzero_si128());
+}
+
+static INLINE void load16_8to16_sse2(const uint8_t *const p, __m128i *out) {
+ const __m128i p0 = _mm_loadu_si128((const __m128i *)p);
+ out[0] = _mm_unpacklo_epi8(p0, _mm_setzero_si128()); // lower 8 values
+ out[1] = _mm_unpackhi_epi8(p0, _mm_setzero_si128()); // upper 8 values
+}
+
+// Accumulate 4 32-bit numbers in val into one 32-bit number
+static INLINE unsigned int add32x4_sse2(__m128i val) {
+ val = _mm_add_epi32(val, _mm_srli_si128(val, 8));
+ val = _mm_add_epi32(val, _mm_srli_si128(val, 4));
+ return (unsigned int)_mm_cvtsi128_si32(val);
+}
+
+// Accumulate 8 16-bit values in sum into 4 32-bit values. Each lane is
+// sign-extended by duplicating it via unpack and arithmetic-shifting by 16.
+static INLINE __m128i sum_to_32bit_sse2(const __m128i sum) {
+ const __m128i sum_lo = _mm_srai_epi32(_mm_unpacklo_epi16(sum, sum), 16);
+ const __m128i sum_hi = _mm_srai_epi32(_mm_unpackhi_epi16(sum, sum), 16);
+ return _mm_add_epi32(sum_lo, sum_hi);
+}
+
+static INLINE void variance_kernel_sse2(const __m128i src, const __m128i ref,
+ __m128i *const sse,
+ __m128i *const sum) {
+ const __m128i diff = _mm_sub_epi16(src, ref);
+ *sse = _mm_add_epi32(*sse, _mm_madd_epi16(diff, diff));
+ *sum = _mm_add_epi16(*sum, diff);
+}
+
+// Can handle 128 pixels' diff sum (such as 8x16 or 16x8)
+// Slightly faster than variance_final_256_pel_sse2()
+// diff sum of 128 pixels can still fit in 16bit integer
+static INLINE void variance_final_128_pel_sse2(__m128i vsse, __m128i vsum,
+ unsigned int *const sse,
+ int *const sum) {
+ *sse = add32x4_sse2(vsse);
+
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
+ *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+}
+
+// Can handle 256 pixels' diff sum (such as 16x16)
+static INLINE void variance_final_256_pel_sse2(__m128i vsse, __m128i vsum,
+ unsigned int *const sse,
+ int *const sum) {
+ *sse = add32x4_sse2(vsse);
+
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+ *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+ *sum += (int16_t)_mm_extract_epi16(vsum, 1);
+}
+
+// Can handle 512 pixels' diff sum (such as 16x32 or 32x16)
+static INLINE void variance_final_512_pel_sse2(__m128i vsse, __m128i vsum,
+ unsigned int *const sse,
+ int *const sum) {
+ *sse = add32x4_sse2(vsse);
+
+ vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+ vsum = _mm_unpacklo_epi16(vsum, vsum);
+ vsum = _mm_srai_epi32(vsum, 16);
+ *sum = (int)add32x4_sse2(vsum);
+}
+
+// Can handle 1024 pixels' diff sum (such as 32x32)
+static INLINE void variance_final_1024_pel_sse2(__m128i vsse, __m128i vsum,
+ unsigned int *const sse,
+ int *const sum) {
+ *sse = add32x4_sse2(vsse);
+
+ vsum = sum_to_32bit_sse2(vsum);
+ *sum = (int)add32x4_sse2(vsum);
+}
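The pixel-count limits in the reducers above follow from the per-pixel diff range of [-255, 255]: 128 diffs sum to at most 128 * 255 = 32640, which still fits in a signed 16-bit lane, while 256 diffs can reach 65280, so the 256-pel variant stops one step early and adds its last two 16-bit lanes as ints; the 512- and 1024-pel variants sign-extend to 32 bits before the final reduction.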
+
+static INLINE void variance4_sse2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m128i *const sse,
+ __m128i *const sum) {
+ assert(h <= 256); // May overflow for larger height.
+ *sum = _mm_setzero_si128();
+
+ for (int i = 0; i < h; i += 2) {
+ const __m128i s = load4x2_sse2(src, src_stride);
+ const __m128i r = load4x2_sse2(ref, ref_stride);
+
+ variance_kernel_sse2(s, r, sse, sum);
+ src += 2 * src_stride;
+ ref += 2 * ref_stride;
+ }
+}
+
+static INLINE void variance8_sse2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m128i *const sse,
+ __m128i *const sum) {
+ assert(h <= 128); // May overflow for larger height.
+ *sum = _mm_setzero_si128();
+ *sse = _mm_setzero_si128();
+ for (int i = 0; i < h; i++) {
+ const __m128i s = load8_8to16_sse2(src);
+ const __m128i r = load8_8to16_sse2(ref);
+
+ variance_kernel_sse2(s, r, sse, sum);
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+static INLINE void variance16_kernel_sse2(const uint8_t *const src,
+ const uint8_t *const ref,
+ __m128i *const sse,
+ __m128i *const sum) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i s = _mm_loadu_si128((const __m128i *)src);
+ const __m128i r = _mm_loadu_si128((const __m128i *)ref);
+ const __m128i src0 = _mm_unpacklo_epi8(s, zero);
+ const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
+ const __m128i src1 = _mm_unpackhi_epi8(s, zero);
+ const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
+
+ variance_kernel_sse2(src0, ref0, sse, sum);
+ variance_kernel_sse2(src1, ref1, sse, sum);
+}
+
+static INLINE void variance16_sse2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m128i *const sse,
+ __m128i *const sum) {
+ assert(h <= 64); // May overflow for larger height.
+ *sum = _mm_setzero_si128();
+
+ for (int i = 0; i < h; ++i) {
+ variance16_kernel_sse2(src, ref, sse, sum);
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+static INLINE void variance32_sse2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m128i *const sse,
+ __m128i *const sum) {
+ assert(h <= 32); // May overflow for larger height.
+ // Don't initialize sse here since it's an accumulation.
+ *sum = _mm_setzero_si128();
+
+ for (int i = 0; i < h; ++i) {
+ variance16_kernel_sse2(src + 0, ref + 0, sse, sum);
+ variance16_kernel_sse2(src + 16, ref + 16, sse, sum);
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+static INLINE void variance64_sse2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m128i *const sse,
+ __m128i *const sum) {
+ assert(h <= 16); // May overflow for larger height.
+ *sum = _mm_setzero_si128();
+
+ for (int i = 0; i < h; ++i) {
+ variance16_kernel_sse2(src + 0, ref + 0, sse, sum);
+ variance16_kernel_sse2(src + 16, ref + 16, sse, sum);
+ variance16_kernel_sse2(src + 32, ref + 32, sse, sum);
+ variance16_kernel_sse2(src + 48, ref + 48, sse, sum);
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+static INLINE void variance128_sse2(const uint8_t *src, const int src_stride,
+ const uint8_t *ref, const int ref_stride,
+ const int h, __m128i *const sse,
+ __m128i *const sum) {
+ assert(h <= 8); // May overflow for larger height.
+ *sum = _mm_setzero_si128();
+
+ for (int i = 0; i < h; ++i) {
+ for (int j = 0; j < 4; ++j) {
+ const int offset0 = j << 5;
+ const int offset1 = offset0 + 16;
+ variance16_kernel_sse2(src + offset0, ref + offset0, sse, sum);
+ variance16_kernel_sse2(src + offset1, ref + offset1, sse, sum);
+ }
+ src += src_stride;
+ ref += ref_stride;
+ }
+}
+
+void aom_get_var_sse_sum_8x8_quad_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ uint32_t *sse8x8, int *sum8x8,
+ unsigned int *tot_sse, int *tot_sum,
+ uint32_t *var8x8) {
+ // Loop over 4 8x8 blocks. Process one 8x32 block.
+ for (int k = 0; k < 4; k++) {
+ const uint8_t *src = src_ptr;
+ const uint8_t *ref = ref_ptr;
+ __m128i vsum = _mm_setzero_si128();
+ __m128i vsse = _mm_setzero_si128();
+ for (int i = 0; i < 8; i++) {
+ const __m128i s = load8_8to16_sse2(src + (k * 8));
+ const __m128i r = load8_8to16_sse2(ref + (k * 8));
+ const __m128i diff = _mm_sub_epi16(s, r);
+ vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff, diff));
+ vsum = _mm_add_epi16(vsum, diff);
+
+ src += src_stride;
+ ref += ref_stride;
+ }
+ variance_final_128_pel_sse2(vsse, vsum, &sse8x8[k], &sum8x8[k]);
+ }
+
+ // Calculate variance at 8x8 level and total sse, sum of 8x32 block.
+ *tot_sse += sse8x8[0] + sse8x8[1] + sse8x8[2] + sse8x8[3];
+ *tot_sum += sum8x8[0] + sum8x8[1] + sum8x8[2] + sum8x8[3];
+ for (int i = 0; i < 4; i++)
+ var8x8[i] = sse8x8[i] - (uint32_t)(((int64_t)sum8x8[i] * sum8x8[i]) >> 6);
+}
+
+void aom_get_var_sse_sum_16x16_dual_sse2(const uint8_t *src_ptr, int src_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ uint32_t *sse16x16,
+ unsigned int *tot_sse, int *tot_sum,
+ uint32_t *var16x16) {
+ int sum16x16[2] = { 0 };
+ // Loop over 2 16x16 blocks. Process one 16x32 block.
+ for (int k = 0; k < 2; k++) {
+ const uint8_t *src = src_ptr;
+ const uint8_t *ref = ref_ptr;
+ __m128i vsum = _mm_setzero_si128();
+ __m128i vsse = _mm_setzero_si128();
+ for (int i = 0; i < 16; i++) {
+ __m128i s[2];
+ __m128i r[2];
+ load16_8to16_sse2(src + (k * 16), s);
+ load16_8to16_sse2(ref + (k * 16), r);
+ const __m128i diff0 = _mm_sub_epi16(s[0], r[0]);
+ const __m128i diff1 = _mm_sub_epi16(s[1], r[1]);
+ vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
+ vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
+ vsum = _mm_add_epi16(vsum, _mm_add_epi16(diff0, diff1));
+ src += src_stride;
+ ref += ref_stride;
+ }
+ variance_final_256_pel_sse2(vsse, vsum, &sse16x16[k], &sum16x16[k]);
+ }
+
+ // Calculate variance at 16x16 level and total sse, sum of 16x32 block.
+ *tot_sse += sse16x16[0] + sse16x16[1];
+ *tot_sum += sum16x16[0] + sum16x16[1];
+ for (int i = 0; i < 2; i++)
+ var16x16[i] =
+ sse16x16[i] - (uint32_t)(((int64_t)sum16x16[i] * sum16x16[i]) >> 8);
+}
+
+#define AOM_VAR_NO_LOOP_SSE2(bw, bh, bits, max_pixels) \
+ unsigned int aom_variance##bw##x##bh##_sse2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ __m128i vsse = _mm_setzero_si128(); \
+ __m128i vsum; \
+ int sum = 0; \
+ variance##bw##_sse2(src, src_stride, ref, ref_stride, bh, &vsse, &vsum); \
+ variance_final_##max_pixels##_pel_sse2(vsse, vsum, sse, &sum); \
+ assert(sum <= 255 * bw * bh); \
+ assert(sum >= -255 * bw * bh); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \
+ }
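The return statement in the macro is the standard identity var = SSE - sum^2 / (w * h); because w * h is always a power of two here, the division becomes a shift by bits = log2(w * h). A minimal scalar sketch (the helper name is hypothetical):

// variance = SSE - (sum * sum) / (w * h), with w * h == 1 << bits
// (e.g. bits == 8 for the 16x16 instantiation below).
static unsigned int variance_from_sse_sum(unsigned int sse, int sum,
                                          int bits) {
  return sse - (unsigned int)(((int64_t)sum * sum) >> bits);
}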
+
+AOM_VAR_NO_LOOP_SSE2(4, 4, 4, 128)
+AOM_VAR_NO_LOOP_SSE2(4, 8, 5, 128)
+AOM_VAR_NO_LOOP_SSE2(4, 16, 6, 128)
+
+AOM_VAR_NO_LOOP_SSE2(8, 4, 5, 128)
+AOM_VAR_NO_LOOP_SSE2(8, 8, 6, 128)
+AOM_VAR_NO_LOOP_SSE2(8, 16, 7, 128)
+
+AOM_VAR_NO_LOOP_SSE2(16, 8, 7, 128)
+AOM_VAR_NO_LOOP_SSE2(16, 16, 8, 256)
+AOM_VAR_NO_LOOP_SSE2(16, 32, 9, 512)
+
+AOM_VAR_NO_LOOP_SSE2(32, 8, 8, 256)
+AOM_VAR_NO_LOOP_SSE2(32, 16, 9, 512)
+AOM_VAR_NO_LOOP_SSE2(32, 32, 10, 1024)
+
+#if !CONFIG_REALTIME_ONLY
+AOM_VAR_NO_LOOP_SSE2(16, 4, 6, 128)
+AOM_VAR_NO_LOOP_SSE2(8, 32, 8, 256)
+AOM_VAR_NO_LOOP_SSE2(16, 64, 10, 1024)
+#endif
+
+#define AOM_VAR_LOOP_SSE2(bw, bh, bits, uh) \
+ unsigned int aom_variance##bw##x##bh##_sse2( \
+ const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
+ unsigned int *sse) { \
+ __m128i vsse = _mm_setzero_si128(); \
+ __m128i vsum = _mm_setzero_si128(); \
+ for (int i = 0; i < (bh / uh); ++i) { \
+ __m128i vsum16; \
+ variance##bw##_sse2(src, src_stride, ref, ref_stride, uh, &vsse, \
+ &vsum16); \
+ vsum = _mm_add_epi32(vsum, sum_to_32bit_sse2(vsum16)); \
+ src += (src_stride * uh); \
+ ref += (ref_stride * uh); \
+ } \
+ *sse = add32x4_sse2(vsse); \
+ int sum = (int)add32x4_sse2(vsum); \
+ assert(sum <= 255 * bw * bh); \
+ assert(sum >= -255 * bw * bh); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) >> bits); \
+ }
+
+AOM_VAR_LOOP_SSE2(32, 64, 11, 32) // 32x32 * ( 64/32 )
+
+AOM_VAR_LOOP_SSE2(64, 32, 11, 16) // 64x16 * ( 32/16 )
+AOM_VAR_LOOP_SSE2(64, 64, 12, 16) // 64x16 * ( 64/16 )
+AOM_VAR_LOOP_SSE2(64, 128, 13, 16) // 64x16 * ( 128/16 )
+
+AOM_VAR_LOOP_SSE2(128, 64, 13, 8) // 128x8 * ( 64/8 )
+AOM_VAR_LOOP_SSE2(128, 128, 14, 8) // 128x8 * ( 128/8 )
+
+#if !CONFIG_REALTIME_ONLY
+AOM_VAR_NO_LOOP_SSE2(64, 16, 10, 1024)
+#endif
+
+unsigned int aom_mse8x8_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ aom_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
+ return *sse;
+}
+
+unsigned int aom_mse8x16_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ aom_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
+ return *sse;
+}
+
+unsigned int aom_mse16x8_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ aom_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
+ return *sse;
+}
+
+unsigned int aom_mse16x16_sse2(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ unsigned int *sse) {
+ aom_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
+ return *sse;
+}
+
+// The 2 unused parameters are placeholders for the PIC-enabled build.
+// These definitions are for functions defined in subpel_variance.asm
+#define DECL(w, opt) \
+ int aom_sub_pixel_variance##w##xh_##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \
+ void *unused0, void *unused)
+#define DECLS(opt) \
+ DECL(4, opt); \
+ DECL(8, opt); \
+ DECL(16, opt)
+
+DECLS(sse2);
+DECLS(ssse3);
+#undef DECLS
+#undef DECL
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
+ unsigned int aom_sub_pixel_variance##w##x##h##_##opt( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) { \
+ /*Avoid overflow in helper by capping height.*/ \
+ const int hf = AOMMIN(h, 64); \
+ unsigned int sse = 0; \
+ int se = 0; \
+ for (int i = 0; i < (w / wf); ++i) { \
+ const uint8_t *src_ptr = src; \
+ const uint8_t *dst_ptr = dst; \
+ for (int j = 0; j < (h / hf); ++j) { \
+ unsigned int sse2; \
+ const int se2 = aom_sub_pixel_variance##wf##xh_##opt( \
+ src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \
+ &sse2, NULL, NULL); \
+ dst_ptr += hf * dst_stride; \
+ src_ptr += hf * src_stride; \
+ se += se2; \
+ sse += sse2; \
+ } \
+ src += wf; \
+ dst += wf; \
+ } \
+ *sse_ptr = sse; \
+ return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
+ }
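Expanded, these wrappers simply tile the block with the widest asm helper available and a height capped at 64: aom_sub_pixel_variance128x128_sse2, for example, makes (128 / 16) * (128 / 64) = 16 calls to aom_sub_pixel_variance16xh_sse2, one per 16x64 tile, and accumulates the per-tile sums and SSEs before applying the same variance identity.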
+
+#if !CONFIG_REALTIME_ONLY
+#define FNS(opt) \
+ FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
+ FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \
+ FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)) \
+ FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)) \
+ FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)) \
+ FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)) \
+ FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)) \
+ FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)) \
+ FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)) \
+ FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)) \
+ FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)) \
+ FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)) \
+ FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
+#else
+#define FNS(opt) \
+ FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
+ FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \
+ FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t)) \
+ FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t)) \
+ FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t)) \
+ FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t)) \
+ FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t)) \
+ FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t))
+#endif
+
+FNS(sse2)
+FNS(ssse3)
+
+#undef FNS
+#undef FN
+
+// The 2 unused parameters are placeholders for the PIC-enabled build.
+#define DECL(w, opt) \
+ int aom_sub_pixel_avg_variance##w##xh_##opt( \
+ const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec, \
+ ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0, \
+ void *unused)
+#define DECLS(opt) \
+ DECL(4, opt); \
+ DECL(8, opt); \
+ DECL(16, opt)
+
+DECLS(sse2);
+DECLS(ssse3);
+#undef DECL
+#undef DECLS
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast) \
+ unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt( \
+ const uint8_t *src, int src_stride, int x_offset, int y_offset, \
+ const uint8_t *dst, int dst_stride, unsigned int *sse_ptr, \
+ const uint8_t *sec) { \
+ /*Avoid overflow in helper by capping height.*/ \
+ const int hf = AOMMIN(h, 64); \
+ unsigned int sse = 0; \
+ int se = 0; \
+ for (int i = 0; i < (w / wf); ++i) { \
+ const uint8_t *src_ptr = src; \
+ const uint8_t *dst_ptr = dst; \
+ const uint8_t *sec_ptr = sec; \
+ for (int j = 0; j < (h / hf); ++j) { \
+ unsigned int sse2; \
+ const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt( \
+ src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \
+ sec_ptr, w, hf, &sse2, NULL, NULL); \
+ dst_ptr += hf * dst_stride; \
+ src_ptr += hf * src_stride; \
+ sec_ptr += hf * w; \
+ se += se2; \
+ sse += sse2; \
+ } \
+ src += wf; \
+ dst += wf; \
+ sec += wf; \
+ } \
+ *sse_ptr = sse; \
+ return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
+ }
+
+#if !CONFIG_REALTIME_ONLY
+#define FNS(opt) \
+ FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
+ FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \
+ FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)) \
+ FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)) \
+ FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)) \
+ FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)) \
+ FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)) \
+ FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)) \
+ FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t)) \
+ FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t)) \
+ FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t)) \
+ FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t)) \
+ FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
+#else
+#define FNS(opt) \
+ FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
+ FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t)) \
+ FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t)) \
+ FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t)) \
+ FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t)) \
+ FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t)) \
+ FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t)) \
+ FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t)) \
+ FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t)) \
+ FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t)) \
+ FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t)) \
+ FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t)) \
+ FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t)) \
+ FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t)) \
+ FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t))
+#endif
+
+FNS(sse2)
+FNS(ssse3)
+
+#undef FNS
+#undef FN
+
+static INLINE __m128i highbd_comp_mask_pred_line_sse2(const __m128i s0,
+ const __m128i s1,
+ const __m128i a) {
+ const __m128i alpha_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
+ const __m128i round_const =
+ _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
+ const __m128i a_inv = _mm_sub_epi16(alpha_max, a);
+
+ const __m128i s_lo = _mm_unpacklo_epi16(s0, s1);
+ const __m128i a_lo = _mm_unpacklo_epi16(a, a_inv);
+ const __m128i pred_lo = _mm_madd_epi16(s_lo, a_lo);
+ const __m128i pred_l = _mm_srai_epi32(_mm_add_epi32(pred_lo, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i s_hi = _mm_unpackhi_epi16(s0, s1);
+ const __m128i a_hi = _mm_unpackhi_epi16(a, a_inv);
+ const __m128i pred_hi = _mm_madd_epi16(s_hi, a_hi);
+ const __m128i pred_h = _mm_srai_epi32(_mm_add_epi32(pred_hi, round_const),
+ AOM_BLEND_A64_ROUND_BITS);
+
+ const __m128i comp = _mm_packs_epi32(pred_l, pred_h);
+
+ return comp;
+}
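Per output sample, the line kernel above computes a 64-weight alpha blend; a minimal scalar sketch using only the constants visible in the kernel (the helper name is hypothetical):

// comp = (s0 * m + s1 * (MAX_ALPHA - m) + MAX_ALPHA / 2) >> ROUND_BITS,
// where MAX_ALPHA == 1 << AOM_BLEND_A64_ROUND_BITS.
static uint16_t highbd_blend_a64_scalar(uint16_t s0, uint16_t s1, uint16_t m) {
  const int max_alpha = 1 << AOM_BLEND_A64_ROUND_BITS;
  const int round = max_alpha >> 1;
  return (uint16_t)((s0 * m + s1 * (max_alpha - m) + round) >>
                    AOM_BLEND_A64_ROUND_BITS);
}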
+
+void aom_highbd_comp_mask_pred_sse2(uint8_t *comp_pred8, const uint8_t *pred8,
+ int width, int height, const uint8_t *ref8,
+ int ref_stride, const uint8_t *mask,
+ int mask_stride, int invert_mask) {
+ int i = 0;
+ uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
+ uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+ uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+ const uint16_t *src0 = invert_mask ? pred : ref;
+ const uint16_t *src1 = invert_mask ? ref : pred;
+ const int stride0 = invert_mask ? width : ref_stride;
+ const int stride1 = invert_mask ? ref_stride : width;
+ const __m128i zero = _mm_setzero_si128();
+
+ if (width == 8) {
+ do {
+ const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0));
+ const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1));
+ const __m128i m_8 = _mm_loadl_epi64((const __m128i *)mask);
+ const __m128i m_16 = _mm_unpacklo_epi8(m_8, zero);
+
+ const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m_16);
+
+ _mm_storeu_si128((__m128i *)comp_pred, comp);
+
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ comp_pred += width;
+ i += 1;
+ } while (i < height);
+ } else if (width == 16) {
+ do {
+ const __m128i s0 = _mm_loadu_si128((const __m128i *)(src0));
+ const __m128i s2 = _mm_loadu_si128((const __m128i *)(src0 + 8));
+ const __m128i s1 = _mm_loadu_si128((const __m128i *)(src1));
+ const __m128i s3 = _mm_loadu_si128((const __m128i *)(src1 + 8));
+
+ const __m128i m_8 = _mm_loadu_si128((const __m128i *)mask);
+ const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero);
+ const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero);
+
+ const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16);
+ const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16);
+
+ _mm_storeu_si128((__m128i *)comp_pred, comp);
+ _mm_storeu_si128((__m128i *)(comp_pred + 8), comp1);
+
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ comp_pred += width;
+ i += 1;
+ } while (i < height);
+ } else {
+ do {
+ for (int x = 0; x < width; x += 32) {
+ for (int j = 0; j < 2; j++) {
+ const __m128i s0 =
+ _mm_loadu_si128((const __m128i *)(src0 + x + j * 16));
+ const __m128i s2 =
+ _mm_loadu_si128((const __m128i *)(src0 + x + 8 + j * 16));
+ const __m128i s1 =
+ _mm_loadu_si128((const __m128i *)(src1 + x + j * 16));
+ const __m128i s3 =
+ _mm_loadu_si128((const __m128i *)(src1 + x + 8 + j * 16));
+
+ const __m128i m_8 =
+ _mm_loadu_si128((const __m128i *)(mask + x + j * 16));
+ const __m128i m01_16 = _mm_unpacklo_epi8(m_8, zero);
+ const __m128i m23_16 = _mm_unpackhi_epi8(m_8, zero);
+
+ const __m128i comp = highbd_comp_mask_pred_line_sse2(s0, s1, m01_16);
+ const __m128i comp1 = highbd_comp_mask_pred_line_sse2(s2, s3, m23_16);
+
+ _mm_storeu_si128((__m128i *)(comp_pred + j * 16), comp);
+ _mm_storeu_si128((__m128i *)(comp_pred + 8 + j * 16), comp1);
+ }
+ comp_pred += 32;
+ }
+ src0 += stride0;
+ src1 += stride1;
+ mask += mask_stride;
+ i += 1;
+ } while (i < height);
+ }
+}
+
+uint64_t aom_mse_4xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
+ int sstride, int h) {
+ uint64_t sum = 0;
+ __m128i dst0_8x8, dst1_8x8, dst_16x8;
+ __m128i src0_16x4, src1_16x4, src_16x8;
+ __m128i res0_32x4, res0_64x2, res1_64x2;
+ __m128i sub_result_16x8;
+ const __m128i zeros = _mm_setzero_si128();
+ __m128i square_result = _mm_setzero_si128();
+ for (int i = 0; i < h; i += 2) {
+ dst0_8x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 0) * dstride]));
+ dst1_8x8 = _mm_cvtsi32_si128(*(int const *)(&dst[(i + 1) * dstride]));
+ dst_16x8 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(dst0_8x8, dst1_8x8), zeros);
+
+ src0_16x4 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride]));
+ src1_16x4 = _mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride]));
+ src_16x8 = _mm_unpacklo_epi64(src0_16x4, src1_16x4);
+
+ sub_result_16x8 = _mm_sub_epi16(src_16x8, dst_16x8);
+
+ res0_32x4 = _mm_madd_epi16(sub_result_16x8, sub_result_16x8);
+
+ res0_64x2 = _mm_unpacklo_epi32(res0_32x4, zeros);
+ res1_64x2 = _mm_unpackhi_epi32(res0_32x4, zeros);
+
+ square_result =
+ _mm_add_epi64(square_result, _mm_add_epi64(res0_64x2, res1_64x2));
+ }
+ const __m128i sum_64x1 =
+ _mm_add_epi64(square_result, _mm_srli_si128(square_result, 8));
+ xx_storel_64(&sum, sum_64x1);
+ return sum;
+}
+
+uint64_t aom_mse_8xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
+ int sstride, int h) {
+ uint64_t sum = 0;
+ __m128i dst_8x8, dst_16x8;
+ __m128i src_16x8;
+ __m128i res0_32x4, res0_64x2, res1_64x2;
+ __m128i sub_result_16x8;
+ const __m128i zeros = _mm_setzero_si128();
+ __m128i square_result = _mm_setzero_si128();
+
+ for (int i = 0; i < h; i++) {
+ dst_8x8 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 0) * dstride]));
+ dst_16x8 = _mm_unpacklo_epi8(dst_8x8, zeros);
+
+ src_16x8 = _mm_loadu_si128((__m128i *)&src[i * sstride]);
+
+ sub_result_16x8 = _mm_sub_epi16(src_16x8, dst_16x8);
+
+ res0_32x4 = _mm_madd_epi16(sub_result_16x8, sub_result_16x8);
+
+ res0_64x2 = _mm_unpacklo_epi32(res0_32x4, zeros);
+ res1_64x2 = _mm_unpackhi_epi32(res0_32x4, zeros);
+
+ square_result =
+ _mm_add_epi64(square_result, _mm_add_epi64(res0_64x2, res1_64x2));
+ }
+ const __m128i sum_64x1 =
+ _mm_add_epi64(square_result, _mm_srli_si128(square_result, 8));
+ xx_storel_64(&sum, sum_64x1);
+ return sum;
+}
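Both 16-bit MSE kernels above compute the same scalar quantity: the sum of squared differences between an 8-bit reconstruction and a 16-bit source, accumulated in 64 bits. A reference sketch (illustrative only, not a function in this file):

// Reference for aom_mse_{4,8}xh_16bit_sse2: sum of squared differences
// between 8-bit dst and 16-bit src, accumulated without overflow in 64 bits.
static uint64_t mse_wxh_16bit_ref(const uint8_t *dst, int dstride,
                                  const uint16_t *src, int sstride, int w,
                                  int h) {
  uint64_t sum = 0;
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      const int d = (int16_t)(src[i * sstride + j] - dst[i * dstride + j]);
      sum += (uint64_t)((int64_t)d * d);
    }
  }
  return sum;
}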
+
+uint64_t aom_mse_wxh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
+ int sstride, int w, int h) {
+ assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
+ "w=8/4 and h=8/4 must satisfy");
+ switch (w) {
+ case 4: return aom_mse_4xh_16bit_sse2(dst, dstride, src, sstride, h);
+ case 8: return aom_mse_8xh_16bit_sse2(dst, dstride, src, sstride, h);
+ default: assert(0 && "unsupported width"); return -1;
+ }
+}
+
+uint64_t aom_mse_16xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src,
+ int w, int h) {
+ assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
+ "w=8/4 and h=8/4 must be satisfied");
+ const int num_blks = 16 / w;
+ uint64_t sum = 0;
+ for (int i = 0; i < num_blks; i++) {
+ sum += aom_mse_wxh_16bit_sse2(dst, dstride, src, w, w, h);
+ dst += w;
+ src += (w * h);
+ }
+ return sum;
+}